This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
markstack_grow(): fix debugging stuff
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
95static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
99#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
101 goto target; \
102} STMT_END
103
c74f6de9
KW
104#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
2d66f61e 110/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
111 * call if there are no complications: i.e., if everything matchable is
112 * straight forward in the bitmap */
3db24e1e 113#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 114 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 115
c277df42
IZ
116/*
117 * Forwards.
118 */
119
f2ed9b32 120#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 121#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 122
3dab1dad 123#define HOPc(pos,off) \
ba44c216 124 (char *)(reginfo->is_utf8_target \
220db18a 125 ? reghop3((U8*)pos, off, \
9d9163fb 126 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 127 : (U8*)(pos + off))
557f47af 128
3dab1dad 129#define HOPBACKc(pos, off) \
ba44c216 130 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
131 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
132 : (pos - off >= reginfo->strbeg) \
8e11feef 133 ? (U8*)pos - off \
3dab1dad 134 : NULL)
efb30f32 135
ba44c216 136#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 137#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 138
557f47af
DM
139/* lim must be +ve. Returns NULL on overshoot */
140#define HOPMAYBE3(pos,off,lim) \
141 (reginfo->is_utf8_target \
142 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
143 : ((U8*)pos + off <= lim) \
144 ? (U8*)pos + off \
145 : NULL)
146
8e9f2289
DM
147/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
148 * off must be >=0; args should be vars rather than expressions */
149#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
150 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
151 : (U8*)((pos + off) > lim ? lim : (pos + off)))
152
2974eaec
DM
153#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
154 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
155 : (U8*)(pos + off))
156#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
157
158#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
159#define NEXTCHR_IS_EOS (nextchr < 0)
160
161#define SET_nextchr \
220db18a 162 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
163
164#define SET_locinput(p) \
165 locinput = (p); \
166 SET_nextchr
167
168
2a16ac92 169#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
170 if (!swash_ptr) { \
171 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 172 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 173 1, 0, invlist, &flags); \
c7304fe2
KW
174 assert(swash_ptr); \
175 } \
176 } STMT_END
177
178/* If in debug mode, we test that a known character properly matches */
179#ifdef DEBUGGING
180# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
181 property_name, \
2a16ac92 182 invlist, \
c7304fe2 183 utf8_char_in_property) \
2a16ac92 184 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
185 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
186#else
187# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
188 property_name, \
2a16ac92 189 invlist, \
c7304fe2 190 utf8_char_in_property) \
2a16ac92 191 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 192#endif
d1eb3177 193
c7304fe2
KW
194#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
195 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
196 "", \
197 PL_XPosix_ptrs[_CC_WORDCHAR], \
0766489e 198 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
c7304fe2 199
c7304fe2 200#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
201/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
202
5f80c4cf 203/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
204/* it would be nice to rework regcomp.sym to generate this stuff. sigh
205 *
206 * NOTE that *nothing* that affects backtracking should be in here, specifically
207 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
208 * node that is in between two EXACT like nodes when ascertaining what the required
209 * "follow" character is. This should probably be moved to regex compile time
210 * although it may be done at run time beause of the REF possibility - more
211 * investigation required. -- demerphq
212*/
baa60164
KW
213#define JUMPABLE(rn) ( \
214 OP(rn) == OPEN || \
3e901dc0 215 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
216 OP(rn) == EVAL || \
217 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
218 OP(rn) == PLUS || OP(rn) == MINMOD || \
219 OP(rn) == KEEPS || \
220 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 221)
ee9b8eae 222#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 223
ee9b8eae
YO
224#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
225
226#if 0
227/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
a4525e78 228 we don't need this definition. XXX These are now out-of-sync*/
ee9b8eae 229#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 230#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
231#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
232
233#else
234/* ... so we use this as its faster. */
a4525e78
KW
235#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
236#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
237#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
238#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
239
240#endif
e2d8ce26 241
a84d97b6
HS
242/*
243 Search for mandatory following text node; for lookahead, the text must
244 follow but for lookbehind (rn->flags != 0) we skip to the next step.
245*/
baa60164 246#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
247 while (JUMPABLE(rn)) { \
248 const OPCODE type = OP(rn); \
249 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 250 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 251 else if (type == PLUS) \
cca55fe3 252 rn = NEXTOPER(rn); \
3dab1dad 253 else if (type == IFMATCH) \
a84d97b6 254 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 255 else rn += NEXT_OFF(rn); \
3dab1dad 256 } \
5f80c4cf 257} STMT_END
74750237 258
006f26b2
DM
259#define SLAB_FIRST(s) (&(s)->states[0])
260#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
261
a75351a1 262static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 263static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 264static regmatch_state * S_push_slab(pTHX);
51371543 265
87c0511b 266#define REGCP_PAREN_ELEMS 3
f067efbf 267#define REGCP_OTHER_ELEMS 3
e0fa7e2b 268#define REGCP_FRAME_ELEMS 1
620d5b66
NC
269/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
270 * are needed for the regexp context stack bookkeeping. */
271
76e3520e 272STATIC CHECKPOINT
92da3157 273S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 274{
a3b680e6 275 const int retval = PL_savestack_ix;
92da3157
DM
276 const int paren_elems_to_push =
277 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
278 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
279 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 280 I32 p;
40a82448 281 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 282
b93070ed
DM
283 PERL_ARGS_ASSERT_REGCPPUSH;
284
e49a9654 285 if (paren_elems_to_push < 0)
e8a85d26
JH
286 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
287 (int)paren_elems_to_push, (int)maxopenparen,
288 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 289
e0fa7e2b
NC
290 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
291 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 292 " out of range (%lu-%ld)",
92da3157
DM
293 total_elems,
294 (unsigned long)maxopenparen,
295 (long)parenfloor);
e0fa7e2b 296
620d5b66 297 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 298
495f47a5 299 DEBUG_BUFFERS_r(
92da3157 300 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
301 PerlIO_printf(Perl_debug_log,
302 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
303 PTR2UV(rex),
304 PTR2UV(rex->offs)
305 );
306 );
92da3157 307 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 308/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
309 SSPUSHIV(rex->offs[p].end);
310 SSPUSHIV(rex->offs[p].start);
1ca2007e 311 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 312 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
313 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
314 (UV)p,
315 (IV)rex->offs[p].start,
316 (IV)rex->offs[p].start_tmp,
317 (IV)rex->offs[p].end
40a82448 318 ));
a0d0e21e 319 }
b1ce53c5 320/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 321 SSPUSHINT(maxopenparen);
b93070ed
DM
322 SSPUSHINT(rex->lastparen);
323 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 324 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 325
a0d0e21e
LW
326 return retval;
327}
328
c277df42 329/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
330#define REGCP_SET(cp) \
331 DEBUG_STATE_r( \
ab3bbdeb 332 PerlIO_printf(Perl_debug_log, \
e4f74956 333 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
334 (IV)PL_savestack_ix)); \
335 cp = PL_savestack_ix
c3464db5 336
ab3bbdeb 337#define REGCP_UNWIND(cp) \
e4f74956 338 DEBUG_STATE_r( \
ab3bbdeb 339 if (cp != PL_savestack_ix) \
e4f74956
YO
340 PerlIO_printf(Perl_debug_log, \
341 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
342 (IV)(cp), (IV)PL_savestack_ix)); \
343 regcpblow(cp)
c277df42 344
a8d1f4b4
DM
345#define UNWIND_PAREN(lp, lcp) \
346 for (n = rex->lastparen; n > lp; n--) \
347 rex->offs[n].end = -1; \
348 rex->lastparen = n; \
349 rex->lastcloseparen = lcp;
350
351
f067efbf 352STATIC void
92da3157 353S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 354{
e0fa7e2b 355 UV i;
87c0511b 356 U32 paren;
a3621e74
YO
357 GET_RE_DEBUG_FLAGS_DECL;
358
7918f24d
NC
359 PERL_ARGS_ASSERT_REGCPPOP;
360
b1ce53c5 361 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 362 i = SSPOPUV;
e0fa7e2b
NC
363 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
364 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
365 rex->lastcloseparen = SSPOPINT;
366 rex->lastparen = SSPOPINT;
92da3157 367 *maxopenparen_p = SSPOPINT;
b1ce53c5 368
620d5b66 369 i -= REGCP_OTHER_ELEMS;
b1ce53c5 370 /* Now restore the parentheses context. */
495f47a5
DM
371 DEBUG_BUFFERS_r(
372 if (i || rex->lastparen + 1 <= rex->nparens)
373 PerlIO_printf(Perl_debug_log,
374 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
375 PTR2UV(rex),
376 PTR2UV(rex->offs)
377 );
378 );
92da3157 379 paren = *maxopenparen_p;
620d5b66 380 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 381 SSize_t tmps;
1ca2007e 382 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
383 rex->offs[paren].start = SSPOPIV;
384 tmps = SSPOPIV;
b93070ed
DM
385 if (paren <= rex->lastparen)
386 rex->offs[paren].end = tmps;
495f47a5
DM
387 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
388 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
389 (UV)paren,
390 (IV)rex->offs[paren].start,
391 (IV)rex->offs[paren].start_tmp,
392 (IV)rex->offs[paren].end,
393 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 394 );
87c0511b 395 paren--;
a0d0e21e 396 }
daf18116 397#if 1
dafc8851
JH
398 /* It would seem that the similar code in regtry()
399 * already takes care of this, and in fact it is in
400 * a better location to since this code can #if 0-ed out
401 * but the code in regtry() is needed or otherwise tests
402 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
403 * (as of patchlevel 7877) will fail. Then again,
404 * this code seems to be necessary or otherwise
225593e1
DM
405 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
406 * --jhi updated by dapm */
b93070ed 407 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 408 if (i > *maxopenparen_p)
b93070ed
DM
409 rex->offs[i].start = -1;
410 rex->offs[i].end = -1;
495f47a5
DM
411 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
412 " \\%"UVuf": %s ..-1 undeffing\n",
413 (UV)i,
92da3157 414 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 415 ));
a0d0e21e 416 }
dafc8851 417#endif
a0d0e21e
LW
418}
419
74088413
DM
420/* restore the parens and associated vars at savestack position ix,
421 * but without popping the stack */
422
423STATIC void
92da3157 424S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
425{
426 I32 tmpix = PL_savestack_ix;
427 PL_savestack_ix = ix;
92da3157 428 regcppop(rex, maxopenparen_p);
74088413
DM
429 PL_savestack_ix = tmpix;
430}
431
02db2b7b 432#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 433
31c7f561
KW
434STATIC bool
435S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
436{
437 /* Returns a boolean as to whether or not 'character' is a member of the
438 * Posix character class given by 'classnum' that should be equivalent to a
439 * value in the typedef '_char_class_number'.
440 *
441 * Ideally this could be replaced by a just an array of function pointers
442 * to the C library functions that implement the macros this calls.
443 * However, to compile, the precise function signatures are required, and
444 * these may vary from platform to to platform. To avoid having to figure
445 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
446 * which adds an extra layer of function call overhead (unless the C
447 * optimizer strips it away). But we don't particularly care about
448 * performance with locales anyway. */
31c7f561
KW
449
450 switch ((_char_class_number) classnum) {
15861f94 451 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 452 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
453 case _CC_ENUM_ASCII: return isASCII_LC(character);
454 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
455 case _CC_ENUM_CASED: return isLOWER_LC(character)
456 || isUPPER_LC(character);
e8d596e0 457 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
458 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
459 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
460 case _CC_ENUM_LOWER: return isLOWER_LC(character);
461 case _CC_ENUM_PRINT: return isPRINT_LC(character);
462 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 463 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
464 case _CC_ENUM_UPPER: return isUPPER_LC(character);
465 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 466 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
467 default: /* VERTSPACE should never occur in locales */
468 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
469 }
470
e5964223 471 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
472 return FALSE;
473}
474
3018b823
KW
475STATIC bool
476S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
477{
478 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
479 * 'character' is a member of the Posix character class given by 'classnum'
480 * that should be equivalent to a value in the typedef
481 * '_char_class_number'.
482 *
483 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 484 * the range 0-255. Outside that range, all characters use Unicode
3018b823
KW
485 * rules, ignoring any locale. So use the Unicode function if this class
486 * requires a swash, and use the Unicode macro otherwise. */
487
488 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
489
490 if (UTF8_IS_INVARIANT(*character)) {
491 return isFOO_lc(classnum, *character);
492 }
493 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
494 return isFOO_lc(classnum,
a62b247b 495 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
496 }
497
613abc6d
KW
498 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
499
3018b823
KW
500 if (classnum < _FIRST_NON_SWASH_CC) {
501
502 /* Initialize the swash unless done already */
503 if (! PL_utf8_swash_ptrs[classnum]) {
504 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
505 PL_utf8_swash_ptrs[classnum] =
506 _core_swash_init("utf8",
507 "",
508 &PL_sv_undef, 1, 0,
509 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
510 }
511
92a2046b
KW
512 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
513 character,
514 TRUE /* is UTF */ ));
3018b823
KW
515 }
516
517 switch ((_char_class_number) classnum) {
779cf272 518 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
519 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
520 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
521 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 522 default: break;
3018b823
KW
523 }
524
e1ee3960 525 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
526}
527
a687059c 528/*
e50aee73 529 * pregexec and friends
a687059c
LW
530 */
531
76234dfb 532#ifndef PERL_IN_XSUB_RE
a687059c 533/*
c277df42 534 - pregexec - match a regexp against a string
a687059c 535 */
c277df42 536I32
5aaab254 537Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 538 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
539/* stringarg: the point in the string at which to begin matching */
540/* strend: pointer to null at end of string */
541/* strbeg: real beginning of string */
542/* minend: end of match must be >= minend bytes after stringarg. */
543/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
544 * itself is accessed via the pointers above */
545/* nosave: For optimizations. */
c277df42 546{
7918f24d
NC
547 PERL_ARGS_ASSERT_PREGEXEC;
548
c277df42 549 return
9041c2e3 550 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
551 nosave ? 0 : REXEC_COPY_STR);
552}
76234dfb 553#endif
22e551b9 554
cad2e5aa 555
6eb5f6b9 556
1a4edc3c
DM
557/* re_intuit_start():
558 *
559 * Based on some optimiser hints, try to find the earliest position in the
560 * string where the regex could match.
561 *
562 * rx: the regex to match against
563 * sv: the SV being matched: only used for utf8 flag; the string
564 * itself is accessed via the pointers below. Note that on
565 * something like an overloaded SV, SvPOK(sv) may be false
566 * and the string pointers may point to something unrelated to
567 * the SV itself.
568 * strbeg: real beginning of string
569 * strpos: the point in the string at which to begin matching
570 * strend: pointer to the byte following the last char of the string
571 * flags currently unused; set to 0
572 * data: currently unused; set to NULL
573 *
574 * The basic idea of re_intuit_start() is to use some known information
575 * about the pattern, namely:
576 *
577 * a) the longest known anchored substring (i.e. one that's at a
578 * constant offset from the beginning of the pattern; but not
579 * necessarily at a fixed offset from the beginning of the
580 * string);
581 * b) the longest floating substring (i.e. one that's not at a constant
582 * offset from the beginning of the pattern);
583 * c) Whether the pattern is anchored to the string; either
584 * an absolute anchor: /^../, or anchored to \n: /^.../m,
585 * or anchored to pos(): /\G/;
586 * d) A start class: a real or synthetic character class which
587 * represents which characters are legal at the start of the pattern;
588 *
589 * to either quickly reject the match, or to find the earliest position
590 * within the string at which the pattern might match, thus avoiding
591 * running the full NFA engine at those earlier locations, only to
592 * eventually fail and retry further along.
593 *
594 * Returns NULL if the pattern can't match, or returns the address within
595 * the string which is the earliest place the match could occur.
596 *
597 * The longest of the anchored and floating substrings is called 'check'
598 * and is checked first. The other is called 'other' and is checked
599 * second. The 'other' substring may not be present. For example,
600 *
601 * /(abc|xyz)ABC\d{0,3}DEFG/
602 *
603 * will have
604 *
605 * check substr (float) = "DEFG", offset 6..9 chars
606 * other substr (anchored) = "ABC", offset 3..3 chars
607 * stclass = [ax]
608 *
609 * Be aware that during the course of this function, sometimes 'anchored'
610 * refers to a substring being anchored relative to the start of the
611 * pattern, and sometimes to the pattern itself being anchored relative to
612 * the string. For example:
613 *
614 * /\dabc/: "abc" is anchored to the pattern;
615 * /^\dabc/: "abc" is anchored to the pattern and the string;
616 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
617 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
618 * but the pattern is anchored to the string.
52a21eb3
DM
619 */
620
cad2e5aa 621char *
52a21eb3
DM
622Perl_re_intuit_start(pTHX_
623 REGEXP * const rx,
624 SV *sv,
625 const char * const strbeg,
626 char *strpos,
627 char *strend,
628 const U32 flags,
629 re_scream_pos_data *data)
cad2e5aa 630{
8d919b0a 631 struct regexp *const prog = ReANY(rx);
6b071d16 632 SSize_t start_shift = prog->check_offset_min;
cad2e5aa 633 /* Should be nonnegative! */
ea3daa5d 634 SSize_t end_shift = 0;
0fc004dd
DM
635 /* current lowest pos in string where the regex can start matching */
636 char *rx_origin = strpos;
eb578fdb 637 SV *check;
f2ed9b32 638 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 639 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 640 bool ml_anch = 0;
8f4bf5fc 641 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 642 char *check_at = NULL; /* check substr found at this pos */
bbe252da 643 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 644 RXi_GET_DECL(prog,progi);
02d5137b
DM
645 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
646 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
647 GET_RE_DEBUG_FLAGS_DECL;
648
7918f24d 649 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
650 PERL_UNUSED_ARG(flags);
651 PERL_UNUSED_ARG(data);
7918f24d 652
1dc475d0
DM
653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
654 "Intuit: trying to determine minimum start position...\n"));
655
fb9bbddb
DM
656 /* for now, assume that all substr offsets are positive. If at some point
657 * in the future someone wants to do clever things with look-behind and
658 * -ve offsets, they'll need to fix up any code in this function
659 * which uses these offsets. See the thread beginning
660 * <20140113145929.GF27210@iabyn.com>
661 */
662 assert(prog->substrs->data[0].min_offset >= 0);
663 assert(prog->substrs->data[0].max_offset >= 0);
664 assert(prog->substrs->data[1].min_offset >= 0);
665 assert(prog->substrs->data[1].max_offset >= 0);
666 assert(prog->substrs->data[2].min_offset >= 0);
667 assert(prog->substrs->data[2].max_offset >= 0);
668
f7022b5a 669 /* for now, assume that if both present, that the floating substring
83f2232d 670 * doesn't start before the anchored substring.
f7022b5a
DM
671 * If you break this assumption (e.g. doing better optimisations
672 * with lookahead/behind), then you'll need to audit the code in this
673 * function carefully first
674 */
675 assert(
676 ! ( (prog->anchored_utf8 || prog->anchored_substr)
677 && (prog->float_utf8 || prog->float_substr))
678 || (prog->float_min_offset >= prog->anchored_offset));
679
1a4edc3c
DM
680 /* byte rather than char calculation for efficiency. It fails
681 * to quickly reject some cases that can't match, but will reject
682 * them later after doing full char arithmetic */
c344f387 683 if (prog->minlen > strend - strpos) {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 685 " String too short...\n"));
cad2e5aa 686 goto fail;
2c2d71f5 687 }
d8da0584 688
ab4e48c1 689 RX_MATCH_UTF8_set(rx,utf8_target);
6c3fea77 690 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 691 reginfo->info_aux = NULL;
9d9163fb 692 reginfo->strbeg = strbeg;
220db18a 693 reginfo->strend = strend;
aed7b151 694 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 695 reginfo->intuit = 1;
1cb48e53
DM
696 /* not actually used within intuit, but zero for safety anyway */
697 reginfo->poscache_maxiter = 0;
02d5137b 698
f2ed9b32 699 if (utf8_target) {
33b8afdf
JH
700 if (!prog->check_utf8 && prog->check_substr)
701 to_utf8_substr(prog);
702 check = prog->check_utf8;
703 } else {
7e0d5ad7
KW
704 if (!prog->check_substr && prog->check_utf8) {
705 if (! to_byte_substr(prog)) {
6b54ddc5 706 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
707 }
708 }
33b8afdf
JH
709 check = prog->check_substr;
710 }
274cd312 711
1dc475d0
DM
712 /* dump the various substring data */
713 DEBUG_OPTIMISE_MORE_r({
714 int i;
715 for (i=0; i<=2; i++) {
716 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
717 : prog->substrs->data[i].substr);
718 if (!sv)
719 continue;
720
721 PerlIO_printf(Perl_debug_log,
722 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
723 " useful=%"IVdf" utf8=%d [%s]\n",
724 i,
725 (IV)prog->substrs->data[i].min_offset,
726 (IV)prog->substrs->data[i].max_offset,
727 (IV)prog->substrs->data[i].end_shift,
728 BmUSEFUL(sv),
729 utf8_target ? 1 : 0,
730 SvPEEK(sv));
731 }
732 });
733
8e1490ee 734 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
735
736 /* ml_anch: check after \n?
737 *
0fa70a06 738 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
739 * with /.*.../, these flags will have been added by the
740 * compiler:
741 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
742 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
743 */
7d2d37f5
DM
744 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
745 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 746
343c8a29 747 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
748 /* we are only allowed to match at BOS or \G */
749
57fcbfa7 750 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
751 *
752 * Note that we don't try to do a similar quick reject for
753 * \G, since generally the caller will have calculated strpos
754 * based on pos() and gofs, so the string is already correctly
755 * anchored by definition; and handling the exceptions would
756 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 757 */
7bb3b9eb 758 if ( strpos != strbeg
d3d47aac 759 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8
DM
760 {
761 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 762 " Not at start...\n"));
c889ccc8
DM
763 goto fail;
764 }
765
a5d12a4b
DM
766 /* in the presence of an anchor, the anchored (relative to the
767 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
768 * to strpos. So quickly reject if substr isn't found there.
769 * This works for \G too, because the caller will already have
770 * subtracted gofs from pos, and gofs is the offset from the
771 * \G to the start of the regex. For example, in /.abc\Gdef/,
772 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
773 * caller will have set strpos=pos()-4; we look for the substr
774 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 775
33c28ab2 776 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 777 /* Substring at constant offset from beg-of-str... */
d307bf57 778 SSize_t slen = SvCUR(check);
343c8a29 779 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 780
1dc475d0
DM
781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
782 " Looking for check substr at fixed offset %"IVdf"...\n",
783 (IV)prog->check_offset_min));
784
7742aa66
DM
785 if (SvTAIL(check)) {
786 /* In this case, the regex is anchored at the end too.
787 * Unless it's a multiline match, the lengths must match
788 * exactly, give or take a \n. NB: slen >= 1 since
789 * the last char of check is \n */
790 if (!multiline
791 && ( strend - s > slen
792 || strend - s < slen - 1
793 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8
DM
794 {
795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 796 " String too long...\n"));
c889ccc8
DM
797 goto fail_finish;
798 }
799 /* Now should match s[0..slen-2] */
800 slen--;
c889ccc8 801 }
d307bf57
DM
802 if (slen && (*SvPVX_const(check) != *s
803 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
804 {
805 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 806 " String not equal...\n"));
d307bf57
DM
807 goto fail_finish;
808 }
c889ccc8
DM
809
810 check_at = s;
811 goto success_at_start;
cad2e5aa 812 }
cad2e5aa 813 }
cad2e5aa 814 }
0fc004dd 815
c0e0ec46 816 end_shift = prog->check_end_shift;
cad2e5aa 817
19188028 818#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 819 if (end_shift < 0)
1de06328 820 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 821 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
822#endif
823
2c2d71f5 824 restart:
1de06328 825
66b7ec5c
DM
826 /* This is the (re)entry point of the main loop in this function.
827 * The goal of this loop is to:
828 * 1) find the "check" substring in the region rx_origin..strend
829 * (adjusted by start_shift / end_shift). If not found, reject
830 * immediately.
831 * 2) If it exists, look for the "other" substr too if defined; for
832 * example, if the check substr maps to the anchored substr, then
833 * check the floating substr, and vice-versa. If not found, go
834 * back to (1) with rx_origin suitably incremented.
835 * 3) If we find an rx_origin position that doesn't contradict
836 * either of the substrings, then check the possible additional
837 * constraints on rx_origin of /^.../m or a known start class.
838 * If these fail, then depending on which constraints fail, jump
839 * back to here, or to various other re-entry points further along
840 * that skip some of the first steps.
841 * 4) If we pass all those tests, update the BmUSEFUL() count on the
842 * substring. If the start position was determined to be at the
843 * beginning of the string - so, not rejected, but not optimised,
844 * since we have to run regmatch from position 0 - decrement the
845 * BmUSEFUL() count. Otherwise increment it.
846 */
847
1a4edc3c
DM
848
849 /* first, look for the 'check' substring */
850
1de06328 851 {
c33e64f0
FC
852 U8* start_point;
853 U8* end_point;
c889ccc8 854
c889ccc8 855 DEBUG_OPTIMISE_MORE_r({
1dc475d0 856 PerlIO_printf(Perl_debug_log,
ae5d4331 857 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 858 " Start shift: %"IVdf" End shift %"IVdf
4d281893 859 " Real end Shift: %"IVdf"\n",
675e93ee 860 (IV)(rx_origin - strbeg),
c889ccc8 861 (IV)prog->check_offset_min,
12fbc530
DM
862 (IV)start_shift,
863 (IV)end_shift,
c889ccc8
DM
864 (IV)prog->check_end_shift);
865 });
1de06328 866
33c28ab2
DM
867 end_point = HOP3(strend, -end_shift, strbeg);
868 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
869 if (!start_point)
870 goto fail_finish;
c889ccc8 871
557f47af 872
e0362b86 873 /* If the regex is absolutely anchored to either the start of the
d3d47aac 874 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
875 * check_offset_max represents an upper bound on the string where
876 * the substr could start. For the ANCH_GPOS case, we assume that
877 * the caller of intuit will have already set strpos to
878 * pos()-gofs, so in this case strpos + offset_max will still be
879 * an upper bound on the substr.
880 */
c19c836a
DM
881 if (!ml_anch
882 && prog->intflags & PREGf_ANCH
e0362b86 883 && prog->check_offset_max != SSize_t_MAX)
c19c836a 884 {
1a08ba3a 885 SSize_t len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
886 const char * const anchor =
887 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
888
889 /* do a bytes rather than chars comparison. It's conservative;
890 * so it skips doing the HOP if the result can't possibly end
891 * up earlier than the old value of end_point.
892 */
893 if ((char*)end_point - anchor > prog->check_offset_max) {
894 end_point = HOP3lim((U8*)anchor,
895 prog->check_offset_max,
896 end_point -len)
897 + len;
898 }
d6ef1678
DM
899 }
900
ae5d4331 901 check_at = fbm_instr( start_point, end_point,
7fba1cd6 902 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 903
675e93ee
DM
904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
905 " doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
906 (IV)((char*)start_point - strbeg),
907 (IV)((char*)end_point - strbeg),
908 (IV)(check_at ? check_at - strbeg : -1)
909 ));
910
8fd34720
DM
911 /* Update the count-of-usability, remove useless subpatterns,
912 unshift s. */
913
914 DEBUG_EXECUTE_r({
915 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
916 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
917 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
918 (check_at ? "Found" : "Did not find"),
919 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
920 ? "anchored" : "floating"),
921 quoted,
922 RE_SV_TAIL(check),
923 (check_at ? " at offset " : "...\n") );
924 });
2c2d71f5 925
8fd34720
DM
926 if (!check_at)
927 goto fail_finish;
8fd34720
DM
928 /* set rx_origin to the minimum position where the regex could start
929 * matching, given the constraint of the just-matched check substring.
930 * But don't set it lower than previously.
931 */
fdc003fd 932
8fd34720
DM
933 if (check_at - rx_origin > prog->check_offset_max)
934 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee
DM
935 /* Finish the diagnostic message */
936 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
937 "%ld (rx_origin now %"IVdf")...\n",
938 (long)(check_at - strbeg),
939 (IV)(rx_origin - strbeg)
940 ));
8fd34720 941 }
fdc003fd
DM
942
943
1a4edc3c 944 /* now look for the 'other' substring if defined */
2c2d71f5 945
6480a6c4
DM
946 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
947 : prog->substrs->data[other_ix].substr)
1de06328 948 {
30944b6d 949 /* Take into account the "other" substring. */
6c3343a6
DM
950 char *last, *last1;
951 char *s;
952 SV* must;
953 struct reg_substr_datum *other;
954
955 do_other_substr:
956 other = &prog->substrs->data[other_ix];
957
958 /* if "other" is anchored:
959 * we've previously found a floating substr starting at check_at.
960 * This means that the regex origin must lie somewhere
961 * between min (rx_origin): HOP3(check_at, -check_offset_max)
962 * and max: HOP3(check_at, -check_offset_min)
963 * (except that min will be >= strpos)
964 * So the fixed substr must lie somewhere between
965 * HOP3(min, anchored_offset)
966 * HOP3(max, anchored_offset) + SvCUR(substr)
967 */
968
969 /* if "other" is floating
970 * Calculate last1, the absolute latest point where the
971 * floating substr could start in the string, ignoring any
972 * constraints from the earlier fixed match. It is calculated
973 * as follows:
974 *
975 * strend - prog->minlen (in chars) is the absolute latest
976 * position within the string where the origin of the regex
977 * could appear. The latest start point for the floating
978 * substr is float_min_offset(*) on from the start of the
979 * regex. last1 simply combines thee two offsets.
980 *
981 * (*) You might think the latest start point should be
982 * float_max_offset from the regex origin, and technically
983 * you'd be correct. However, consider
984 * /a\d{2,4}bcd\w/
985 * Here, float min, max are 3,5 and minlen is 7.
986 * This can match either
987 * /a\d\dbcd\w/
988 * /a\d\d\dbcd\w/
989 * /a\d\d\d\dbcd\w/
990 * In the first case, the regex matches minlen chars; in the
991 * second, minlen+1, in the third, minlen+2.
992 * In the first case, the floating offset is 3 (which equals
993 * float_min), in the second, 4, and in the third, 5 (which
994 * equals float_max). In all cases, the floating string bcd
995 * can never start more than 4 chars from the end of the
996 * string, which equals minlen - float_min. As the substring
997 * starts to match more than float_min from the start of the
998 * regex, it makes the regex match more than minlen chars,
999 * and the two cancel each other out. So we can always use
1000 * float_min - minlen, rather than float_max - minlen for the
1001 * latest position in the string.
1002 *
1003 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1004 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1005 */
1006
e7a14a9c 1007 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1008 last1 = HOP3c(strend,
1009 other->min_offset - prog->minlen, strbeg);
1010
4d006249 1011 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1012 /* last is the latest point where the floating substr could
1013 * start, *given* any constraints from the earlier fixed
1014 * match. This constraint is that the floating string starts
1015 * <= float_max_offset chars from the regex origin (rx_origin).
1016 * If this value is less than last1, use it instead.
eb3831ce 1017 */
6c3343a6
DM
1018 assert(rx_origin <= last1);
1019 last =
1020 /* this condition handles the offset==infinity case, and
1021 * is a short-cut otherwise. Although it's comparing a
1022 * byte offset to a char length, it does so in a safe way,
1023 * since 1 char always occupies 1 or more bytes,
1024 * so if a string range is (last1 - rx_origin) bytes,
1025 * it will be less than or equal to (last1 - rx_origin)
1026 * chars; meaning it errs towards doing the accurate HOP3
1027 * rather than just using last1 as a short-cut */
1028 (last1 - rx_origin) < other->max_offset
1029 ? last1
1030 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1031 }
1032 else {
1033 assert(strpos + start_shift <= check_at);
1034 last = HOP4c(check_at, other->min_offset - start_shift,
1035 strbeg, strend);
1036 }
ead917d0 1037
6c3343a6
DM
1038 s = HOP3c(rx_origin, other->min_offset, strend);
1039 if (s < other_last) /* These positions already checked */
1040 s = other_last;
1041
1042 must = utf8_target ? other->utf8_substr : other->substr;
1043 assert(SvPOK(must));
675e93ee
DM
1044 {
1045 char *from = s;
1046 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1047
88203927
DM
1048 if (from > to) {
1049 s = NULL;
1050 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1051 " skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1052 (IV)(from - strbeg),
1053 (IV)(to - strbeg)
1054 ));
1055 }
1056 else {
1057 s = fbm_instr(
1058 (unsigned char*)from,
1059 (unsigned char*)to,
1060 must,
1061 multiline ? FBMrf_MULTILINE : 0
1062 );
1063 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1064 " doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1065 (IV)(from - strbeg),
1066 (IV)(to - strbeg),
1067 (IV)(s ? s - strbeg : -1)
1068 ));
1069 }
675e93ee
DM
1070 }
1071
6c3343a6
DM
1072 DEBUG_EXECUTE_r({
1073 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1074 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1075 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1076 s ? "Found" : "Contradicts",
1077 other_ix ? "floating" : "anchored",
1078 quoted, RE_SV_TAIL(must));
1079 });
ead917d0 1080
ead917d0 1081
6c3343a6
DM
1082 if (!s) {
1083 /* last1 is latest possible substr location. If we didn't
1084 * find it before there, we never will */
1085 if (last >= last1) {
1086 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee 1087 "; giving up...\n"));
6c3343a6 1088 goto fail_finish;
ead917d0
DM
1089 }
1090
6c3343a6
DM
1091 /* try to find the check substr again at a later
1092 * position. Maybe next time we'll find the "other" substr
1093 * in range too */
6c3343a6
DM
1094 other_last = HOP3c(last, 1, strend) /* highest failure */;
1095 rx_origin =
4d006249 1096 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1097 ? HOP3c(rx_origin, 1, strend)
1098 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
675e93ee
DM
1099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1100 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1101 (other_ix ? "floating" : "anchored"),
1102 (long)(HOP3c(check_at, 1, strend) - strbeg),
1103 (IV)(rx_origin - strbeg)
1104 ));
6c3343a6
DM
1105 goto restart;
1106 }
1107 else {
4d006249 1108 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1109 /* other_last is set to s, not s+1, since its possible for
1110 * a floating substr to fail first time, then succeed
1111 * second time at the same floating position; e.g.:
1112 * "-AB--AABZ" =~ /\wAB\d*Z/
1113 * The first time round, anchored and float match at
1114 * "-(AB)--AAB(Z)" then fail on the initial \w character
1115 * class. Second time round, they match at "-AB--A(AB)(Z)".
1116 */
1117 other_last = s;
ead917d0
DM
1118 }
1119 else {
6c3343a6
DM
1120 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1121 other_last = HOP3c(s, 1, strend);
ead917d0 1122 }
675e93ee
DM
1123 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1124 " at offset %ld (rx_origin now %"IVdf")...\n",
1125 (long)(s - strbeg),
1126 (IV)(rx_origin - strbeg)
1127 ));
1128
6c3343a6 1129 }
cad2e5aa 1130 }
acba93e8
DM
1131 else {
1132 DEBUG_OPTIMISE_MORE_r(
1133 PerlIO_printf(Perl_debug_log,
1134 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1135 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
675e93ee 1136 " strend:%"IVdf"\n",
acba93e8
DM
1137 (IV)prog->check_offset_min,
1138 (IV)prog->check_offset_max,
675e93ee
DM
1139 (IV)(check_at-strbeg),
1140 (IV)(rx_origin-strbeg),
1c1c599d 1141 (IV)(rx_origin-check_at),
675e93ee 1142 (IV)(strend-strbeg)
acba93e8
DM
1143 )
1144 );
1145 }
2c2d71f5 1146
acba93e8 1147 postprocess_substr_matches:
0991020e 1148
1a4edc3c 1149 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1150
7d2d37f5 1151 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1152 char *s;
1153
a62659bd
DM
1154 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1155 " looking for /^/m anchor"));
d0880ea7
DM
1156
1157 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1158 * Find the next \n, if any, even if it's beyond the current
1159 * anchored and/or floating substrings. Whether we should be
1160 * scanning ahead for the next \n or the next substr is debatable.
1161 * On the one hand you'd expect rare substrings to appear less
1162 * often than \n's. On the other hand, searching for \n means
675e93ee 1163 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1164 * iteration as the current "rarest" string candidate, which
1165 * means for example that we'll quickly reject the whole string if
1166 * hasn't got a \n, rather than trying every substr position
1167 * first
1168 */
d0880ea7 1169
4620cb61
DM
1170 s = HOP3c(strend, - prog->minlen, strpos);
1171 if (s <= rx_origin ||
1172 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1173 {
d0880ea7
DM
1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1175 " Did not find /%s^%s/m...\n",
1176 PL_colors[0], PL_colors[1]));
a62659bd
DM
1177 goto fail_finish;
1178 }
d0880ea7 1179
4ada1233
DM
1180 /* earliest possible origin is 1 char after the \n.
1181 * (since *rx_origin == '\n', it's safe to ++ here rather than
1182 * HOP(rx_origin, 1)) */
1183 rx_origin++;
d0880ea7 1184
f4f115de 1185 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1186 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1187 {
d0880ea7
DM
1188 /* Position contradicts check-string; either because
1189 * check was anchored (and thus has no wiggle room),
4ada1233 1190 * or check was float and rx_origin is above the float range */
d0880ea7 1191 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee
DM
1192 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1193 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1194 goto restart;
1195 }
1196
1197 /* if we get here, the check substr must have been float,
2e759faa 1198 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1199 * "other" substr which still contradicts */
1200 assert(prog->substrs->check_ix); /* check is float */
1201
1202 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1203 /* whoops, the anchored "other" substr exists, so we still
1204 * contradict. On the other hand, the float "check" substr
1205 * didn't contradict, so just retry the anchored "other"
1206 * substr */
1207 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
73e8ff00 1208 " Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
d0880ea7 1209 PL_colors[0], PL_colors[1],
73e8ff00
DM
1210 (IV)(rx_origin - strbeg + prog->anchored_offset),
1211 (IV)(rx_origin - strbeg)
675e93ee 1212 ));
d0880ea7
DM
1213 goto do_other_substr;
1214 }
1215
1216 /* success: we don't contradict the found floating substring
1217 * (and there's no anchored substr). */
d0880ea7 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee
DM
1219 " Found /%s^%s/m with rx_origin %ld...\n",
1220 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1221 }
1222 else {
2e759faa 1223 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
fe4f3442 1224 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1225 }
1226
ffad1e6a 1227 success_at_start:
e3c6feb0 1228
cad2e5aa 1229
dd170ff5
DM
1230 /* if we have a starting character class, then test that extra constraint.
1231 * (trie stclasses are too expensive to use here, we are better off to
1232 * leave it to regmatch itself) */
1233
f8fc2ecf 1234 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1235 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1236
2c75e362 1237 /* XXX this value could be pre-computed */
f8fc2ecf 1238 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1239 ? (reginfo->is_utf8_pat
1240 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1241 : STR_LEN(progi->regstclass))
66e933ab 1242 : 1);
1de06328 1243 char * endpos;
fa3bb21d 1244 char *s;
000dfd2d
DM
1245 /* latest pos that a matching float substr constrains rx start to */
1246 char *rx_max_float = NULL;
1247
c75a3985
DM
1248 /* if the current rx_origin is anchored, either by satisfying an
1249 * anchored substring constraint, or a /^.../m constraint, then we
1250 * can reject the current origin if the start class isn't found
1251 * at the current position. If we have a float-only match, then
1252 * rx_origin is constrained to a range; so look for the start class
1253 * in that range. if neither, then look for the start class in the
1254 * whole rest of the string */
1255
dd170ff5
DM
1256 /* XXX DAPM it's not clear what the minlen test is for, and why
1257 * it's not used in the floating case. Nothing in the test suite
1258 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1259 * Here are some old comments, which may or may not be correct:
1260 *
1261 * minlen == 0 is possible if regstclass is \b or \B,
1262 * and the fixed substr is ''$.
1263 * Since minlen is already taken into account, rx_origin+1 is
1264 * before strend; accidentally, minlen >= 1 guaranties no false
1265 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1266 * 0) below assumes that regstclass does not come from lookahead...
1267 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1268 * This leaves EXACTF-ish only, which are dealt with in
1269 * find_byclass().
1270 */
1271
7d2d37f5 1272 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
fa3bb21d 1273 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d
DM
1274 else if (prog->float_substr || prog->float_utf8) {
1275 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1276 endpos= HOP3c(rx_max_float, cl_l, strend);
1277 }
1de06328
YO
1278 else
1279 endpos= strend;
1280
1dc475d0
DM
1281 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1282 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
c43b5520 1283 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1dc475d0 1284 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1285 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1286
c43b5520 1287 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1288 reginfo);
be778b1a 1289 if (!s) {
6eb5f6b9 1290 if (endpos == strend) {
a3621e74 1291 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1292 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1293 goto fail;
1294 }
a3621e74 1295 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1296 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1297 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1298 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1299 goto fail;
9fed8d02 1300
6eb5f6b9 1301 /* Contradict one of substrings */
97136c8a
DM
1302 if (prog->anchored_substr || prog->anchored_utf8) {
1303 if (prog->substrs->check_ix == 1) { /* check is float */
1304 /* Have both, check_string is floating */
1305 assert(rx_origin + start_shift <= check_at);
1306 if (rx_origin + start_shift != check_at) {
1307 /* not at latest position float substr could match:
c75a3985
DM
1308 * Recheck anchored substring, but not floating.
1309 * The condition above is in bytes rather than
1310 * chars for efficiency. It's conservative, in
1311 * that it errs on the side of doing 'goto
88203927
DM
1312 * do_other_substr'. In this case, at worst,
1313 * an extra anchored search may get done, but in
1314 * practice the extra fbm_instr() is likely to
1315 * get skipped anyway. */
97136c8a 1316 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee
DM
1317 " about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1318 (long)(other_last - strbeg),
1319 (IV)(rx_origin - strbeg)
1320 ));
97136c8a 1321 goto do_other_substr;
3369914b 1322 }
3369914b
DM
1323 }
1324 }
97136c8a 1325 else {
9fed8d02
DM
1326 /* float-only */
1327
7d2d37f5 1328 if (ml_anch) {
c75a3985
DM
1329 /* In the presence of ml_anch, we might be able to
1330 * find another \n without breaking the current float
1331 * constraint. */
1332
1333 /* strictly speaking this should be HOP3c(..., 1, ...),
1334 * but since we goto a block of code that's going to
1335 * search for the next \n if any, its safe here */
9fed8d02 1336 rx_origin++;
9fed8d02 1337 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee 1338 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1339 PL_colors[0], PL_colors[1],
675e93ee 1340 (long)(rx_origin - strbeg)) );
9fed8d02 1341 goto postprocess_substr_matches;
ab60c45a 1342 }
c75a3985
DM
1343
1344 /* strictly speaking this can never be true; but might
1345 * be if we ever allow intuit without substrings */
1346 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1347 goto fail;
c75a3985 1348
000dfd2d 1349 rx_origin = rx_max_float;
9fed8d02
DM
1350 }
1351
c75a3985
DM
1352 /* at this point, any matching substrings have been
1353 * contradicted. Start again... */
1354
9fed8d02 1355 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1356
1357 /* uses bytes rather than char calculations for efficiency.
1358 * It's conservative: it errs on the side of doing 'goto restart',
1359 * where there is code that does a proper char-based test */
9fed8d02 1360 if (rx_origin + start_shift + end_shift > strend) {
40268e5b 1361 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
9fed8d02
DM
1362 " Could not match STCLASS...\n") );
1363 goto fail;
1364 }
1365 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee 1366 " about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
9fed8d02 1367 (prog->substrs->check_ix ? "floating" : "anchored"),
675e93ee
DM
1368 (long)(rx_origin + start_shift - strbeg),
1369 (IV)(rx_origin - strbeg)
1370 ));
9fed8d02 1371 goto restart;
6eb5f6b9 1372 }
9fed8d02 1373
c75a3985
DM
1374 /* Success !!! */
1375
5f9c6575 1376 if (rx_origin != s) {
a3621e74 1377 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1378 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1379 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1380 );
1381 }
1382 else {
a3621e74 1383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1384 " Does not contradict STCLASS...\n");
b7953727
JH
1385 );
1386 }
6eb5f6b9 1387 }
ffad1e6a
DM
1388
1389 /* Decide whether using the substrings helped */
1390
1391 if (rx_origin != strpos) {
1392 /* Fixed substring is found far enough so that the match
1393 cannot start at strpos. */
1394
1395 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1396 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1397 }
1398 else {
70563e16
DM
1399 /* The found rx_origin position does not prohibit matching at
1400 * strpos, so calling intuit didn't gain us anything. Decrement
1401 * the BmUSEFUL() count on the check substring, and if we reach
1402 * zero, free it. */
1403 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1404 && (utf8_target ? (
1405 prog->check_utf8 /* Could be deleted already */
1406 && --BmUSEFUL(prog->check_utf8) < 0
1407 && (prog->check_utf8 == prog->float_utf8)
1408 ) : (
1409 prog->check_substr /* Could be deleted already */
1410 && --BmUSEFUL(prog->check_substr) < 0
1411 && (prog->check_substr == prog->float_substr)
1412 )))
1413 {
1414 /* If flags & SOMETHING - do not do it many times on the same match */
1415 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1416 /* XXX Does the destruction order has to change with utf8_target? */
1417 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1418 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1419 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1420 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1421 check = NULL; /* abort */
ffad1e6a
DM
1422 /* XXXX This is a remnant of the old implementation. It
1423 looks wasteful, since now INTUIT can use many
1424 other heuristics. */
1425 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1426 }
1427 }
1428
1429 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1430 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1431 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1432
c765d6e0 1433 return rx_origin;
2c2d71f5
JH
1434
1435 fail_finish: /* Substring not found */
33b8afdf 1436 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1437 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1438 fail:
a3621e74 1439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1440 PL_colors[4], PL_colors[5]));
bd61b366 1441 return NULL;
cad2e5aa 1442}
9661b544 1443
70563e16 1444
a0a388a1 1445#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1446 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78
KW
1447 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1448 trie_utf8l, trie_flu8 } \
e7fd4aa1
KW
1449 trie_type = ((scan->flags == EXACT) \
1450 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1451 : (scan->flags == EXACTL) \
1452 ? (utf8_target ? trie_utf8l : trie_plain) \
1453 : (scan->flags == EXACTFA) \
1454 ? (utf8_target \
1455 ? trie_utf8_exactfa_fold \
1456 : trie_latin_utf8_exactfa_fold) \
1457 : (scan->flags == EXACTFLU8 \
1458 ? trie_flu8 \
1459 : (utf8_target \
1460 ? trie_utf8_fold \
1461 : trie_latin_utf8_fold)))
fab2782b 1462
fd3249ee 1463#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1464STMT_START { \
fab2782b 1465 STRLEN skiplen; \
baa60164 1466 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1467 switch (trie_type) { \
a4525e78 1468 case trie_flu8: \
780fcc9f 1469 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1470 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1471 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1472 } \
a4525e78 1473 goto do_trie_utf8_fold; \
31f05a37 1474 case trie_utf8_exactfa_fold: \
baa60164 1475 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1476 /* FALLTHROUGH */ \
fab2782b 1477 case trie_utf8_fold: \
a4525e78 1478 do_trie_utf8_fold: \
fab2782b 1479 if ( foldlen>0 ) { \
c80e42f3 1480 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1481 foldlen -= len; \
1482 uscan += len; \
1483 len=0; \
1484 } else { \
445bf929 1485 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b 1486 len = UTF8SKIP(uc); \
5f560d8a 1487 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1488 foldlen -= skiplen; \
1489 uscan = foldbuf + skiplen; \
1490 } \
1491 break; \
baa60164
KW
1492 case trie_latin_utf8_exactfa_fold: \
1493 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1494 /* FALLTHROUGH */ \
fab2782b
YO
1495 case trie_latin_utf8_fold: \
1496 if ( foldlen>0 ) { \
c80e42f3 1497 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1498 foldlen -= len; \
1499 uscan += len; \
1500 len=0; \
1501 } else { \
1502 len = 1; \
31f05a37 1503 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1504 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1505 foldlen -= skiplen; \
1506 uscan = foldbuf + skiplen; \
1507 } \
1508 break; \
a4525e78 1509 case trie_utf8l: \
780fcc9f 1510 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1511 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1512 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1513 } \
780fcc9f 1514 /* FALLTHROUGH */ \
fab2782b 1515 case trie_utf8: \
c80e42f3 1516 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1517 break; \
1518 case trie_plain: \
1519 uvc = (UV)*uc; \
1520 len = 1; \
1521 } \
1522 if (uvc < 256) { \
1523 charid = trie->charmap[ uvc ]; \
1524 } \
1525 else { \
1526 charid = 0; \
1527 if (widecharmap) { \
1528 SV** const svpp = hv_fetch(widecharmap, \
1529 (char*)&uvc, sizeof(UV), 0); \
1530 if (svpp) \
1531 charid = (U16)SvIV(*svpp); \
1532 } \
1533 } \
4cadc6a9
YO
1534} STMT_END
1535
ae7c5b9b
KW
1536#define DUMP_EXEC_POS(li,s,doutf8) \
1537 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1538 startpos, doutf8)
1539
c84a03c5 1540#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1541STMT_START { \
1542 while (s <= e) { \
c84a03c5 1543 if ( (COND) \
fac1af77 1544 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1545 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1546 goto got_it; \
1547 s++; \
1548 } \
1549} STMT_END
1550
c84a03c5 1551#define REXEC_FBC_UTF8_SCAN(CODE) \
4cadc6a9 1552STMT_START { \
9a902117 1553 while (s < strend) { \
c84a03c5 1554 CODE \
9a902117 1555 s += UTF8SKIP(s); \
4cadc6a9
YO
1556 } \
1557} STMT_END
1558
c84a03c5 1559#define REXEC_FBC_SCAN(CODE) \
4cadc6a9
YO
1560STMT_START { \
1561 while (s < strend) { \
c84a03c5 1562 CODE \
4cadc6a9
YO
1563 s++; \
1564 } \
1565} STMT_END
1566
05bd126c
KW
1567#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1568REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1569 if (COND) { \
1570 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1571 goto got_it; \
1572 else \
1573 tmp = doevery; \
1574 } \
1575 else \
1576 tmp = 1; \
4cadc6a9
YO
1577)
1578
05bd126c
KW
1579#define REXEC_FBC_CLASS_SCAN(COND) \
1580REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1581 if (COND) { \
1582 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1583 goto got_it; \
1584 else \
1585 tmp = doevery; \
1586 } \
1587 else \
1588 tmp = 1; \
4cadc6a9
YO
1589)
1590
c84a03c5 1591#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1592 if (utf8_target) { \
c84a03c5 1593 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
e1d1eefb
YO
1594 } \
1595 else { \
c84a03c5 1596 REXEC_FBC_CLASS_SCAN(COND); \
d981ef24 1597 }
05bd126c 1598
05bd126c
KW
1599/* The three macros below are slightly different versions of the same logic.
1600 *
1601 * The first is for /a and /aa when the target string is UTF-8. This can only
1602 * match ascii, but it must advance based on UTF-8. The other two handle the
1603 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1604 * for the boundary (or non-boundary) between a word and non-word character.
1605 * The utf8 and non-utf8 cases have the same logic, but the details must be
1606 * different. Find the "wordness" of the character just prior to this one, and
1607 * compare it with the wordness of this one. If they differ, we have a
1608 * boundary. At the beginning of the string, pretend that the previous
1609 * character was a new-line.
1610 *
1611 * All these macros uncleanly have side-effects with each other and outside
1612 * variables. So far it's been too much trouble to clean-up
1613 *
1614 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1615 * a word character or not.
1616 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1617 * word/non-word
1618 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1619 *
1620 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1621 * are looking for a boundary or for a non-boundary. If we are looking for a
1622 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1623 * see if this tentative match actually works, and if so, to quit the loop
1624 * here. And vice-versa if we are looking for a non-boundary.
1625 *
1626 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1627 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1628 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1629 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1630 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1631 * complement. But in that branch we complement tmp, meaning that at the
1632 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1633 * which means at the top of the loop in the next iteration, it is
1634 * TEST_NON_UTF8(s-1) */
b2f4e957 1635#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1636 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1637 tmp = TEST_NON_UTF8(tmp); \
1638 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1639 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1640 tmp = !tmp; \
1641 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1642 } \
1643 else { \
1644 IF_FAIL; \
1645 } \
1646 ); \
1647
1648/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1649 * TEST_UTF8 is a macro that for the same input code points returns identically
1650 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 1651#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1652 if (s == reginfo->strbeg) { \
1653 tmp = '\n'; \
1654 } \
1655 else { /* Back-up to the start of the previous character */ \
1656 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1657 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1658 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1659 } \
1660 tmp = TEST_UV(tmp); \
1661 LOAD_UTF8_CHARCLASS_ALNUM(); \
1662 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1663 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1664 tmp = !tmp; \
1665 IF_SUCCESS; \
1666 } \
1667 else { \
1668 IF_FAIL; \
1669 } \
1670 );
cfaf538b 1671
05bd126c
KW
1672/* Like the above two macros. UTF8_CODE is the complete code for handling
1673 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1674 * macros below */
baa60164 1675#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1676 if (utf8_target) { \
05bd126c 1677 UTF8_CODE \
63ac0dad
KW
1678 } \
1679 else { /* Not utf8 */ \
9d9163fb 1680 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1681 tmp = TEST_NON_UTF8(tmp); \
05bd126c 1682 REXEC_FBC_SCAN( /* advances s while s < strend */ \
63ac0dad 1683 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1684 IF_SUCCESS; \
760cfa8e 1685 tmp = !tmp; \
63ac0dad
KW
1686 } \
1687 else { \
1688 IF_FAIL; \
1689 } \
1690 ); \
1691 } \
c8519dc7
KW
1692 /* Here, things have been set up by the previous code so that tmp is the \
1693 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1694 * utf8ness of the target). We also have to check if this matches against \
1695 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1696 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1697 * string */ \
1698 if (tmp == ! TEST_NON_UTF8('\n')) { \
1699 IF_SUCCESS; \
1700 } \
1701 else { \
1702 IF_FAIL; \
1703 }
63ac0dad 1704
ae7c5b9b
KW
1705/* This is the macro to use when we want to see if something that looks like it
1706 * could match, actually does, and if so exits the loop */
1707#define REXEC_FBC_TRYIT \
1708 if ((reginfo->intuit || regtry(reginfo, &s))) \
1709 goto got_it
1710
1711/* The only difference between the BOUND and NBOUND cases is that
1712 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1713 * NBOUND. This is accomplished by passing it as either the if or else clause,
1714 * with the other one being empty (PLACEHOLDER is defined as empty).
1715 *
1716 * The TEST_FOO parameters are for operating on different forms of input, but
1717 * all should be ones that return identically for the same underlying code
1718 * points */
1719#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1720 FBC_BOUND_COMMON( \
1721 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1722 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1723
44129e46 1724#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1725 FBC_BOUND_COMMON( \
1726 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1727 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1728
1729#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1730 FBC_BOUND_COMMON( \
1731 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1732 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1733
44129e46 1734#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1735 FBC_BOUND_COMMON( \
1736 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1737 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1738
64935bc6
KW
1739/* Takes a pointer to an inversion list, a pointer to its corresponding
1740 * inversion map, and a code point, and returns the code point's value
1741 * according to the two arrays. It assumes that all code points have a value.
1742 * This is used as the base macro for macros for particular properties */
1743#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
1744 invmap[_invlist_search(invlist, cp)]
1745
1746/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1747 * of a code point, returning the value for the first code point in the string.
1748 * And it takes the particular macro name that finds the desired value given a
1749 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1750#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1751 (__ASSERT_(pos < strend) \
1752 /* Note assumes is valid UTF-8 */ \
1753 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1754
1755/* Returns the GCB value for the input code point */
1756#define getGCB_VAL_CP(cp) \
1757 _generic_GET_BREAK_VAL_CP( \
1758 PL_GCB_invlist, \
02f811dd 1759 _Perl_GCB_invmap, \
64935bc6
KW
1760 (cp))
1761
1762/* Returns the GCB value for the first code point in the UTF-8 encoded string
1763 * bounded by pos and strend */
1764#define getGCB_VAL_UTF8(pos, strend) \
1765 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 1766
06ae2722
KW
1767
1768/* Returns the SB value for the input code point */
1769#define getSB_VAL_CP(cp) \
1770 _generic_GET_BREAK_VAL_CP( \
1771 PL_SB_invlist, \
bf4268fa 1772 _Perl_SB_invmap, \
06ae2722
KW
1773 (cp))
1774
1775/* Returns the SB value for the first code point in the UTF-8 encoded string
1776 * bounded by pos and strend */
1777#define getSB_VAL_UTF8(pos, strend) \
1778 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1779
ae3bb8ea
KW
1780/* Returns the WB value for the input code point */
1781#define getWB_VAL_CP(cp) \
1782 _generic_GET_BREAK_VAL_CP( \
1783 PL_WB_invlist, \
bf4268fa 1784 _Perl_WB_invmap, \
ae3bb8ea
KW
1785 (cp))
1786
1787/* Returns the WB value for the first code point in the UTF-8 encoded string
1788 * bounded by pos and strend */
1789#define getWB_VAL_UTF8(pos, strend) \
1790 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1791
786e8c11 1792/* We know what class REx starts with. Try to find this position... */
02d5137b 1793/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1794/* annoyingly all the vars in this routine have different names from their counterparts
1795 in regmatch. /grrr */
3c3eec57 1796STATIC char *
07be1b83 1797S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1798 const char *strend, regmatch_info *reginfo)
a687059c 1799{
73104a1b
KW
1800 dVAR;
1801 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1802 char *pat_string; /* The pattern's exactish string */
1803 char *pat_end; /* ptr to end char of pat_string */
1804 re_fold_t folder; /* Function for computing non-utf8 folds */
1805 const U8 *fold_array; /* array for folding ords < 256 */
1806 STRLEN ln;
1807 STRLEN lnc;
73104a1b
KW
1808 U8 c1;
1809 U8 c2;
1810 char *e;
1811 I32 tmp = 1; /* Scratch variable? */
ba44c216 1812 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1813 UV utf8_fold_flags = 0;
f9176b44 1814 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1815 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1816 with a result inverts that result, as 0^1 =
1817 1 and 1^1 = 0 */
1818 _char_class_number classnum;
1819
73104a1b 1820 RXi_GET_DECL(prog,progi);
2f7f8cb1 1821
73104a1b 1822 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1823
73104a1b
KW
1824 /* We know what class it must start with. */
1825 switch (OP(c)) {
a4525e78 1826 case ANYOFL:
780fcc9f 1827 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30
KW
1828
1829 if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
1830 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1831 }
1832
780fcc9f 1833 /* FALLTHROUGH */
ac44c12e 1834 case ANYOFD:
73104a1b
KW
1835 case ANYOF:
1836 if (utf8_target) {
1837 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1838 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1839 }
1840 else {
1841 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1842 }
1843 break;
73104a1b 1844
098b07d5
KW
1845 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1846 assert(! is_utf8_pat);
924ba076 1847 /* FALLTHROUGH */
73104a1b 1848 case EXACTFA:
984e6dd1 1849 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1850 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1851 goto do_exactf_utf8;
1852 }
1853 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1854 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1855 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1856
2fdb7295
KW
1857 case EXACTF: /* This node only generated for non-utf8 patterns */
1858 assert(! is_utf8_pat);
73104a1b 1859 if (utf8_target) {
73104a1b
KW
1860 utf8_fold_flags = 0;
1861 goto do_exactf_utf8;
1862 }
1863 fold_array = PL_fold;
1864 folder = foldEQ;
1865 goto do_exactf_non_utf8;
1866
1867 case EXACTFL:
780fcc9f 1868 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 1869 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1870 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1871 goto do_exactf_utf8;
1872 }
1873 fold_array = PL_fold_locale;
1874 folder = foldEQ_locale;
1875 goto do_exactf_non_utf8;
3c760661 1876
73104a1b 1877 case EXACTFU_SS:
984e6dd1 1878 if (is_utf8_pat) {
73104a1b
KW
1879 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1880 }
1881 goto do_exactf_utf8;
16d951b7 1882
a4525e78
KW
1883 case EXACTFLU8:
1884 if (! utf8_target) { /* All code points in this node require
1885 UTF-8 to express. */
1886 break;
1887 }
613abc6d
KW
1888 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1889 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
1890 goto do_exactf_utf8;
1891
73104a1b 1892 case EXACTFU:
984e6dd1
DM
1893 if (is_utf8_pat || utf8_target) {
1894 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1895 goto do_exactf_utf8;
1896 }
fac1af77 1897
73104a1b
KW
1898 /* Any 'ss' in the pattern should have been replaced by regcomp,
1899 * so we don't have to worry here about this single special case
1900 * in the Latin1 range */
1901 fold_array = PL_fold_latin1;
1902 folder = foldEQ_latin1;
1903
924ba076 1904 /* FALLTHROUGH */
73104a1b 1905
c52b8b12 1906 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
1907 are no glitches with fold-length differences
1908 between the target string and pattern */
1909
1910 /* The idea in the non-utf8 EXACTF* cases is to first find the
1911 * first character of the EXACTF* node and then, if necessary,
1912 * case-insensitively compare the full text of the node. c1 is the
1913 * first character. c2 is its fold. This logic will not work for
1914 * Unicode semantics and the german sharp ss, which hence should
1915 * not be compiled into a node that gets here. */
1916 pat_string = STRING(c);
1917 ln = STR_LEN(c); /* length to match in octets/bytes */
1918
1919 /* We know that we have to match at least 'ln' bytes (which is the
1920 * same as characters, since not utf8). If we have to match 3
1921 * characters, and there are only 2 availabe, we know without
1922 * trying that it will fail; so don't start a match past the
1923 * required minimum number from the far end */
ea3daa5d 1924 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1925
02d5137b 1926 if (reginfo->intuit && e < s) {
73104a1b
KW
1927 e = s; /* Due to minlen logic of intuit() */
1928 }
fac1af77 1929
73104a1b
KW
1930 c1 = *pat_string;
1931 c2 = fold_array[c1];
1932 if (c1 == c2) { /* If char and fold are the same */
1933 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1934 }
1935 else {
1936 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1937 }
1938 break;
fac1af77 1939
c52b8b12
KW
1940 do_exactf_utf8:
1941 {
73104a1b
KW
1942 unsigned expansion;
1943
1944 /* If one of the operands is in utf8, we can't use the simpler folding
1945 * above, due to the fact that many different characters can have the
1946 * same fold, or portion of a fold, or different- length fold */
1947 pat_string = STRING(c);
1948 ln = STR_LEN(c); /* length to match in octets/bytes */
1949 pat_end = pat_string + ln;
984e6dd1 1950 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1951 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1952 : ln;
1953
1954 /* We have 'lnc' characters to match in the pattern, but because of
1955 * multi-character folding, each character in the target can match
1956 * up to 3 characters (Unicode guarantees it will never exceed
1957 * this) if it is utf8-encoded; and up to 2 if not (based on the
1958 * fact that the Latin 1 folds are already determined, and the
1959 * only multi-char fold in that range is the sharp-s folding to
1960 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1961 * string character. Adjust lnc accordingly, rounding up, so that
1962 * if we need to match at least 4+1/3 chars, that really is 5. */
1963 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1964 lnc = (lnc + expansion - 1) / expansion;
1965
1966 /* As in the non-UTF8 case, if we have to match 3 characters, and
1967 * only 2 are left, it's guaranteed to fail, so don't start a
1968 * match that would require us to go beyond the end of the string
1969 */
ea3daa5d 1970 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1971
02d5137b 1972 if (reginfo->intuit && e < s) {
73104a1b
KW
1973 e = s; /* Due to minlen logic of intuit() */
1974 }
0658cdde 1975
73104a1b
KW
1976 /* XXX Note that we could recalculate e to stop the loop earlier,
1977 * as the worst case expansion above will rarely be met, and as we
1978 * go along we would usually find that e moves further to the left.
1979 * This would happen only after we reached the point in the loop
1980 * where if there were no expansion we should fail. Unclear if
1981 * worth the expense */
1982
1983 while (s <= e) {
1984 char *my_strend= (char *)strend;
1985 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1986 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1987 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1988 {
1989 goto got_it;
1990 }
1991 s += (utf8_target) ? UTF8SKIP(s) : 1;
1992 }
1993 break;
1994 }
236d82fd 1995
73104a1b 1996 case BOUNDL:
780fcc9f 1997 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 1998 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
1999 if (! IN_UTF8_CTYPE_LOCALE) {
2000 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2001 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2002 }
64935bc6
KW
2003 goto do_boundu;
2004 }
2005
236d82fd 2006 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b 2007 break;
64935bc6 2008
73104a1b 2009 case NBOUNDL:
780fcc9f 2010 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2011 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2012 if (! IN_UTF8_CTYPE_LOCALE) {
2013 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2014 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2015 }
64935bc6
KW
2016 goto do_nboundu;
2017 }
2018
236d82fd 2019 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b 2020 break;
64935bc6
KW
2021
2022 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2023 meaning */
2024 assert(FLAGS(c) == TRADITIONAL_BOUND);
2025
236d82fd 2026 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b 2027 break;
64935bc6
KW
2028
2029 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2030 meaning */
2031 assert(FLAGS(c) == TRADITIONAL_BOUND);
2032
44129e46 2033 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2034 break;
64935bc6
KW
2035
2036 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2037 meaning */
2038 assert(FLAGS(c) == TRADITIONAL_BOUND);
2039
236d82fd 2040 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b 2041 break;
64935bc6
KW
2042
2043 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2044 meaning */
2045 assert(FLAGS(c) == TRADITIONAL_BOUND);
2046
44129e46 2047 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2048 break;
64935bc6 2049
73104a1b 2050 case NBOUNDU:
64935bc6
KW
2051 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2052 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2053 break;
2054 }
2055
2056 do_nboundu:
2057
2058 to_complement = 1;
2059 /* FALLTHROUGH */
2060
2061 case BOUNDU:
2062 do_boundu:
2063 switch((bound_type) FLAGS(c)) {
2064 case TRADITIONAL_BOUND:
2065 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2066 break;
2067 case GCB_BOUND:
a7a8bd1e 2068 if (s == reginfo->strbeg) {
67481c39 2069 if (reginfo->intuit || regtry(reginfo, &s))
64935bc6
KW
2070 {
2071 goto got_it;
2072 }
a7a8bd1e
KW
2073
2074 /* Didn't match. Try at the next position (if there is one) */
64935bc6 2075 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2076 if (UNLIKELY(s >= reginfo->strend)) {
2077 break;
2078 }
64935bc6
KW
2079 }
2080
2081 if (utf8_target) {
85e5f08b 2082 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2083 reghop3((U8*)s, -1,
2084 (U8*)(reginfo->strbeg)),
2085 (U8*) reginfo->strend);
2086 while (s < strend) {
85e5f08b 2087 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6 2088 (U8*) reginfo->strend);
00e3344b
KW
2089 if ( (to_complement ^ isGCB(before, after))
2090 && (reginfo->intuit || regtry(reginfo, &s)))
2091 {
2092 goto got_it;
64935bc6 2093 }
43a7bd62 2094 before = after;
64935bc6
KW
2095 s += UTF8SKIP(s);
2096 }
2097 }
2098 else { /* Not utf8. Everything is a GCB except between CR and
2099 LF */
2100 while (s < strend) {
00e3344b
KW
2101 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2102 || UCHARAT(s) != '\n'))
2103 && (reginfo->intuit || regtry(reginfo, &s)))
64935bc6 2104 {
00e3344b 2105 goto got_it;
64935bc6 2106 }
43a7bd62 2107 s++;
64935bc6
KW
2108 }
2109 }
2110
6de80efc
KW
2111 /* And, since this is a bound, it can match after the final
2112 * character in the string */
67481c39 2113 if ((reginfo->intuit || regtry(reginfo, &s))) {
64935bc6
KW
2114 goto got_it;
2115 }
2116 break;
ae3bb8ea 2117
06ae2722 2118 case SB_BOUND:
a7a8bd1e 2119 if (s == reginfo->strbeg) {
67481c39 2120 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2121 goto got_it;
2122 }
06ae2722 2123 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2124 if (UNLIKELY(s >= reginfo->strend)) {
2125 break;
2126 }
06ae2722
KW
2127 }
2128
2129 if (utf8_target) {
85e5f08b 2130 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2131 -1,
2132 (U8*)(reginfo->strbeg)),
2133 (U8*) reginfo->strend);
2134 while (s < strend) {
85e5f08b 2135 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722 2136 (U8*) reginfo->strend);
00e3344b
KW
2137 if ((to_complement ^ isSB(before,
2138 after,
2139 (U8*) reginfo->strbeg,
2140 (U8*) s,
2141 (U8*) reginfo->strend,
2142 utf8_target))
2143 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2144 {
00e3344b 2145 goto got_it;
06ae2722 2146 }
43a7bd62 2147 before = after;
06ae2722
KW
2148 s += UTF8SKIP(s);
2149 }
2150 }
2151 else { /* Not utf8. */
85e5f08b 2152 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2153 while (s < strend) {
85e5f08b 2154 SB_enum after = getSB_VAL_CP((U8) *s);
00e3344b
KW
2155 if ((to_complement ^ isSB(before,
2156 after,
2157 (U8*) reginfo->strbeg,
2158 (U8*) s,
2159 (U8*) reginfo->strend,
2160 utf8_target))
2161 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2162 {
00e3344b 2163 goto got_it;
06ae2722 2164 }
43a7bd62 2165 before = after;
06ae2722
KW
2166 s++;
2167 }
2168 }
2169
2170 /* Here are at the final position in the target string. The SB
2171 * value is always true here, so matches, depending on other
2172 * constraints */
67481c39 2173 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2174 goto got_it;
2175 }
2176
2177 break;
2178
ae3bb8ea
KW
2179 case WB_BOUND:
2180 if (s == reginfo->strbeg) {
67481c39 2181 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2182 goto got_it;
2183 }
2184 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2185 if (UNLIKELY(s >= reginfo->strend)) {
2186 break;
2187 }
ae3bb8ea
KW
2188 }
2189
2190 if (utf8_target) {
2191 /* We are at a boundary between char_sub_0 and char_sub_1.
2192 * We also keep track of the value for char_sub_-1 as we
2193 * loop through the line. Context may be needed to make a
2194 * determination, and if so, this can save having to
2195 * recalculate it */
85e5f08b
KW
2196 WB_enum previous = WB_UNKNOWN;
2197 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2198 reghop3((U8*)s,
2199 -1,
2200 (U8*)(reginfo->strbeg)),
2201 (U8*) reginfo->strend);
2202 while (s < strend) {
85e5f08b 2203 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea 2204 (U8*) reginfo->strend);
00e3344b
KW
2205 if ((to_complement ^ isWB(previous,
2206 before,
2207 after,
2208 (U8*) reginfo->strbeg,
2209 (U8*) s,
2210 (U8*) reginfo->strend,
2211 utf8_target))
2212 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2213 {
00e3344b 2214 goto got_it;
ae3bb8ea 2215 }
43a7bd62
KW
2216 previous = before;
2217 before = after;
ae3bb8ea
KW
2218 s += UTF8SKIP(s);
2219 }
2220 }
2221 else { /* Not utf8. */
85e5f08b
KW
2222 WB_enum previous = WB_UNKNOWN;
2223 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2224 while (s < strend) {
85e5f08b 2225 WB_enum after = getWB_VAL_CP((U8) *s);
00e3344b
KW
2226 if ((to_complement ^ isWB(previous,
2227 before,
2228 after,
2229 (U8*) reginfo->strbeg,
2230 (U8*) s,
2231 (U8*) reginfo->strend,
2232 utf8_target))
2233 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2234 {
00e3344b 2235 goto got_it;
ae3bb8ea 2236 }
43a7bd62
KW
2237 previous = before;
2238 before = after;
ae3bb8ea
KW
2239 s++;
2240 }
2241 }
2242
67481c39 2243 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2244 goto got_it;
2245 }
64935bc6 2246 }
73104a1b 2247 break;
64935bc6 2248
73104a1b
KW
2249 case LNBREAK:
2250 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2251 is_LNBREAK_latin1_safe(s, strend)
2252 );
2253 break;
3018b823
KW
2254
2255 /* The argument to all the POSIX node types is the class number to pass to
2256 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2257
2258 case NPOSIXL:
2259 to_complement = 1;
2260 /* FALLTHROUGH */
2261
2262 case POSIXL:
780fcc9f 2263 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
2264 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2265 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2266 break;
3018b823
KW
2267
2268 case NPOSIXD:
2269 to_complement = 1;
2270 /* FALLTHROUGH */
2271
2272 case POSIXD:
2273 if (utf8_target) {
2274 goto posix_utf8;
2275 }
2276 goto posixa;
2277
2278 case NPOSIXA:
2279 if (utf8_target) {
2280 /* The complement of something that matches only ASCII matches all
837226c8
KW
2281 * non-ASCII, plus everything in ASCII that isn't in the class. */
2282 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
3018b823
KW
2283 || ! _generic_isCC_A(*s, FLAGS(c)));
2284 break;
2285 }
2286
2287 to_complement = 1;
2288 /* FALLTHROUGH */
2289
73104a1b 2290 case POSIXA:
3018b823 2291 posixa:
73104a1b 2292 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
2293 * byte invariant character. */
2294 REXEC_FBC_CLASS_SCAN(
2295 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2296 break;
3018b823
KW
2297
2298 case NPOSIXU:
2299 to_complement = 1;
2300 /* FALLTHROUGH */
2301
2302 case POSIXU:
2303 if (! utf8_target) {
2304 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2305 FLAGS(c))));
2306 }
2307 else {
2308
c52b8b12 2309 posix_utf8:
3018b823
KW
2310 classnum = (_char_class_number) FLAGS(c);
2311 if (classnum < _FIRST_NON_SWASH_CC) {
2312 while (s < strend) {
2313
2314 /* We avoid loading in the swash as long as possible, but
2315 * should we have to, we jump to a separate loop. This
2316 * extra 'if' statement is what keeps this code from being
2317 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2318 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2319 goto found_above_latin1;
2320 }
2321 if ((UTF8_IS_INVARIANT(*s)
2322 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2323 classnum)))
2324 || (UTF8_IS_DOWNGRADEABLE_START(*s)
2325 && to_complement ^ cBOOL(
a62b247b 2326 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
94bb8c36 2327 *(s + 1)),
3018b823
KW
2328 classnum))))
2329 {
02d5137b 2330 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
2331 goto got_it;
2332 else {
2333 tmp = doevery;
2334 }
2335 }
2336 else {
2337 tmp = 1;
2338 }
2339 s += UTF8SKIP(s);
2340 }
2341 }
2342 else switch (classnum) { /* These classes are implemented as
2343 macros */
779cf272 2344 case _CC_ENUM_SPACE:
3018b823
KW
2345 REXEC_FBC_UTF8_CLASS_SCAN(
2346 to_complement ^ cBOOL(isSPACE_utf8(s)));
2347 break;
2348
2349 case _CC_ENUM_BLANK:
2350 REXEC_FBC_UTF8_CLASS_SCAN(
2351 to_complement ^ cBOOL(isBLANK_utf8(s)));
2352 break;
2353
2354 case _CC_ENUM_XDIGIT:
2355 REXEC_FBC_UTF8_CLASS_SCAN(
2356 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2357 break;
2358
2359 case _CC_ENUM_VERTSPACE:
2360 REXEC_FBC_UTF8_CLASS_SCAN(
2361 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2362 break;
2363
2364 case _CC_ENUM_CNTRL:
2365 REXEC_FBC_UTF8_CLASS_SCAN(
2366 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2367 break;
2368
2369 default:
2370 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
e5964223 2371 NOT_REACHED; /* NOTREACHED */
3018b823
KW
2372 }
2373 }
2374 break;
2375
2376 found_above_latin1: /* Here we have to load a swash to get the result
2377 for the current code point */
2378 if (! PL_utf8_swash_ptrs[classnum]) {
2379 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2380 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2381 _core_swash_init("utf8",
2382 "",
2383 &PL_sv_undef, 1, 0,
2384 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2385 }
2386
2387 /* This is a copy of the loop above for swash classes, though using the
2388 * FBC macro instead of being expanded out. Since we've loaded the
2389 * swash, we don't have to check for that each time through the loop */
2390 REXEC_FBC_UTF8_CLASS_SCAN(
2391 to_complement ^ cBOOL(_generic_utf8(
2392 classnum,
2393 s,
2394 swash_fetch(PL_utf8_swash_ptrs[classnum],
2395 (U8 *) s, TRUE))));
73104a1b
KW
2396 break;
2397
2398 case AHOCORASICKC:
2399 case AHOCORASICK:
2400 {
2401 DECL_TRIE_TYPE(c);
2402 /* what trie are we using right now */
2403 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2404 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2405 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2406
2407 const char *last_start = strend - trie->minlen;
6148ee25 2408#ifdef DEBUGGING
73104a1b 2409 const char *real_start = s;
6148ee25 2410#endif
73104a1b
KW
2411 STRLEN maxlen = trie->maxlen;
2412 SV *sv_points;
2413 U8 **points; /* map of where we were in the input string
2414 when reading a given char. For ASCII this
2415 is unnecessary overhead as the relationship
2416 is always 1:1, but for Unicode, especially
2417 case folded Unicode this is not true. */
2418 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2419 U8 *bitmap=NULL;
2420
2421
2422 GET_RE_DEBUG_FLAGS_DECL;
2423
2424 /* We can't just allocate points here. We need to wrap it in
2425 * an SV so it gets freed properly if there is a croak while
2426 * running the match */
2427 ENTER;
2428 SAVETMPS;
2429 sv_points=newSV(maxlen * sizeof(U8 *));
2430 SvCUR_set(sv_points,
2431 maxlen * sizeof(U8 *));
2432 SvPOK_on(sv_points);
2433 sv_2mortal(sv_points);
2434 points=(U8**)SvPV_nolen(sv_points );
2435 if ( trie_type != trie_utf8_fold
2436 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2437 {
2438 if (trie->bitmap)
2439 bitmap=(U8*)trie->bitmap;
2440 else
2441 bitmap=(U8*)ANYOF_BITMAP(c);
2442 }
2443 /* this is the Aho-Corasick algorithm modified a touch
2444 to include special handling for long "unknown char" sequences.
2445 The basic idea being that we use AC as long as we are dealing
2446 with a possible matching char, when we encounter an unknown char
2447 (and we have not encountered an accepting state) we scan forward
2448 until we find a legal starting char.
2449 AC matching is basically that of trie matching, except that when
2450 we encounter a failing transition, we fall back to the current
2451 states "fail state", and try the current char again, a process
2452 we repeat until we reach the root state, state 1, or a legal
2453 transition. If we fail on the root state then we can either
2454 terminate if we have reached an accepting state previously, or
2455 restart the entire process from the beginning if we have not.
2456
2457 */
2458 while (s <= last_start) {
2459 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2460 U8 *uc = (U8*)s;
2461 U16 charid = 0;
2462 U32 base = 1;
2463 U32 state = 1;
2464 UV uvc = 0;
2465 STRLEN len = 0;
2466 STRLEN foldlen = 0;
2467 U8 *uscan = (U8*)NULL;
2468 U8 *leftmost = NULL;
2469#ifdef DEBUGGING
2470 U32 accepted_word= 0;
786e8c11 2471#endif
73104a1b
KW
2472 U32 pointpos = 0;
2473
2474 while ( state && uc <= (U8*)strend ) {
2475 int failed=0;
2476 U32 word = aho->states[ state ].wordnum;
2477
2478 if( state==1 ) {
2479 if ( bitmap ) {
2480 DEBUG_TRIE_EXECUTE_r(
2481 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2482 dump_exec_pos( (char *)uc, c, strend, real_start,
2483 (char *)uc, utf8_target );
2484 PerlIO_printf( Perl_debug_log,
2485 " Scanning for legal start char...\n");
2486 }
2487 );
2488 if (utf8_target) {
2489 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2490 uc += UTF8SKIP(uc);
2491 }
2492 } else {
2493 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2494 uc++;
2495 }
786e8c11 2496 }
73104a1b 2497 s= (char *)uc;
07be1b83 2498 }
73104a1b
KW
2499 if (uc >(U8*)last_start) break;
2500 }
2501
2502 if ( word ) {
2503 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2504 if (!leftmost || lpos < leftmost) {
2505 DEBUG_r(accepted_word=word);
2506 leftmost= lpos;
7016d6eb 2507 }
73104a1b 2508 if (base==0) break;
7016d6eb 2509
73104a1b
KW
2510 }
2511 points[pointpos++ % maxlen]= uc;
2512 if (foldlen || uc < (U8*)strend) {
2513 REXEC_TRIE_READ_CHAR(trie_type, trie,
2514 widecharmap, uc,
2515 uscan, len, uvc, charid, foldlen,
2516 foldbuf, uniflags);
2517 DEBUG_TRIE_EXECUTE_r({
2518 dump_exec_pos( (char *)uc, c, strend,
2519 real_start, s, utf8_target);
2520 PerlIO_printf(Perl_debug_log,
2521 " Charid:%3u CP:%4"UVxf" ",
2522 charid, uvc);
2523 });
2524 }
2525 else {
2526 len = 0;
2527 charid = 0;
2528 }
07be1b83 2529
73104a1b
KW
2530
2531 do {
6148ee25 2532#ifdef DEBUGGING
73104a1b 2533 word = aho->states[ state ].wordnum;
6148ee25 2534#endif
73104a1b
KW
2535 base = aho->states[ state ].trans.base;
2536
2537 DEBUG_TRIE_EXECUTE_r({
2538 if (failed)
2539 dump_exec_pos( (char *)uc, c, strend, real_start,
2540 s, utf8_target );
2541 PerlIO_printf( Perl_debug_log,
2542 "%sState: %4"UVxf", word=%"UVxf,
2543 failed ? " Fail transition to " : "",
2544 (UV)state, (UV)word);
2545 });
2546 if ( base ) {
2547 U32 tmp;
2548 I32 offset;
2549 if (charid &&
2550 ( ((offset = base + charid
2551 - 1 - trie->uniquecharcount)) >= 0)
2552 && ((U32)offset < trie->lasttrans)
2553 && trie->trans[offset].check == state
2554 && (tmp=trie->trans[offset].next))
2555 {
2556 DEBUG_TRIE_EXECUTE_r(
2557 PerlIO_printf( Perl_debug_log," - legal\n"));
2558 state = tmp;
2559 break;
07be1b83
YO
2560 }
2561 else {
786e8c11 2562 DEBUG_TRIE_EXECUTE_r(
73104a1b 2563 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2564 failed = 1;
73104a1b 2565 state = aho->fail[state];
07be1b83 2566 }
07be1b83 2567 }
73104a1b
KW
2568 else {
2569 /* we must be accepting here */
2570 DEBUG_TRIE_EXECUTE_r(
2571 PerlIO_printf( Perl_debug_log," - accepting\n"));
2572 failed = 1;
2573 break;
786e8c11 2574 }
73104a1b
KW
2575 } while(state);
2576 uc += len;
2577 if (failed) {
2578 if (leftmost)
2579 break;
2580 if (!state) state = 1;
07be1b83 2581 }
73104a1b
KW
2582 }
2583 if ( aho->states[ state ].wordnum ) {
2584 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2585 if (!leftmost || lpos < leftmost) {
2586 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2587 leftmost = lpos;
07be1b83
YO
2588 }
2589 }
73104a1b
KW
2590 if (leftmost) {
2591 s = (char*)leftmost;
2592 DEBUG_TRIE_EXECUTE_r({
2593 PerlIO_printf(
2594 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2595 (UV)accepted_word, (IV)(s - real_start)
2596 );
2597 });
02d5137b 2598 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2599 FREETMPS;
2600 LEAVE;
2601 goto got_it;
2602 }
2603 s = HOPc(s,1);
2604 DEBUG_TRIE_EXECUTE_r({
2605 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2606 });
2607 } else {
2608 DEBUG_TRIE_EXECUTE_r(
2609 PerlIO_printf( Perl_debug_log,"No match.\n"));
2610 break;
2611 }
2612 }
2613 FREETMPS;
2614 LEAVE;
2615 }
2616 break;
2617 default:
2618 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
2619 }
2620 return 0;
2621 got_it:
2622 return s;
6eb5f6b9
JH
2623}
2624
60165aa4
DM
2625/* set RX_SAVED_COPY, RX_SUBBEG etc.
2626 * flags have same meanings as with regexec_flags() */
2627
749f4950
DM
2628static void
2629S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2630 char *strbeg,
2631 char *strend,
2632 SV *sv,
2633 U32 flags,
2634 bool utf8_target)
2635{
2636 struct regexp *const prog = ReANY(rx);
2637
60165aa4
DM
2638 if (flags & REXEC_COPY_STR) {
2639#ifdef PERL_ANY_COW
2640 if (SvCANCOW(sv)) {
2641 if (DEBUG_C_TEST) {
2642 PerlIO_printf(Perl_debug_log,
2643 "Copy on write: regexp capture, type %d\n",
2644 (int) SvTYPE(sv));
2645 }
5411a0e5
DM
2646 /* Create a new COW SV to share the match string and store
2647 * in saved_copy, unless the current COW SV in saved_copy
2648 * is valid and suitable for our purpose */
2649 if (( prog->saved_copy
2650 && SvIsCOW(prog->saved_copy)
2651 && SvPOKp(prog->saved_copy)
2652 && SvIsCOW(sv)
2653 && SvPOKp(sv)
2654 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2655 {
5411a0e5
DM
2656 /* just reuse saved_copy SV */
2657 if (RXp_MATCH_COPIED(prog)) {
2658 Safefree(prog->subbeg);
2659 RXp_MATCH_COPIED_off(prog);
2660 }
2661 }
2662 else {
2663 /* create new COW SV to share string */
a76b0e90
DM
2664 RX_MATCH_COPY_FREE(rx);
2665 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2666 }
5411a0e5
DM
2667 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2668 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2669 prog->sublen = strend - strbeg;
2670 prog->suboffset = 0;
2671 prog->subcoffset = 0;
2672 } else
2673#endif
2674 {
99a90e59
FC
2675 SSize_t min = 0;
2676 SSize_t max = strend - strbeg;
ea3daa5d 2677 SSize_t sublen;
60165aa4
DM
2678
2679 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2680 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2681 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2682 ) { /* don't copy $' part of string */
2683 U32 n = 0;
2684 max = -1;
2685 /* calculate the right-most part of the string covered
2686 * by a capture. Due to look-ahead, this may be to
2687 * the right of $&, so we have to scan all captures */
2688 while (n <= prog->lastparen) {
2689 if (prog->offs[n].end > max)
2690 max = prog->offs[n].end;
2691 n++;
2692 }
2693 if (max == -1)
2694 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2695 ? prog->offs[0].start
2696 : 0;
2697 assert(max >= 0 && max <= strend - strbeg);
2698 }
2699
2700 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2701 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2702 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2703 ) { /* don't copy $` part of string */
2704 U32 n = 0;
2705 min = max;
2706 /* calculate the left-most part of the string covered
2707 * by a capture. Due to look-behind, this may be to
2708 * the left of $&, so we have to scan all captures */
2709 while (min && n <= prog->lastparen) {
2710 if ( prog->offs[n].start != -1
2711 && prog->offs[n].start < min)
2712 {
2713 min = prog->offs[n].start;
2714 }
2715 n++;
2716 }
2717 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2718 && min > prog->offs[0].end
2719 )
2720 min = prog->offs[0].end;
2721
2722 }
2723
2724 assert(min >= 0 && min <= max && min <= strend - strbeg);
2725 sublen = max - min;
2726
2727 if (RX_MATCH_COPIED(rx)) {
2728 if (sublen > prog->sublen)
2729 prog->subbeg =
2730 (char*)saferealloc(prog->subbeg, sublen+1);
2731 }
2732 else
2733 prog->subbeg = (char*)safemalloc(sublen+1);
2734 Copy(strbeg + min, prog->subbeg, sublen, char);
2735 prog->subbeg[sublen] = '\0';
2736 prog->suboffset = min;
2737 prog->sublen = sublen;
2738 RX_MATCH_COPIED_on(rx);
2739 }
2740 prog->subcoffset = prog->suboffset;
2741 if (prog->suboffset && utf8_target) {
2742 /* Convert byte offset to chars.
2743 * XXX ideally should only compute this if @-/@+
2744 * has been seen, a la PL_sawampersand ??? */
2745
2746 /* If there's a direct correspondence between the
2747 * string which we're matching and the original SV,
2748 * then we can use the utf8 len cache associated with
2749 * the SV. In particular, it means that under //g,
2750 * sv_pos_b2u() will use the previously cached
2751 * position to speed up working out the new length of
2752 * subcoffset, rather than counting from the start of
2753 * the string each time. This stops
2754 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2755 * from going quadratic */
2756 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2757 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2758 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2759 else
2760 prog->subcoffset = utf8_length((U8*)strbeg,
2761 (U8*)(strbeg+prog->suboffset));
2762 }
2763 }
2764 else {
2765 RX_MATCH_COPY_FREE(rx);
2766 prog->subbeg = strbeg;
2767 prog->suboffset = 0;
2768 prog->subcoffset = 0;
2769 prog->sublen = strend - strbeg;
2770 }
2771}
2772
2773
2774
fae667d5 2775
6eb5f6b9
JH
2776/*
2777 - regexec_flags - match a regexp against a string
2778 */
2779I32
5aaab254 2780Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2781 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2782/* stringarg: the point in the string at which to begin matching */
2783/* strend: pointer to null at end of string */
2784/* strbeg: real beginning of string */
2785/* minend: end of match must be >= minend bytes after stringarg. */
2786/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2787 * itself is accessed via the pointers above */
2788/* data: May be used for some additional optimizations.
d058ec57 2789 Currently unused. */
a340edde 2790/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2791
6eb5f6b9 2792{
8d919b0a 2793 struct regexp *const prog = ReANY(rx);
5aaab254 2794 char *s;
eb578fdb 2795 regnode *c;
03c83e26 2796 char *startpos;
ea3daa5d
FC
2797 SSize_t minlen; /* must match at least this many chars */
2798 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2799 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2800 I32 multiline;
f8fc2ecf 2801 RXi_GET_DECL(prog,progi);
02d5137b
DM
2802 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2803 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2804 regexp_paren_pair *swap = NULL;
006f26b2 2805 I32 oldsave;
a3621e74
YO
2806 GET_RE_DEBUG_FLAGS_DECL;
2807
7918f24d 2808 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2809 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2810
2811 /* Be paranoid... */
3dc78631 2812 if (prog == NULL) {
6eb5f6b9 2813 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
2814 }
2815
6c3fea77 2816 DEBUG_EXECUTE_r(
03c83e26 2817 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2818 "Matching");
2819 );
8adc0f72 2820
b342a604
DM
2821 startpos = stringarg;
2822
58430ea8 2823 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2824 MAGIC *mg;
2825
fef7148b
DM
2826 /* set reginfo->ganch, the position where \G can match */
2827
2828 reginfo->ganch =
2829 (flags & REXEC_IGNOREPOS)
2830 ? stringarg /* use start pos rather than pos() */
3dc78631 2831 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2832 /* Defined pos(): */
2833 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2834 : strbeg; /* pos() not defined; use start of string */
2835
2836 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2837 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2838
03c83e26
DM
2839 /* in the presence of \G, we may need to start looking earlier in
2840 * the string than the suggested start point of stringarg:
0b2c2a84 2841 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2842 * offset, such as
2843 * /..\G/: gofs = 2
2844 * /ab|c\G/: gofs = 1
2845 * or if the minimum offset isn't known, then we have to go back
2846 * to the start of the string, e.g. /w+\G/
2847 */
2bfbe302 2848
8e1490ee 2849 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2850 startpos = reginfo->ganch - prog->gofs;
2851 if (startpos <
2852 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2853 {
2854 DEBUG_r(PerlIO_printf(Perl_debug_log,
2855 "fail: ganch-gofs before earliest possible start\n"));
2856 return 0;
2857 }
2858 }
2859 else if (prog->gofs) {
b342a604
DM
2860 if (startpos - prog->gofs < strbeg)
2861 startpos = strbeg;
2862 else
2863 startpos -= prog->gofs;
03c83e26 2864 }
58430ea8 2865 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2866 startpos = strbeg;
03c83e26
DM
2867 }
2868
2869 minlen = prog->minlen;
b342a604 2870 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2871 DEBUG_r(PerlIO_printf(Perl_debug_log,
2872 "Regex match can't succeed, so not even tried\n"));
2873 return 0;
2874 }
2875
63a3746a
DM
2876 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2877 * which will call destuctors to reset PL_regmatch_state, free higher
2878 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2879 * regmatch_info_aux_eval */
2880
2881 oldsave = PL_savestack_ix;
2882
dfa77d06
DM
2883 s = startpos;
2884
e322109a 2885 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2886 && !(flags & REXEC_CHECKED))
2887 {
dfa77d06 2888 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2889 flags, NULL);
dfa77d06 2890 if (!s)
7fadf4a7
DM
2891 return 0;
2892
e322109a 2893 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2894 /* we can match based purely on the result of INTUIT.
2895 * Set up captures etc just for $& and $-[0]
2896 * (an intuit-only match wont have $1,$2,..) */
2897 assert(!prog->nparens);
d5e7783a
DM
2898
2899 /* s/// doesn't like it if $& is earlier than where we asked it to
2900 * start searching (which can happen on something like /.\G/) */
2901 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2902 && (s < stringarg))
2903 {
2904 /* this should only be possible under \G */
58430ea8 2905 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2906 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2907 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2908 goto phooey;
2909 }
2910
7fadf4a7
DM
2911 /* match via INTUIT shouldn't have any captures.
2912 * Let @-, @+, $^N know */
2913 prog->lastparen = prog->lastcloseparen = 0;
2914 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2915 prog->offs[0].start = s - strbeg;
2916 prog->offs[0].end = utf8_target
2917 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2918 : s - strbeg + prog->minlenret;
7fadf4a7 2919 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2920 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2921 strbeg, strend,
2922 sv, flags, utf8_target);
2923
7fadf4a7
DM
2924 return 1;
2925 }
2926 }
2927
6c3fea77 2928 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2929
dfa77d06 2930 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2931 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2932 "String too short [regexec_flags]...\n"));
2933 goto phooey;
1aa99e6b 2934 }
1de06328 2935
6eb5f6b9 2936 /* Check validity of program. */
f8fc2ecf 2937 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2938 Perl_croak(aTHX_ "corrupted regexp program");
2939 }
2940
1738e041 2941 RX_MATCH_TAINTED_off(rx);
ab4e48c1 2942 RX_MATCH_UTF8_set(rx, utf8_target);
1738e041 2943
6c3fea77
DM
2944 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2945 reginfo->intuit = 0;
2946 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2947 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2948 reginfo->warned = FALSE;
9d9163fb 2949 reginfo->strbeg = strbeg;
02d5137b 2950 reginfo->sv = sv;
1cb48e53 2951 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2952 reginfo->strend = strend;
6eb5f6b9 2953 /* see how far we have to get to not match where we matched before */
fe3974be 2954 reginfo->till = stringarg + minend;
6eb5f6b9 2955
60779a30 2956 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
2957 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2958 S_cleanup_regmatch_info_aux has executed (registered by
2959 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2960 magic belonging to this SV.
2961 Not newSVsv, either, as it does not COW.
2962 */
2963 reginfo->sv = newSV(0);
4cba5ac0 2964 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2965 SAVEFREESV(reginfo->sv);
2966 }
2967
331b2dcc
DM
2968 /* reserve next 2 or 3 slots in PL_regmatch_state:
2969 * slot N+0: may currently be in use: skip it
2970 * slot N+1: use for regmatch_info_aux struct
2971 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2972 * slot N+3: ready for use by regmatch()
2973 */
bf2039a9 2974
331b2dcc
DM
2975 {
2976 regmatch_state *old_regmatch_state;
2977 regmatch_slab *old_regmatch_slab;
2978 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2979
2980 /* on first ever match, allocate first slab */
2981 if (!PL_regmatch_slab) {
2982 Newx(PL_regmatch_slab, 1, regmatch_slab);
2983 PL_regmatch_slab->prev = NULL;
2984 PL_regmatch_slab->next = NULL;
2985 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2986 }
bf2039a9 2987
331b2dcc
DM
2988 old_regmatch_state = PL_regmatch_state;
2989 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2990
331b2dcc
DM
2991 for (i=0; i <= max; i++) {
2992 if (i == 1)
2993 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2994 else if (i ==2)
2995 reginfo->info_aux_eval =
2996 reginfo->info_aux->info_aux_eval =
2997 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2998
331b2dcc
DM
2999 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3000 PL_regmatch_state = S_push_slab(aTHX);
3001 }
bf2039a9 3002
331b2dcc
DM
3003 /* note initial PL_regmatch_state position; at end of match we'll
3004 * pop back to there and free any higher slabs */
bf2039a9 3005
331b2dcc
DM
3006 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3007 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3008 reginfo->info_aux->poscache = NULL;
bf2039a9 3009
331b2dcc 3010 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 3011
331b2dcc
DM
3012 if ((prog->extflags & RXf_EVAL_SEEN))
3013 S_setup_eval_state(aTHX_ reginfo);
3014 else
3015 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 3016 }
d3aa529c 3017
6eb5f6b9 3018 /* If there is a "must appear" string, look for it. */
6eb5f6b9 3019
288b8c02 3020 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
3021 /* We have to be careful. If the previous successful match
3022 was from this regex we don't want a subsequent partially
3023 successful match to clobber the old results.
3024 So when we detect this possibility we add a swap buffer
d8da0584
KW
3025 to the re, and switch the buffer each match. If we fail,
3026 we switch it back; otherwise we leave it swapped.
e9105d30
GG
3027 */
3028 swap = prog->offs;
3029 /* do we need a save destructor here for eval dies? */
3030 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
3031 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3032 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3033 PTR2UV(prog),
3034 PTR2UV(swap),
3035 PTR2UV(prog->offs)
3036 ));
c74340f9 3037 }
6eb5f6b9 3038
0fa70a06
DM
3039 /* Simplest case: anchored match need be tried only once, or with
3040 * MBOL, only at the beginning of each line.
3041 *
3042 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3043 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3044 * match at the start of the string then it won't match anywhere else
3045 * either; while with /.*.../, if it doesn't match at the beginning,
3046 * the earliest it could match is at the start of the next line */
3047
8e1490ee 3048 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
0fa70a06
DM
3049 char *end;
3050
3051 if (regtry(reginfo, &s))
6eb5f6b9 3052 goto got_it;
0fa70a06
DM
3053
3054 if (!(prog->intflags & PREGf_ANCH_MBOL))
3055 goto phooey;
3056
3057 /* didn't match at start, try at other newline positions */
3058
3059 if (minlen)
3060 dontbother = minlen - 1;
3061 end = HOP3c(strend, -dontbother, strbeg) - 1;
3062
3063 /* skip to next newline */
3064
3065 while (s <= end) { /* note it could be possible to match at the end of the string */
3066 /* NB: newlines are the same in unicode as they are in latin */
3067 if (*s++ != '\n')
3068 continue;
3069 if (prog->check_substr || prog->check_utf8) {
3070 /* note that with PREGf_IMPLICIT, intuit can only fail
3071 * or return the start position, so it's of limited utility.
3072 * Nevertheless, I made the decision that the potential for
3073 * quick fail was still worth it - DAPM */
3074 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3075 if (!s)
3076 goto phooey;
3077 }
3078 if (regtry(reginfo, &s))
3079 goto got_it;
3080 }
3081 goto phooey;
3082 } /* end anchored search */
3083
3084 if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 3085 {
a8430a8b
YO
3086 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3087 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
3088 /* For anchored \G, the only position it can match from is
3089 * (ganch-gofs); we already set startpos to this above; if intuit
3090 * moved us on from there, we can't possibly succeed */
3091 assert(startpos == reginfo->ganch - prog->gofs);
3092 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
3093 goto got_it;
3094 goto phooey;
3095 }
3096
3097 /* Messy cases: unanchored match. */
bbe252da 3098 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 3099 /* we have /x+whatever/ */
984e6dd1 3100 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 3101 char ch;
bf93d4cc
GS
3102#ifdef DEBUGGING
3103 int did_match = 0;
3104#endif
f2ed9b32 3105 if (utf8_target) {
7e0d5ad7
KW
3106 if (! prog->anchored_utf8) {
3107 to_utf8_substr(prog);
3108 }
3109 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 3110 REXEC_FBC_SCAN(
6eb5f6b9 3111 if (*s == ch) {
a3621e74 3112 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3113 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3114 s += UTF8SKIP(s);
3115 while (s < strend && *s == ch)
3116 s += UTF8SKIP(s);
3117 }
4cadc6a9 3118 );
7e0d5ad7 3119
6eb5f6b9
JH
3120 }
3121 else {
7e0d5ad7
KW
3122 if (! prog->anchored_substr) {
3123 if (! to_byte_substr(prog)) {
6b54ddc5 3124 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3125 }
3126 }
3127 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 3128 REXEC_FBC_SCAN(
6eb5f6b9 3129 if (*s == ch) {
a3621e74 3130 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3131 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3132 s++;
3133 while (s < strend && *s == ch)
3134 s++;
3135 }
4cadc6a9 3136 );
6eb5f6b9 3137 }
a3621e74 3138 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 3139 PerlIO_printf(Perl_debug_log,
b7953727
JH
3140 "Did not find anchored character...\n")
3141 );
6eb5f6b9 3142 }
a0714e2c
SS
3143 else if (prog->anchored_substr != NULL
3144 || prog->anchored_utf8 != NULL
3145 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
3146 && prog->float_max_offset < strend - s)) {
3147 SV *must;
ea3daa5d
FC
3148 SSize_t back_max;
3149 SSize_t back_min;
33b8afdf 3150 char *last;
6eb5f6b9 3151 char *last1; /* Last position checked before */
bf93d4cc
GS
3152#ifdef DEBUGGING
3153 int did_match = 0;
3154#endif
33b8afdf 3155 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
3156 if (utf8_target) {
3157 if (! prog->anchored_utf8) {
3158 to_utf8_substr(prog);
3159 }
3160 must = prog->anchored_utf8;
3161 }
3162 else {
3163 if (! prog->anchored_substr) {
3164 if (! to_byte_substr(prog)) {
6b54ddc5 3165 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3166 }
3167 }
3168 must = prog->anchored_substr;
3169 }
33b8afdf
JH
3170 back_max = back_min = prog->anchored_offset;
3171 } else {
7e0d5ad7
KW
3172 if (utf8_target) {
3173 if (! prog->float_utf8) {
3174 to_utf8_substr(prog);
3175 }
3176 must = prog->float_utf8;
3177 }
3178 else {
3179 if (! prog->float_substr) {
3180 if (! to_byte_substr(prog)) {
6b54ddc5 3181 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3182 }
3183 }
3184 must = prog->float_substr;
3185 }
33b8afdf
JH
3186 back_max = prog->float_max_offset;
3187 back_min = prog->float_min_offset;
3188 }
1de06328 3189
1de06328
YO
3190 if (back_min<0) {
3191 last = strend;
3192 } else {
3193 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 3194 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
3195 - (SvTAIL(must) != 0) + back_min), strbeg);
3196 }
9d9163fb 3197 if (s > reginfo->strbeg)
6eb5f6b9
JH
3198 last1 = HOPc(s, -1);
3199 else
3200 last1 = s - 1; /* bogus */
3201
a0288114 3202 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 3203 check_substr==must. */
bf05793b 3204 dontbother = 0;
6eb5f6b9
JH
3205 strend = HOPc(strend, -dontbother);
3206 while ( (s <= last) &&
e50d57d4 3207 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 3208 (unsigned char*)strend, must,
c33e64f0 3209 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 3210 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
3211 if (HOPc(s, -back_max) > last1) {
3212 last1 = HOPc(s, -back_min);
3213 s = HOPc(s, -back_max);
3214 }
3215 else {
9d9163fb
DM
3216 char * const t = (last1 >= reginfo->strbeg)
3217 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
3218
3219 last1 = HOPc(s, -back_min);
52657f30 3220 s = t;
6eb5f6b9 3221 }
f2ed9b32 3222 if (utf8_target) {
6eb5f6b9 3223 while (s <= last1) {
02d5137b 3224 if (regtry(reginfo, &s))
6eb5f6b9 3225 goto got_it;
7016d6eb
DM
3226 if (s >= last1) {
3227 s++; /* to break out of outer loop */
3228 break;
3229 }
3230 s += UTF8SKIP(s);
6eb5f6b9
JH
3231 }
3232 }
3233 else {
3234 while (s <= last1) {
02d5137b 3235 if (regtry(reginfo, &s))
6eb5f6b9
JH
3236 goto got_it;
3237 s++;
3238 }
3239 }
3240 }
ab3bbdeb 3241 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 3242 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
3243 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3244 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 3245 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 3246 ? "anchored" : "floating"),
ab3bbdeb
YO
3247 quoted, RE_SV_TAIL(must));
3248 });
6eb5f6b9
JH
3249 goto phooey;
3250 }
f8fc2ecf 3251 else if ( (c = progi->regstclass) ) {
f14c76ed 3252 if (minlen) {
f8fc2ecf 3253 const OPCODE op = OP(progi->regstclass);
66e933ab 3254 /* don't bother with what can't match */
33c28ab2 3255 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
f14c76ed
RGS
3256 strend = HOPc(strend, -(minlen - 1));
3257 }
a3621e74 3258 DEBUG_EXECUTE_r({
be8e71aa 3259 SV * const prop = sv_newmortal();
8b9781c9 3260 regprop(prog, prop, c, reginfo, NULL);
0df25f3d 3261 {
f2ed9b32 3262 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 3263 s,strend-s,60);
0df25f3d 3264 PerlIO_printf(Perl_debug_log,
1c8f8eb1 3265 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 3266 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 3267 quoted, (int)(strend - s));
0df25f3d 3268 }
ffc61ed2 3269 });
f9176b44 3270 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 3271 goto got_it;
07be1b83 3272 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
3273 }
3274 else {
3275 dontbother = 0;
a0714e2c 3276 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 3277 /* Trim the end. */
6af40bd7 3278 char *last= NULL;
33b8afdf 3279 SV* float_real;
c33e64f0
FC
3280 STRLEN len;
3281 const char *little;
33b8afdf 3282
7e0d5ad7
KW
3283 if (utf8_target) {
3284 if (! prog->float_utf8) {
3285 to_utf8_substr(prog);
3286 }
3287 float_real = prog->float_utf8;
3288 }
3289 else {
3290 if (! prog->float_substr) {
3291 if (! to_byte_substr(prog)) {
6b54ddc5 3292 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3293 }
3294 }
3295 float_real = prog->float_substr;
3296 }
d6a28714 3297
c33e64f0
FC
3298 little = SvPV_const(float_real, len);
3299 if (SvTAIL(float_real)) {
7f18ad16
KW
3300 /* This means that float_real contains an artificial \n on
3301 * the end due to the presence of something like this:
3302 * /foo$/ where we can match both "foo" and "foo\n" at the
3303 * end of the string. So we have to compare the end of the
3304 * string first against the float_real without the \n and
3305 * then against the full float_real with the string. We
3306 * have to watch out for cases where the string might be
3307 * smaller than the float_real or the float_real without
3308 * the \n. */
1a13b075
YO
3309 char *checkpos= strend - len;
3310 DEBUG_OPTIMISE_r(
3311 PerlIO_printf(Perl_debug_log,
3312 "%sChecking for float_real.%s\n",
3313 PL_colors[4], PL_colors[5]));
3314 if (checkpos + 1 < strbeg) {
7f18ad16
KW
3315 /* can't match, even if we remove the trailing \n
3316 * string is too short to match */
1a13b075
YO
3317 DEBUG_EXECUTE_r(
3318 PerlIO_printf(Perl_debug_log,
3319 "%sString shorter than required trailing substring, cannot match.%s\n",
3320 PL_colors[4], PL_colors[5]));
3321 goto phooey;
3322 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
3323 /* can match, the end of the string matches without the
3324 * "\n" */
1a13b075
YO
3325 last = checkpos + 1;
3326 } else if (checkpos < strbeg) {
7f18ad16
KW
3327 /* cant match, string is too short when the "\n" is
3328 * included */
1a13b075
YO
3329 DEBUG_EXECUTE_r(
3330 PerlIO_printf(Perl_debug_log,
3331 "%sString does not contain required trailing substring, cannot match.%s\n",
3332 PL_colors[4], PL_colors[5]));
3333 goto phooey;
3334 } else if (!multiline) {
7f18ad16
KW
3335 /* non multiline match, so compare with the "\n" at the
3336 * end of the string */
1a13b075
YO
3337 if (memEQ(checkpos, little, len)) {
3338 last= checkpos;
3339 } else {
3340 DEBUG_EXECUTE_r(
3341 PerlIO_printf(Perl_debug_log,
3342 "%sString does not contain required trailing substring, cannot match.%s\n",
3343 PL_colors[4], PL_colors[5]));
3344 goto phooey;
3345 }
3346 } else {
7f18ad16
KW
3347 /* multiline match, so we have to search for a place
3348 * where the full string is located */
d6a28714 3349 goto find_last;
1a13b075 3350 }
c33e64f0 3351 } else {
d6a28714 3352 find_last:
9041c2e3 3353 if (len)
d6a28714 3354 last = rninstr(s, strend, little, little + len);
b8c5462f 3355 else
a0288114 3356 last = strend; /* matching "$" */
b8c5462f 3357 }
6af40bd7 3358 if (!last) {
7f18ad16
KW
3359 /* at one point this block contained a comment which was
3360 * probably incorrect, which said that this was a "should not
3361 * happen" case. Even if it was true when it was written I am
3362 * pretty sure it is not anymore, so I have removed the comment
3363 * and replaced it with this one. Yves */
6bda09f9
YO
3364 DEBUG_EXECUTE_r(
3365 PerlIO_printf(Perl_debug_log,
b729e729
YO
3366 "%sString does not contain required substring, cannot match.%s\n",
3367 PL_colors[4], PL_colors[5]
6af40bd7
YO
3368 ));
3369 goto phooey;
bf93d4cc 3370 }
d6a28714
JH
3371 dontbother = strend - last + prog->float_min_offset;
3372 }
3373 if (minlen && (dontbother < minlen))
3374 dontbother = minlen - 1;
3375 strend -= dontbother; /* this one's always in bytes! */
3376 /* We don't know much -- general case. */
f2ed9b32 3377 if (utf8_target) {
d6a28714 3378 for (;;) {
02d5137b 3379 if (regtry(reginfo, &s))
d6a28714
JH
3380 goto got_it;
3381 if (s >= strend)
3382 break;
b8c5462f 3383 s += UTF8SKIP(s);
d6a28714
JH
3384 };
3385 }
3386 else {
3387 do {
02d5137b 3388 if (regtry(reginfo, &s))
d6a28714
JH
3389 goto got_it;
3390 } while (s++ < strend);
3391 }
3392 }
3393
3394 /* Failure. */
3395 goto phooey;
3396
7b52d656 3397 got_it:
d5e7783a
DM
3398 /* s/// doesn't like it if $& is earlier than where we asked it to
3399 * start searching (which can happen on something like /.\G/) */
3400 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3401 && (prog->offs[0].start < stringarg - strbeg))
3402 {
3403 /* this should only be possible under \G */
58430ea8 3404 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3405 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3406 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3407 goto phooey;
3408 }
3409
495f47a5
DM
3410 DEBUG_BUFFERS_r(
3411 if (swap)
3412 PerlIO_printf(Perl_debug_log,
3413 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3414 PTR2UV(prog),
3415 PTR2UV(swap)
3416 );
3417 );
e9105d30 3418 Safefree(swap);
d6a28714 3419
bf2039a9
DM
3420 /* clean up; this will trigger destructors that will free all slabs
3421 * above the current one, and cleanup the regmatch_info_aux
3422 * and regmatch_info_aux_eval sructs */
8adc0f72 3423
006f26b2
DM
3424 LEAVE_SCOPE(oldsave);
3425
5daac39c
NC
3426 if (RXp_PAREN_NAMES(prog))
3427 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
3428
3429 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3430 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3431 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3432 strbeg, reginfo->strend,
3433 sv, flags, utf8_target);
9041c2e3 3434
d6a28714
JH
3435 return 1;
3436
7b52d656 3437 phooey:
a3621e74 3438 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3439 PL_colors[4], PL_colors[5]));
8adc0f72 3440
bf2039a9
DM
3441 /* clean up; this will trigger destructors that will free all slabs
3442 * above the current one, and cleanup the regmatch_info_aux
3443 * and regmatch_info_aux_eval sructs */
8adc0f72 3444
006f26b2
DM
3445 LEAVE_SCOPE(oldsave);
3446
e9105d30 3447 if (swap) {
c74340f9 3448 /* we failed :-( roll it back */
495f47a5
DM
3449 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3450 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3451 PTR2UV(prog),
3452 PTR2UV(prog->offs),
3453 PTR2UV(swap)
3454 ));
e9105d30
GG
3455 Safefree(prog->offs);
3456 prog->offs = swap;
3457 }
d6a28714
JH
3458 return 0;
3459}
3460
6bda09f9 3461
b3d298be 3462/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3463 * Do inc before dec, in case old and new rex are the same */
baa60164 3464#define SET_reg_curpm(Re2) \
bf2039a9 3465 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3466 (void)ReREFCNT_inc(Re2); \
3467 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3468 PM_SETRE((PL_reg_curpm), (Re2)); \
3469 }
3470
3471
d6a28714
JH
3472/*
3473 - regtry - try match at specific point
3474 */
63f46dab 3475STATIC bool /* 0 failure, 1 success */
f73aaa43 3476S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3477{
d6a28714 3478 CHECKPOINT lastcp;
288b8c02 3479 REGEXP *const rx = reginfo->prog;
8d919b0a 3480 regexp *const prog = ReANY(rx);
99a90e59 3481 SSize_t result;
f8fc2ecf 3482 RXi_GET_DECL(prog,progi);
a3621e74 3483 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3484
3485 PERL_ARGS_ASSERT_REGTRY;
3486
24b23f37 3487 reginfo->cutpoint=NULL;
d6a28714 3488
9d9163fb 3489 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3490 prog->lastparen = 0;
03994de8 3491 prog->lastcloseparen = 0;
d6a28714
JH
3492
3493 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3494 to do this again and again, prog->lastparen should take care of
3dd2943c 3495 this! --ilya*/
dafc8851
JH
3496
3497 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3498 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3499 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3500 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3501 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3502 * Meanwhile, this code *is* needed for the
daf18116
JH
3503 * above-mentioned test suite tests to succeed. The common theme
3504 * on those tests seems to be returning null fields from matches.
225593e1 3505 * --jhi updated by dapm */
dafc8851 3506#if 1
d6a28714 3507 if (prog->nparens) {
b93070ed 3508 regexp_paren_pair *pp = prog->offs;
eb578fdb 3509 I32 i;
b93070ed 3510 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3511 ++pp;
3512 pp->start = -1;
3513 pp->end = -1;
d6a28714
JH
3514 }
3515 }