This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Avoid some branches
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
95static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
7631e439
KW
99/* Returns a boolean as to whether the input unsigned number is a power of 2
100 * (2**0, 2**1, etc). In other words if it has just a single bit set.
101 * If not, subtracting 1 would leave the uppermost bit set, so the & would
102 * yield non-zero */
103#define isPOWER_OF_2(n) ((n & (n-1)) == 0)
104
7b031478 105#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
6ad9a8ab 106 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
7b031478 107 goto target; \
e1cf74e3
CB
108} STMT_END
109
c74f6de9
KW
110#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
111
a687059c
LW
112#ifndef STATIC
113#define STATIC static
114#endif
115
451c6e0b
KW
116/* Valid only if 'c', the character being looke-up, is an invariant under
117 * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
118 * everything matchable is straight forward in the bitmap */
119#define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
120 ? reginclass(prog,p,c,c+1,u) \
121 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 122
c277df42
IZ
123/*
124 * Forwards.
125 */
126
f2ed9b32 127#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 128
3dab1dad 129#define HOPc(pos,off) \
ba44c216 130 (char *)(reginfo->is_utf8_target \
220db18a 131 ? reghop3((U8*)pos, off, \
9d9163fb 132 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 133 : (U8*)(pos + off))
557f47af 134
bb152a4b
DM
135/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
136#define HOPBACK3(pos, off, lim) \
137 (reginfo->is_utf8_target \
138 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
139 : (pos - off >= lim) \
140 ? (U8*)pos - off \
3dab1dad 141 : NULL)
efb30f32 142
bb152a4b
DM
143#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
144
ba44c216 145#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 146#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 147
557f47af
DM
148/* lim must be +ve. Returns NULL on overshoot */
149#define HOPMAYBE3(pos,off,lim) \
150 (reginfo->is_utf8_target \
151 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
152 : ((U8*)pos + off <= lim) \
153 ? (U8*)pos + off \
154 : NULL)
155
8e9f2289
DM
156/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
157 * off must be >=0; args should be vars rather than expressions */
158#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
159 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
160 : (U8*)((pos + off) > lim ? lim : (pos + off)))
67853908 161#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
8e9f2289 162
2974eaec
DM
163#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
164 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
165 : (U8*)(pos + off))
166#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
167
168#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
169#define NEXTCHR_IS_EOS (nextchr < 0)
170
171#define SET_nextchr \
220db18a 172 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
173
174#define SET_locinput(p) \
175 locinput = (p); \
176 SET_nextchr
177
178
2a16ac92 179#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
180 if (!swash_ptr) { \
181 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 182 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 183 1, 0, invlist, &flags); \
c7304fe2
KW
184 assert(swash_ptr); \
185 } \
186 } STMT_END
187
188/* If in debug mode, we test that a known character properly matches */
189#ifdef DEBUGGING
190# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
191 property_name, \
2a16ac92 192 invlist, \
c7304fe2 193 utf8_char_in_property) \
2a16ac92 194 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
195 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
196#else
197# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
198 property_name, \
2a16ac92 199 invlist, \
c7304fe2 200 utf8_char_in_property) \
2a16ac92 201 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 202#endif
d1eb3177 203
c7304fe2
KW
204#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
205 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
206 "", \
207 PL_XPosix_ptrs[_CC_WORDCHAR], \
0766489e 208 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
c7304fe2 209
c7304fe2 210#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
211/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
212
5f80c4cf 213/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
214/* it would be nice to rework regcomp.sym to generate this stuff. sigh
215 *
216 * NOTE that *nothing* that affects backtracking should be in here, specifically
217 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
218 * node that is in between two EXACT like nodes when ascertaining what the required
219 * "follow" character is. This should probably be moved to regex compile time
220 * although it may be done at run time beause of the REF possibility - more
221 * investigation required. -- demerphq
222*/
baa60164
KW
223#define JUMPABLE(rn) ( \
224 OP(rn) == OPEN || \
24be3102
YO
225 (OP(rn) == CLOSE && \
226 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
baa60164
KW
227 OP(rn) == EVAL || \
228 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
229 OP(rn) == PLUS || OP(rn) == MINMOD || \
230 OP(rn) == KEEPS || \
231 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 232)
ee9b8eae 233#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 234
ee9b8eae
YO
235#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
236
237#if 0
238/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
a4525e78 239 we don't need this definition. XXX These are now out-of-sync*/
ee9b8eae 240#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 241#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
242#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
243
244#else
245/* ... so we use this as its faster. */
a4525e78
KW
246#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
247#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
248#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
249#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
250
251#endif
e2d8ce26 252
a84d97b6
HS
253/*
254 Search for mandatory following text node; for lookahead, the text must
255 follow but for lookbehind (rn->flags != 0) we skip to the next step.
256*/
baa60164 257#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
258 while (JUMPABLE(rn)) { \
259 const OPCODE type = OP(rn); \
260 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 261 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 262 else if (type == PLUS) \
cca55fe3 263 rn = NEXTOPER(rn); \
3dab1dad 264 else if (type == IFMATCH) \
a84d97b6 265 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 266 else rn += NEXT_OFF(rn); \
3dab1dad 267 } \
5f80c4cf 268} STMT_END
74750237 269
006f26b2
DM
270#define SLAB_FIRST(s) (&(s)->states[0])
271#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
272
a75351a1 273static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 274static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 275static regmatch_state * S_push_slab(pTHX);
51371543 276
87c0511b 277#define REGCP_PAREN_ELEMS 3
f067efbf 278#define REGCP_OTHER_ELEMS 3
e0fa7e2b 279#define REGCP_FRAME_ELEMS 1
620d5b66
NC
280/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
281 * are needed for the regexp context stack bookkeeping. */
282
76e3520e 283STATIC CHECKPOINT
21553840 284S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
a0d0e21e 285{
a3b680e6 286 const int retval = PL_savestack_ix;
92da3157
DM
287 const int paren_elems_to_push =
288 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
289 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
290 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 291 I32 p;
40a82448 292 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 293
b93070ed
DM
294 PERL_ARGS_ASSERT_REGCPPUSH;
295
e49a9654 296 if (paren_elems_to_push < 0)
e8a85d26
JH
297 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
298 (int)paren_elems_to_push, (int)maxopenparen,
299 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 300
e0fa7e2b 301 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
147e3846 302 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
5df417d0 303 " out of range (%lu-%ld)",
92da3157
DM
304 total_elems,
305 (unsigned long)maxopenparen,
306 (long)parenfloor);
e0fa7e2b 307
620d5b66 308 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 309
495f47a5 310 DEBUG_BUFFERS_r(
92da3157 311 if ((int)maxopenparen > (int)parenfloor)
2b1a3689 312 Perl_re_exec_indentf( aTHX_
147e3846 313 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
2b1a3689
YO
314 depth,
315 PTR2UV(rex),
495f47a5
DM
316 PTR2UV(rex->offs)
317 );
318 );
92da3157 319 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 320/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
321 SSPUSHIV(rex->offs[p].end);
322 SSPUSHIV(rex->offs[p].start);
1ca2007e 323 SSPUSHINT(rex->offs[p].start_tmp);
2b1a3689 324 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 325 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
2b1a3689
YO
326 depth,
327 (UV)p,
495f47a5
DM
328 (IV)rex->offs[p].start,
329 (IV)rex->offs[p].start_tmp,
330 (IV)rex->offs[p].end
40a82448 331 ));
a0d0e21e 332 }
b1ce53c5 333/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 334 SSPUSHINT(maxopenparen);
b93070ed
DM
335 SSPUSHINT(rex->lastparen);
336 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 337 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 338
a0d0e21e
LW
339 return retval;
340}
341
c277df42 342/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
343#define REGCP_SET(cp) \
344 DEBUG_STATE_r( \
147e3846
KW
345 Perl_re_exec_indentf( aTHX_ \
346 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
cb41e5d6
YO
347 depth, (IV)PL_savestack_ix \
348 ) \
349 ); \
ab3bbdeb 350 cp = PL_savestack_ix
c3464db5 351
ab3bbdeb 352#define REGCP_UNWIND(cp) \
e4f74956 353 DEBUG_STATE_r( \
cb41e5d6 354 if (cp != PL_savestack_ix) \
147e3846
KW
355 Perl_re_exec_indentf( aTHX_ \
356 "Clearing an EVAL scope, savestack=%" \
357 IVdf "..%" IVdf "\n", \
cb41e5d6
YO
358 depth, (IV)(cp), (IV)PL_savestack_ix \
359 ) \
360 ); \
ab3bbdeb 361 regcpblow(cp)
c277df42 362
a8d1f4b4
DM
363#define UNWIND_PAREN(lp, lcp) \
364 for (n = rex->lastparen; n > lp; n--) \
365 rex->offs[n].end = -1; \
366 rex->lastparen = n; \
367 rex->lastcloseparen = lcp;
368
369
f067efbf 370STATIC void
21553840 371S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
a0d0e21e 372{
e0fa7e2b 373 UV i;
87c0511b 374 U32 paren;
a3621e74
YO
375 GET_RE_DEBUG_FLAGS_DECL;
376
7918f24d
NC
377 PERL_ARGS_ASSERT_REGCPPOP;
378
b1ce53c5 379 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 380 i = SSPOPUV;
e0fa7e2b
NC
381 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
382 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
383 rex->lastcloseparen = SSPOPINT;
384 rex->lastparen = SSPOPINT;
92da3157 385 *maxopenparen_p = SSPOPINT;
b1ce53c5 386
620d5b66 387 i -= REGCP_OTHER_ELEMS;
b1ce53c5 388 /* Now restore the parentheses context. */
495f47a5
DM
389 DEBUG_BUFFERS_r(
390 if (i || rex->lastparen + 1 <= rex->nparens)
2b1a3689 391 Perl_re_exec_indentf( aTHX_
147e3846 392 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
2b1a3689
YO
393 depth,
394 PTR2UV(rex),
495f47a5
DM
395 PTR2UV(rex->offs)
396 );
397 );
92da3157 398 paren = *maxopenparen_p;
620d5b66 399 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 400 SSize_t tmps;
1ca2007e 401 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
402 rex->offs[paren].start = SSPOPIV;
403 tmps = SSPOPIV;
b93070ed
DM
404 if (paren <= rex->lastparen)
405 rex->offs[paren].end = tmps;
2b1a3689 406 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 407 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
2b1a3689
YO
408 depth,
409 (UV)paren,
495f47a5
DM
410 (IV)rex->offs[paren].start,
411 (IV)rex->offs[paren].start_tmp,
412 (IV)rex->offs[paren].end,
413 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 414 );
87c0511b 415 paren--;
a0d0e21e 416 }
daf18116 417#if 1
dafc8851
JH
418 /* It would seem that the similar code in regtry()
419 * already takes care of this, and in fact it is in
420 * a better location to since this code can #if 0-ed out
421 * but the code in regtry() is needed or otherwise tests
422 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
423 * (as of patchlevel 7877) will fail. Then again,
424 * this code seems to be necessary or otherwise
225593e1
DM
425 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
426 * --jhi updated by dapm */
b93070ed 427 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 428 if (i > *maxopenparen_p)
b93070ed
DM
429 rex->offs[i].start = -1;
430 rex->offs[i].end = -1;
2b1a3689 431 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 432 " \\%" UVuf ": %s ..-1 undeffing\n",
2b1a3689
YO
433 depth,
434 (UV)i,
92da3157 435 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 436 ));
a0d0e21e 437 }
dafc8851 438#endif
a0d0e21e
LW
439}
440
74088413
DM
441/* restore the parens and associated vars at savestack position ix,
442 * but without popping the stack */
443
444STATIC void
21553840 445S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
74088413
DM
446{
447 I32 tmpix = PL_savestack_ix;
85882954
YO
448 PERL_ARGS_ASSERT_REGCP_RESTORE;
449
74088413 450 PL_savestack_ix = ix;
21553840 451 regcppop(rex, maxopenparen_p);
74088413
DM
452 PL_savestack_ix = tmpix;
453}
454
02db2b7b 455#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 456
9637d2a5
CB
457#ifndef PERL_IN_XSUB_RE
458
24e16d7b
KW
459bool
460Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
31c7f561
KW
461{
462 /* Returns a boolean as to whether or not 'character' is a member of the
463 * Posix character class given by 'classnum' that should be equivalent to a
464 * value in the typedef '_char_class_number'.
465 *
466 * Ideally this could be replaced by a just an array of function pointers
467 * to the C library functions that implement the macros this calls.
468 * However, to compile, the precise function signatures are required, and
469 * these may vary from platform to to platform. To avoid having to figure
470 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
471 * which adds an extra layer of function call overhead (unless the C
472 * optimizer strips it away). But we don't particularly care about
473 * performance with locales anyway. */
31c7f561
KW
474
475 switch ((_char_class_number) classnum) {
15861f94 476 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 477 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
478 case _CC_ENUM_ASCII: return isASCII_LC(character);
479 case _CC_ENUM_BLANK: return isBLANK_LC(character);
cee69f79 480 case _CC_ENUM_CASED: return isLOWER_LC(character)
b0d691b2 481 || isUPPER_LC(character);
e8d596e0 482 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
483 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
484 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
485 case _CC_ENUM_LOWER: return isLOWER_LC(character);
486 case _CC_ENUM_PRINT: return isPRINT_LC(character);
487 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 488 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
489 case _CC_ENUM_UPPER: return isUPPER_LC(character);
490 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 491 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
492 default: /* VERTSPACE should never occur in locales */
493 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
494 }
495
e5964223 496 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
497 return FALSE;
498}
499
9637d2a5
CB
500#endif
501
3018b823
KW
502STATIC bool
503S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
504{
505 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
506 * 'character' is a member of the Posix character class given by 'classnum'
507 * that should be equivalent to a value in the typedef
508 * '_char_class_number'.
509 *
510 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 511 * the range 0-255. Outside that range, all characters use Unicode
3018b823
KW
512 * rules, ignoring any locale. So use the Unicode function if this class
513 * requires a swash, and use the Unicode macro otherwise. */
514
515 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
516
517 if (UTF8_IS_INVARIANT(*character)) {
518 return isFOO_lc(classnum, *character);
519 }
520 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
521 return isFOO_lc(classnum,
a62b247b 522 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
523 }
524
613abc6d
KW
525 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
526
3018b823
KW
527 if (classnum < _FIRST_NON_SWASH_CC) {
528
529 /* Initialize the swash unless done already */
530 if (! PL_utf8_swash_ptrs[classnum]) {
531 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
532 PL_utf8_swash_ptrs[classnum] =
533 _core_swash_init("utf8",
534 "",
535 &PL_sv_undef, 1, 0,
536 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
537 }
538
92a2046b
KW
539 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
540 character,
541 TRUE /* is UTF */ ));
3018b823
KW
542 }
543
544 switch ((_char_class_number) classnum) {
779cf272 545 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
546 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
547 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
548 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 549 default: break;
3018b823
KW
550 }
551
e1ee3960 552 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
553}
554
aff4cafe
KW
555STATIC char *
556S_find_next_ascii(char * s, const char * send, const bool utf8_target)
557{
558 /* Returns the position of the first ASCII byte in the sequence between 's'
559 * and 'send-1' inclusive; returns 'send' if none found */
560
561 PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
562
563#ifndef EBCDIC
564
565 if ((STRLEN) (send - s) >= PERL_WORDSIZE
566
567 /* This term is wordsize if subword; 0 if not */
568 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
569
570 /* 'offset' */
571 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
572 {
573
574 /* Process per-byte until reach word boundary. XXX This loop could be
575 * eliminated if we knew that this platform had fast unaligned reads */
576 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
577 if (isASCII(*s)) {
578 return s;
579 }
580 s++; /* khw didn't bother creating a separate loop for
581 utf8_target */
582 }
583
584 /* Here, we know we have at least one full word to process. Process
585 * per-word as long as we have at least a full word left */
586 do {
1d2af574
KW
587 PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
588 if (complemented & PERL_VARIANTS_WORD_MASK) {
589
590#if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
591 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
592
593 s += _variant_byte_number(complemented);
594 return s;
595
596#else /* If weird byte order, drop into next loop to do byte-at-a-time
597 checks. */
598
aff4cafe 599 break;
1d2af574 600#endif
aff4cafe 601 }
1d2af574 602
aff4cafe 603 s += PERL_WORDSIZE;
1d2af574 604
aff4cafe
KW
605 } while (s + PERL_WORDSIZE <= send);
606 }
607
608#endif
609
610 /* Process per-character */
611 if (utf8_target) {
612 while (s < send) {
613 if (isASCII(*s)) {
614 return s;
615 }
616 s += UTF8SKIP(s);
617 }
618 }
619 else {
620 while (s < send) {
621 if (isASCII(*s)) {
622 return s;
623 }
624 s++;
625 }
626 }
627
628 return s;
629}
630
631STATIC char *
632S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
633{
634 /* Returns the position of the first non-ASCII byte in the sequence between
635 * 's' and 'send-1' inclusive; returns 'send' if none found */
636
637#ifdef EBCDIC
638
639 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
640
641 if (utf8_target) {
642 while (s < send) {
643 if ( ! isASCII(*s)) {
644 return s;
645 }
646 s += UTF8SKIP(s);
647 }
648 }
649 else {
650 while (s < send) {
651 if ( ! isASCII(*s)) {
652 return s;
653 }
654 s++;
655 }
656 }
657
658 return s;
659
660#else
661
662 const U8 * next_non_ascii = NULL;
663
664 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
665 PERL_UNUSED_ARG(utf8_target);
666
667 /* On ASCII platforms invariants and ASCII are identical, so if the string
668 * is entirely invariants, there is no non-ASCII character */
669 return (is_utf8_invariant_string_loc((U8 *) s,
670 (STRLEN) (send - s),
671 &next_non_ascii))
672 ? (char *) send
673 : (char *) next_non_ascii;
674
675#endif
676
677}
678
a687059c 679/*
e50aee73 680 * pregexec and friends
a687059c
LW
681 */
682
76234dfb 683#ifndef PERL_IN_XSUB_RE
a687059c 684/*
c277df42 685 - pregexec - match a regexp against a string
a687059c 686 */
c277df42 687I32
5aaab254 688Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 689 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
690/* stringarg: the point in the string at which to begin matching */
691/* strend: pointer to null at end of string */
692/* strbeg: real beginning of string */
693/* minend: end of match must be >= minend bytes after stringarg. */
694/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
695 * itself is accessed via the pointers above */
696/* nosave: For optimizations. */
c277df42 697{
7918f24d
NC
698 PERL_ARGS_ASSERT_PREGEXEC;
699
c277df42 700 return
9041c2e3 701 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
702 nosave ? 0 : REXEC_COPY_STR);
703}
76234dfb 704#endif
22e551b9 705
cad2e5aa 706
6eb5f6b9 707
1a4edc3c
DM
708/* re_intuit_start():
709 *
710 * Based on some optimiser hints, try to find the earliest position in the
711 * string where the regex could match.
712 *
713 * rx: the regex to match against
714 * sv: the SV being matched: only used for utf8 flag; the string
715 * itself is accessed via the pointers below. Note that on
716 * something like an overloaded SV, SvPOK(sv) may be false
717 * and the string pointers may point to something unrelated to
718 * the SV itself.
719 * strbeg: real beginning of string
720 * strpos: the point in the string at which to begin matching
721 * strend: pointer to the byte following the last char of the string
722 * flags currently unused; set to 0
723 * data: currently unused; set to NULL
724 *
725 * The basic idea of re_intuit_start() is to use some known information
726 * about the pattern, namely:
727 *
728 * a) the longest known anchored substring (i.e. one that's at a
729 * constant offset from the beginning of the pattern; but not
730 * necessarily at a fixed offset from the beginning of the
731 * string);
732 * b) the longest floating substring (i.e. one that's not at a constant
733 * offset from the beginning of the pattern);
734 * c) Whether the pattern is anchored to the string; either
735 * an absolute anchor: /^../, or anchored to \n: /^.../m,
736 * or anchored to pos(): /\G/;
737 * d) A start class: a real or synthetic character class which
738 * represents which characters are legal at the start of the pattern;
739 *
740 * to either quickly reject the match, or to find the earliest position
741 * within the string at which the pattern might match, thus avoiding
742 * running the full NFA engine at those earlier locations, only to
743 * eventually fail and retry further along.
744 *
745 * Returns NULL if the pattern can't match, or returns the address within
746 * the string which is the earliest place the match could occur.
747 *
748 * The longest of the anchored and floating substrings is called 'check'
749 * and is checked first. The other is called 'other' and is checked
750 * second. The 'other' substring may not be present. For example,
751 *
752 * /(abc|xyz)ABC\d{0,3}DEFG/
753 *
754 * will have
755 *
756 * check substr (float) = "DEFG", offset 6..9 chars
757 * other substr (anchored) = "ABC", offset 3..3 chars
758 * stclass = [ax]
759 *
760 * Be aware that during the course of this function, sometimes 'anchored'
761 * refers to a substring being anchored relative to the start of the
762 * pattern, and sometimes to the pattern itself being anchored relative to
763 * the string. For example:
764 *
765 * /\dabc/: "abc" is anchored to the pattern;
766 * /^\dabc/: "abc" is anchored to the pattern and the string;
767 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
768 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
769 * but the pattern is anchored to the string.
52a21eb3
DM
770 */
771
cad2e5aa 772char *
52a21eb3
DM
773Perl_re_intuit_start(pTHX_
774 REGEXP * const rx,
775 SV *sv,
776 const char * const strbeg,
777 char *strpos,
778 char *strend,
779 const U32 flags,
780 re_scream_pos_data *data)
cad2e5aa 781{
8d919b0a 782 struct regexp *const prog = ReANY(rx);
b2ad2123
DM
783 SSize_t start_shift = prog->check_offset_min;
784 /* Should be nonnegative! */
785 SSize_t end_shift = 0;
0fc004dd
DM
786 /* current lowest pos in string where the regex can start matching */
787 char *rx_origin = strpos;
eb578fdb 788 SV *check;
f2ed9b32 789 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 790 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 791 bool ml_anch = 0;
8f4bf5fc 792 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 793 char *check_at = NULL; /* check substr found at this pos */
bbe252da 794 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 795 RXi_GET_DECL(prog,progi);
02d5137b
DM
796 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
797 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
798 GET_RE_DEBUG_FLAGS_DECL;
799
7918f24d 800 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
801 PERL_UNUSED_ARG(flags);
802 PERL_UNUSED_ARG(data);
7918f24d 803
6ad9a8ab 804 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0
DM
805 "Intuit: trying to determine minimum start position...\n"));
806
fb9bbddb 807 /* for now, assume that all substr offsets are positive. If at some point
f67a5002 808 * in the future someone wants to do clever things with lookbehind and
fb9bbddb
DM
809 * -ve offsets, they'll need to fix up any code in this function
810 * which uses these offsets. See the thread beginning
811 * <20140113145929.GF27210@iabyn.com>
812 */
813 assert(prog->substrs->data[0].min_offset >= 0);
814 assert(prog->substrs->data[0].max_offset >= 0);
815 assert(prog->substrs->data[1].min_offset >= 0);
816 assert(prog->substrs->data[1].max_offset >= 0);
817 assert(prog->substrs->data[2].min_offset >= 0);
818 assert(prog->substrs->data[2].max_offset >= 0);
819
f7022b5a 820 /* for now, assume that if both present, that the floating substring
83f2232d 821 * doesn't start before the anchored substring.
f7022b5a
DM
822 * If you break this assumption (e.g. doing better optimisations
823 * with lookahead/behind), then you'll need to audit the code in this
824 * function carefully first
825 */
826 assert(
827 ! ( (prog->anchored_utf8 || prog->anchored_substr)
828 && (prog->float_utf8 || prog->float_substr))
829 || (prog->float_min_offset >= prog->anchored_offset));
830
1a4edc3c
DM
831 /* byte rather than char calculation for efficiency. It fails
832 * to quickly reject some cases that can't match, but will reject
833 * them later after doing full char arithmetic */
c344f387 834 if (prog->minlen > strend - strpos) {
6ad9a8ab 835 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 836 " String too short...\n"));
cad2e5aa 837 goto fail;
2c2d71f5 838 }
d8da0584 839
196a02af 840 RXp_MATCH_UTF8_set(prog, utf8_target);
6c3fea77 841 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 842 reginfo->info_aux = NULL;
9d9163fb 843 reginfo->strbeg = strbeg;
220db18a 844 reginfo->strend = strend;
aed7b151 845 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 846 reginfo->intuit = 1;
1cb48e53
DM
847 /* not actually used within intuit, but zero for safety anyway */
848 reginfo->poscache_maxiter = 0;
02d5137b 849
f2ed9b32 850 if (utf8_target) {
2814f4b3
HS
851 if ((!prog->anchored_utf8 && prog->anchored_substr)
852 || (!prog->float_utf8 && prog->float_substr))
33b8afdf
JH
853 to_utf8_substr(prog);
854 check = prog->check_utf8;
855 } else {
7e0d5ad7
KW
856 if (!prog->check_substr && prog->check_utf8) {
857 if (! to_byte_substr(prog)) {
6b54ddc5 858 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
859 }
860 }
33b8afdf
JH
861 check = prog->check_substr;
862 }
274cd312 863
1dc475d0
DM
864 /* dump the various substring data */
865 DEBUG_OPTIMISE_MORE_r({
866 int i;
867 for (i=0; i<=2; i++) {
868 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
869 : prog->substrs->data[i].substr);
870 if (!sv)
871 continue;
872
6ad9a8ab 873 Perl_re_printf( aTHX_
147e3846
KW
874 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
875 " useful=%" IVdf " utf8=%d [%s]\n",
1dc475d0
DM
876 i,
877 (IV)prog->substrs->data[i].min_offset,
878 (IV)prog->substrs->data[i].max_offset,
879 (IV)prog->substrs->data[i].end_shift,
880 BmUSEFUL(sv),
881 utf8_target ? 1 : 0,
882 SvPEEK(sv));
883 }
884 });
885
8e1490ee 886 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
887
888 /* ml_anch: check after \n?
889 *
0fa70a06 890 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
891 * with /.*.../, these flags will have been added by the
892 * compiler:
893 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
894 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
895 */
7d2d37f5
DM
896 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
897 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 898
343c8a29 899 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
900 /* we are only allowed to match at BOS or \G */
901
57fcbfa7 902 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
903 *
904 * Note that we don't try to do a similar quick reject for
905 * \G, since generally the caller will have calculated strpos
906 * based on pos() and gofs, so the string is already correctly
907 * anchored by definition; and handling the exceptions would
908 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 909 */
7bb3b9eb 910 if ( strpos != strbeg
d3d47aac 911 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8 912 {
6ad9a8ab 913 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 914 " Not at start...\n"));
c889ccc8
DM
915 goto fail;
916 }
917
a5d12a4b
DM
918 /* in the presence of an anchor, the anchored (relative to the
919 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
920 * to strpos. So quickly reject if substr isn't found there.
921 * This works for \G too, because the caller will already have
922 * subtracted gofs from pos, and gofs is the offset from the
923 * \G to the start of the regex. For example, in /.abc\Gdef/,
924 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
925 * caller will have set strpos=pos()-4; we look for the substr
926 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 927
33c28ab2 928 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 929 /* Substring at constant offset from beg-of-str... */
b2ad2123 930 SSize_t slen = SvCUR(check);
343c8a29 931 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 932
6ad9a8ab 933 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 934 " Looking for check substr at fixed offset %" IVdf "...\n",
1dc475d0
DM
935 (IV)prog->check_offset_min));
936
7742aa66
DM
937 if (SvTAIL(check)) {
938 /* In this case, the regex is anchored at the end too.
939 * Unless it's a multiline match, the lengths must match
b2ad2123 940 * exactly, give or take a \n. NB: slen >= 1 since
7742aa66
DM
941 * the last char of check is \n */
942 if (!multiline
b2ad2123
DM
943 && ( strend - s > slen
944 || strend - s < slen - 1
945 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8 946 {
6ad9a8ab 947 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 948 " String too long...\n"));
c889ccc8
DM
949 goto fail_finish;
950 }
b2ad2123
DM
951 /* Now should match s[0..slen-2] */
952 slen--;
c889ccc8 953 }
b2ad2123 954 if (slen && (strend - s < slen
26fb2318 955 || *SvPVX_const(check) != *s
b2ad2123 956 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
d307bf57 957 {
6ad9a8ab 958 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 959 " String not equal...\n"));
d307bf57
DM
960 goto fail_finish;
961 }
c889ccc8
DM
962
963 check_at = s;
964 goto success_at_start;
cad2e5aa 965 }
cad2e5aa 966 }
cad2e5aa 967 }
0fc004dd 968
b2ad2123 969 end_shift = prog->check_end_shift;
cad2e5aa 970
19188028 971#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
b2ad2123 972 if (end_shift < 0)
147e3846 973 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
b2ad2123 974 (IV)end_shift, RX_PRECOMP(rx));
2c2d71f5
JH
975#endif
976
2c2d71f5 977 restart:
1de06328 978
66b7ec5c
DM
979 /* This is the (re)entry point of the main loop in this function.
980 * The goal of this loop is to:
981 * 1) find the "check" substring in the region rx_origin..strend
b2ad2123 982 * (adjusted by start_shift / end_shift). If not found, reject
66b7ec5c
DM
983 * immediately.
984 * 2) If it exists, look for the "other" substr too if defined; for
985 * example, if the check substr maps to the anchored substr, then
986 * check the floating substr, and vice-versa. If not found, go
987 * back to (1) with rx_origin suitably incremented.
988 * 3) If we find an rx_origin position that doesn't contradict
989 * either of the substrings, then check the possible additional
990 * constraints on rx_origin of /^.../m or a known start class.
991 * If these fail, then depending on which constraints fail, jump
992 * back to here, or to various other re-entry points further along
993 * that skip some of the first steps.
994 * 4) If we pass all those tests, update the BmUSEFUL() count on the
995 * substring. If the start position was determined to be at the
996 * beginning of the string - so, not rejected, but not optimised,
997 * since we have to run regmatch from position 0 - decrement the
998 * BmUSEFUL() count. Otherwise increment it.
999 */
1000
1a4edc3c
DM
1001
1002 /* first, look for the 'check' substring */
1003
1de06328 1004 {
c33e64f0
FC
1005 U8* start_point;
1006 U8* end_point;
c889ccc8 1007
c889ccc8 1008 DEBUG_OPTIMISE_MORE_r({
6ad9a8ab 1009 Perl_re_printf( aTHX_
147e3846
KW
1010 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1011 " Start shift: %" IVdf " End shift %" IVdf
1012 " Real end Shift: %" IVdf "\n",
675e93ee 1013 (IV)(rx_origin - strbeg),
c889ccc8 1014 (IV)prog->check_offset_min,
b2ad2123
DM
1015 (IV)start_shift,
1016 (IV)end_shift,
c889ccc8
DM
1017 (IV)prog->check_end_shift);
1018 });
1de06328 1019
b2ad2123 1020 end_point = HOPBACK3(strend, end_shift, rx_origin);
bb152a4b
DM
1021 if (!end_point)
1022 goto fail_finish;
b2ad2123 1023 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
33c28ab2
DM
1024 if (!start_point)
1025 goto fail_finish;
c889ccc8 1026
557f47af 1027
e0362b86 1028 /* If the regex is absolutely anchored to either the start of the
d3d47aac 1029 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
1030 * check_offset_max represents an upper bound on the string where
1031 * the substr could start. For the ANCH_GPOS case, we assume that
1032 * the caller of intuit will have already set strpos to
1033 * pos()-gofs, so in this case strpos + offset_max will still be
1034 * an upper bound on the substr.
1035 */
c19c836a
DM
1036 if (!ml_anch
1037 && prog->intflags & PREGf_ANCH
e0362b86 1038 && prog->check_offset_max != SSize_t_MAX)
c19c836a 1039 {
b2ad2123 1040 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
1041 const char * const anchor =
1042 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
b2ad2123 1043 SSize_t targ_len = (char*)end_point - anchor;
2ce94a86 1044
b2ad2123 1045 if (check_len > targ_len) {
2ce94a86
DM
1046 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1047 "Anchored string too short...\n"));
1048 goto fail_finish;
1049 }
e0362b86
DM
1050
1051 /* do a bytes rather than chars comparison. It's conservative;
1052 * so it skips doing the HOP if the result can't possibly end
1053 * up earlier than the old value of end_point.
1054 */
b2ad2123
DM
1055 assert(anchor + check_len <= (char *)end_point);
1056 if (prog->check_offset_max + check_len < targ_len) {
e0362b86
DM
1057 end_point = HOP3lim((U8*)anchor,
1058 prog->check_offset_max,
b2ad2123 1059 end_point - check_len
2ce94a86 1060 )
b2ad2123 1061 + check_len;
e0362b86 1062 }
d6ef1678
DM
1063 }
1064
ae5d4331 1065 check_at = fbm_instr( start_point, end_point,
7fba1cd6 1066 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 1067
6ad9a8ab 1068 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1069 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
675e93ee
DM
1070 (IV)((char*)start_point - strbeg),
1071 (IV)((char*)end_point - strbeg),
1072 (IV)(check_at ? check_at - strbeg : -1)
1073 ));
1074
8fd34720
DM
1075 /* Update the count-of-usability, remove useless subpatterns,
1076 unshift s. */
1077
1078 DEBUG_EXECUTE_r({
1079 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1080 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
6ad9a8ab 1081 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
8fd34720
DM
1082 (check_at ? "Found" : "Did not find"),
1083 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1084 ? "anchored" : "floating"),
1085 quoted,
1086 RE_SV_TAIL(check),
1087 (check_at ? " at offset " : "...\n") );
1088 });
2c2d71f5 1089
8fd34720
DM
1090 if (!check_at)
1091 goto fail_finish;
8fd34720
DM
1092 /* set rx_origin to the minimum position where the regex could start
1093 * matching, given the constraint of the just-matched check substring.
1094 * But don't set it lower than previously.
1095 */
fdc003fd 1096
8fd34720
DM
1097 if (check_at - rx_origin > prog->check_offset_max)
1098 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee 1099 /* Finish the diagnostic message */
6ad9a8ab 1100 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1101 "%ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1102 (long)(check_at - strbeg),
1103 (IV)(rx_origin - strbeg)
1104 ));
8fd34720 1105 }
fdc003fd
DM
1106
1107
1a4edc3c 1108 /* now look for the 'other' substring if defined */
2c2d71f5 1109
6480a6c4
DM
1110 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1111 : prog->substrs->data[other_ix].substr)
1de06328 1112 {
30944b6d 1113 /* Take into account the "other" substring. */
6c3343a6
DM
1114 char *last, *last1;
1115 char *s;
1116 SV* must;
1117 struct reg_substr_datum *other;
1118
1119 do_other_substr:
1120 other = &prog->substrs->data[other_ix];
1121
1122 /* if "other" is anchored:
1123 * we've previously found a floating substr starting at check_at.
1124 * This means that the regex origin must lie somewhere
1125 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1126 * and max: HOP3(check_at, -check_offset_min)
1127 * (except that min will be >= strpos)
1128 * So the fixed substr must lie somewhere between
1129 * HOP3(min, anchored_offset)
1130 * HOP3(max, anchored_offset) + SvCUR(substr)
1131 */
1132
1133 /* if "other" is floating
1134 * Calculate last1, the absolute latest point where the
1135 * floating substr could start in the string, ignoring any
1136 * constraints from the earlier fixed match. It is calculated
1137 * as follows:
1138 *
1139 * strend - prog->minlen (in chars) is the absolute latest
1140 * position within the string where the origin of the regex
1141 * could appear. The latest start point for the floating
1142 * substr is float_min_offset(*) on from the start of the
1143 * regex. last1 simply combines thee two offsets.
1144 *
1145 * (*) You might think the latest start point should be
1146 * float_max_offset from the regex origin, and technically
1147 * you'd be correct. However, consider
1148 * /a\d{2,4}bcd\w/
1149 * Here, float min, max are 3,5 and minlen is 7.
1150 * This can match either
1151 * /a\d\dbcd\w/
1152 * /a\d\d\dbcd\w/
1153 * /a\d\d\d\dbcd\w/
1154 * In the first case, the regex matches minlen chars; in the
1155 * second, minlen+1, in the third, minlen+2.
1156 * In the first case, the floating offset is 3 (which equals
1157 * float_min), in the second, 4, and in the third, 5 (which
1158 * equals float_max). In all cases, the floating string bcd
1159 * can never start more than 4 chars from the end of the
1160 * string, which equals minlen - float_min. As the substring
1161 * starts to match more than float_min from the start of the
1162 * regex, it makes the regex match more than minlen chars,
1163 * and the two cancel each other out. So we can always use
1164 * float_min - minlen, rather than float_max - minlen for the
1165 * latest position in the string.
1166 *
1167 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1168 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1169 */
1170
e7a14a9c 1171 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1172 last1 = HOP3c(strend,
1173 other->min_offset - prog->minlen, strbeg);
1174
4d006249 1175 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1176 /* last is the latest point where the floating substr could
1177 * start, *given* any constraints from the earlier fixed
1178 * match. This constraint is that the floating string starts
1179 * <= float_max_offset chars from the regex origin (rx_origin).
1180 * If this value is less than last1, use it instead.
eb3831ce 1181 */
6c3343a6
DM
1182 assert(rx_origin <= last1);
1183 last =
1184 /* this condition handles the offset==infinity case, and
1185 * is a short-cut otherwise. Although it's comparing a
1186 * byte offset to a char length, it does so in a safe way,
1187 * since 1 char always occupies 1 or more bytes,
1188 * so if a string range is (last1 - rx_origin) bytes,
1189 * it will be less than or equal to (last1 - rx_origin)
1190 * chars; meaning it errs towards doing the accurate HOP3
1191 * rather than just using last1 as a short-cut */
1192 (last1 - rx_origin) < other->max_offset
1193 ? last1
1194 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1195 }
1196 else {
b2ad2123
DM
1197 assert(strpos + start_shift <= check_at);
1198 last = HOP4c(check_at, other->min_offset - start_shift,
6c3343a6
DM
1199 strbeg, strend);
1200 }
ead917d0 1201
6c3343a6
DM
1202 s = HOP3c(rx_origin, other->min_offset, strend);
1203 if (s < other_last) /* These positions already checked */
1204 s = other_last;
1205
1206 must = utf8_target ? other->utf8_substr : other->substr;
1207 assert(SvPOK(must));
675e93ee
DM
1208 {
1209 char *from = s;
1210 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1211
71a9d105
DM
1212 if (to > strend)
1213 to = strend;
88203927
DM
1214 if (from > to) {
1215 s = NULL;
6ad9a8ab 1216 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1217 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
88203927
DM
1218 (IV)(from - strbeg),
1219 (IV)(to - strbeg)
1220 ));
1221 }
1222 else {
1223 s = fbm_instr(
1224 (unsigned char*)from,
1225 (unsigned char*)to,
1226 must,
1227 multiline ? FBMrf_MULTILINE : 0
1228 );
6ad9a8ab 1229 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1230 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
88203927
DM
1231 (IV)(from - strbeg),
1232 (IV)(to - strbeg),
1233 (IV)(s ? s - strbeg : -1)
1234 ));
1235 }
675e93ee
DM
1236 }
1237
6c3343a6
DM
1238 DEBUG_EXECUTE_r({
1239 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1240 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 1241 Perl_re_printf( aTHX_ " %s %s substr %s%s",
6c3343a6
DM
1242 s ? "Found" : "Contradicts",
1243 other_ix ? "floating" : "anchored",
1244 quoted, RE_SV_TAIL(must));
1245 });
ead917d0 1246
ead917d0 1247
6c3343a6
DM
1248 if (!s) {
1249 /* last1 is latest possible substr location. If we didn't
1250 * find it before there, we never will */
1251 if (last >= last1) {
6ad9a8ab 1252 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee 1253 "; giving up...\n"));
6c3343a6 1254 goto fail_finish;
ead917d0
DM
1255 }
1256
6c3343a6
DM
1257 /* try to find the check substr again at a later
1258 * position. Maybe next time we'll find the "other" substr
1259 * in range too */
6c3343a6
DM
1260 other_last = HOP3c(last, 1, strend) /* highest failure */;
1261 rx_origin =
4d006249 1262 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1263 ? HOP3c(rx_origin, 1, strend)
1264 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
6ad9a8ab 1265 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1266 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1267 (other_ix ? "floating" : "anchored"),
1268 (long)(HOP3c(check_at, 1, strend) - strbeg),
1269 (IV)(rx_origin - strbeg)
1270 ));
6c3343a6
DM
1271 goto restart;
1272 }
1273 else {
4d006249 1274 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1275 /* other_last is set to s, not s+1, since its possible for
1276 * a floating substr to fail first time, then succeed
1277 * second time at the same floating position; e.g.:
1278 * "-AB--AABZ" =~ /\wAB\d*Z/
1279 * The first time round, anchored and float match at
1280 * "-(AB)--AAB(Z)" then fail on the initial \w character
1281 * class. Second time round, they match at "-AB--A(AB)(Z)".
1282 */
1283 other_last = s;
ead917d0
DM
1284 }
1285 else {
6c3343a6
DM
1286 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1287 other_last = HOP3c(s, 1, strend);
ead917d0 1288 }
6ad9a8ab 1289 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1290 " at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1291 (long)(s - strbeg),
1292 (IV)(rx_origin - strbeg)
1293 ));
1294
6c3343a6 1295 }
cad2e5aa 1296 }
acba93e8
DM
1297 else {
1298 DEBUG_OPTIMISE_MORE_r(
6ad9a8ab 1299 Perl_re_printf( aTHX_
147e3846
KW
1300 " Check-only match: offset min:%" IVdf " max:%" IVdf
1301 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1302 " strend:%" IVdf "\n",
acba93e8
DM
1303 (IV)prog->check_offset_min,
1304 (IV)prog->check_offset_max,
675e93ee
DM
1305 (IV)(check_at-strbeg),
1306 (IV)(rx_origin-strbeg),
1c1c599d 1307 (IV)(rx_origin-check_at),
675e93ee 1308 (IV)(strend-strbeg)
acba93e8
DM
1309 )
1310 );
1311 }
2c2d71f5 1312
acba93e8 1313 postprocess_substr_matches:
0991020e 1314
1a4edc3c 1315 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1316
7d2d37f5 1317 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1318 char *s;
1319
6ad9a8ab 1320 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a62659bd 1321 " looking for /^/m anchor"));
d0880ea7
DM
1322
1323 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1324 * Find the next \n, if any, even if it's beyond the current
1325 * anchored and/or floating substrings. Whether we should be
1326 * scanning ahead for the next \n or the next substr is debatable.
1327 * On the one hand you'd expect rare substrings to appear less
1328 * often than \n's. On the other hand, searching for \n means
675e93ee 1329 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1330 * iteration as the current "rarest" string candidate, which
1331 * means for example that we'll quickly reject the whole string if
1332 * hasn't got a \n, rather than trying every substr position
1333 * first
1334 */
d0880ea7 1335
4620cb61
DM
1336 s = HOP3c(strend, - prog->minlen, strpos);
1337 if (s <= rx_origin ||
1338 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1339 {
6ad9a8ab 1340 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d0880ea7
DM
1341 " Did not find /%s^%s/m...\n",
1342 PL_colors[0], PL_colors[1]));
a62659bd
DM
1343 goto fail_finish;
1344 }
d0880ea7 1345
4ada1233
DM
1346 /* earliest possible origin is 1 char after the \n.
1347 * (since *rx_origin == '\n', it's safe to ++ here rather than
1348 * HOP(rx_origin, 1)) */
1349 rx_origin++;
d0880ea7 1350
f4f115de 1351 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1352 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1353 {
d0880ea7
DM
1354 /* Position contradicts check-string; either because
1355 * check was anchored (and thus has no wiggle room),
4ada1233 1356 * or check was float and rx_origin is above the float range */
6ad9a8ab 1357 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1358 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1359 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1360 goto restart;
1361 }
1362
1363 /* if we get here, the check substr must have been float,
2e759faa 1364 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1365 * "other" substr which still contradicts */
1366 assert(prog->substrs->check_ix); /* check is float */
1367
1368 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1369 /* whoops, the anchored "other" substr exists, so we still
1370 * contradict. On the other hand, the float "check" substr
1371 * didn't contradict, so just retry the anchored "other"
1372 * substr */
6ad9a8ab 1373 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1374 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
d0880ea7 1375 PL_colors[0], PL_colors[1],
73e8ff00
DM
1376 (IV)(rx_origin - strbeg + prog->anchored_offset),
1377 (IV)(rx_origin - strbeg)
675e93ee 1378 ));
d0880ea7
DM
1379 goto do_other_substr;
1380 }
1381
1382 /* success: we don't contradict the found floating substring
1383 * (and there's no anchored substr). */
6ad9a8ab 1384 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1385 " Found /%s^%s/m with rx_origin %ld...\n",
1386 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1387 }
1388 else {
6ad9a8ab 1389 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
fe4f3442 1390 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1391 }
1392
ffad1e6a 1393 success_at_start:
e3c6feb0 1394
cad2e5aa 1395
dd170ff5
DM
1396 /* if we have a starting character class, then test that extra constraint.
1397 * (trie stclasses are too expensive to use here, we are better off to
1398 * leave it to regmatch itself) */
1399
f8fc2ecf 1400 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1401 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1402
b2ad2123
DM
1403 /* XXX this value could be pre-computed */
1404 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1405 ? (reginfo->is_utf8_pat
1406 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1407 : STR_LEN(progi->regstclass))
66e933ab 1408 : 1);
1de06328 1409 char * endpos;
fa3bb21d 1410 char *s;
000dfd2d
DM
1411 /* latest pos that a matching float substr constrains rx start to */
1412 char *rx_max_float = NULL;
1413
c75a3985
DM
1414 /* if the current rx_origin is anchored, either by satisfying an
1415 * anchored substring constraint, or a /^.../m constraint, then we
1416 * can reject the current origin if the start class isn't found
1417 * at the current position. If we have a float-only match, then
1418 * rx_origin is constrained to a range; so look for the start class
1419 * in that range. if neither, then look for the start class in the
1420 * whole rest of the string */
1421
dd170ff5
DM
1422 /* XXX DAPM it's not clear what the minlen test is for, and why
1423 * it's not used in the floating case. Nothing in the test suite
1424 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1425 * Here are some old comments, which may or may not be correct:
1426 *
1427 * minlen == 0 is possible if regstclass is \b or \B,
1428 * and the fixed substr is ''$.
1429 * Since minlen is already taken into account, rx_origin+1 is
1430 * before strend; accidentally, minlen >= 1 guaranties no false
1431 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1432 * 0) below assumes that regstclass does not come from lookahead...
1433 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1434 * This leaves EXACTF-ish only, which are dealt with in
1435 * find_byclass().
1436 */
1437
7d2d37f5 1438 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
b2ad2123 1439 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d 1440 else if (prog->float_substr || prog->float_utf8) {
b2ad2123
DM
1441 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1442 endpos = HOP3clim(rx_max_float, cl_l, strend);
000dfd2d 1443 }
1de06328
YO
1444 else
1445 endpos= strend;
1446
6ad9a8ab 1447 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
b2ad2123 1448 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
147e3846 1449 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
b2ad2123 1450 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1451 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1452
c43b5520 1453 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1454 reginfo);
be778b1a 1455 if (!s) {
6eb5f6b9 1456 if (endpos == strend) {
6ad9a8ab 1457 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1458 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1459 goto fail;
1460 }
6ad9a8ab 1461 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1462 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1463 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1464 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1465 goto fail;
9fed8d02 1466
6eb5f6b9 1467 /* Contradict one of substrings */
97136c8a
DM
1468 if (prog->anchored_substr || prog->anchored_utf8) {
1469 if (prog->substrs->check_ix == 1) { /* check is float */
1470 /* Have both, check_string is floating */
b2ad2123
DM
1471 assert(rx_origin + start_shift <= check_at);
1472 if (rx_origin + start_shift != check_at) {
97136c8a 1473 /* not at latest position float substr could match:
c75a3985
DM
1474 * Recheck anchored substring, but not floating.
1475 * The condition above is in bytes rather than
1476 * chars for efficiency. It's conservative, in
1477 * that it errs on the side of doing 'goto
88203927
DM
1478 * do_other_substr'. In this case, at worst,
1479 * an extra anchored search may get done, but in
1480 * practice the extra fbm_instr() is likely to
1481 * get skipped anyway. */
6ad9a8ab 1482 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1483 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1484 (long)(other_last - strbeg),
1485 (IV)(rx_origin - strbeg)
1486 ));
97136c8a 1487 goto do_other_substr;
3369914b 1488 }
3369914b
DM
1489 }
1490 }
97136c8a 1491 else {
9fed8d02
DM
1492 /* float-only */
1493
7d2d37f5 1494 if (ml_anch) {
c75a3985
DM
1495 /* In the presence of ml_anch, we might be able to
1496 * find another \n without breaking the current float
1497 * constraint. */
1498
1499 /* strictly speaking this should be HOP3c(..., 1, ...),
1500 * but since we goto a block of code that's going to
1501 * search for the next \n if any, its safe here */
9fed8d02 1502 rx_origin++;
6ad9a8ab 1503 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
675e93ee 1504 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1505 PL_colors[0], PL_colors[1],
675e93ee 1506 (long)(rx_origin - strbeg)) );
9fed8d02 1507 goto postprocess_substr_matches;
ab60c45a 1508 }
c75a3985
DM
1509
1510 /* strictly speaking this can never be true; but might
1511 * be if we ever allow intuit without substrings */
1512 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1513 goto fail;
c75a3985 1514
000dfd2d 1515 rx_origin = rx_max_float;
9fed8d02
DM
1516 }
1517
c75a3985
DM
1518 /* at this point, any matching substrings have been
1519 * contradicted. Start again... */
1520
9fed8d02 1521 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1522
1523 /* uses bytes rather than char calculations for efficiency.
1524 * It's conservative: it errs on the side of doing 'goto restart',
1525 * where there is code that does a proper char-based test */
b2ad2123 1526 if (rx_origin + start_shift + end_shift > strend) {
6ad9a8ab 1527 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
9fed8d02
DM
1528 " Could not match STCLASS...\n") );
1529 goto fail;
1530 }
6ad9a8ab 1531 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1532 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
9fed8d02 1533 (prog->substrs->check_ix ? "floating" : "anchored"),
b2ad2123 1534 (long)(rx_origin + start_shift - strbeg),
675e93ee
DM
1535 (IV)(rx_origin - strbeg)
1536 ));
9fed8d02 1537 goto restart;
6eb5f6b9 1538 }
9fed8d02 1539
c75a3985
DM
1540 /* Success !!! */
1541
5f9c6575 1542 if (rx_origin != s) {
6ad9a8ab 1543 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1544 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1545 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1546 );
1547 }
1548 else {
6ad9a8ab 1549 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1550 " Does not contradict STCLASS...\n");
b7953727
JH
1551 );
1552 }
6eb5f6b9 1553 }
ffad1e6a
DM
1554
1555 /* Decide whether using the substrings helped */
1556
1557 if (rx_origin != strpos) {
1558 /* Fixed substring is found far enough so that the match
1559 cannot start at strpos. */
1560
6ad9a8ab 1561 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
ffad1e6a
DM
1562 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1563 }
1564 else {
70563e16
DM
1565 /* The found rx_origin position does not prohibit matching at
1566 * strpos, so calling intuit didn't gain us anything. Decrement
1567 * the BmUSEFUL() count on the check substring, and if we reach
1568 * zero, free it. */
1569 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1570 && (utf8_target ? (
1571 prog->check_utf8 /* Could be deleted already */
1572 && --BmUSEFUL(prog->check_utf8) < 0
1573 && (prog->check_utf8 == prog->float_utf8)
1574 ) : (
1575 prog->check_substr /* Could be deleted already */
1576 && --BmUSEFUL(prog->check_substr) < 0
1577 && (prog->check_substr == prog->float_substr)
1578 )))
1579 {
1580 /* If flags & SOMETHING - do not do it many times on the same match */
6ad9a8ab 1581 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
ffad1e6a
DM
1582 /* XXX Does the destruction order has to change with utf8_target? */
1583 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1584 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1585 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1586 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1587 check = NULL; /* abort */
ffad1e6a
DM
1588 /* XXXX This is a remnant of the old implementation. It
1589 looks wasteful, since now INTUIT can use many
1590 other heuristics. */
1591 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1592 }
1593 }
1594
6ad9a8ab 1595 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
ffad1e6a 1596 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1597 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1598
c765d6e0 1599 return rx_origin;
2c2d71f5
JH
1600
1601 fail_finish: /* Substring not found */
33b8afdf 1602 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1603 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1604 fail:
6ad9a8ab 1605 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
e4584336 1606 PL_colors[4], PL_colors[5]));
bd61b366 1607 return NULL;
cad2e5aa 1608}
9661b544 1609
70563e16 1610
a0a388a1 1611#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1612 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78
KW
1613 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1614 trie_utf8l, trie_flu8 } \
e7fd4aa1
KW
1615 trie_type = ((scan->flags == EXACT) \
1616 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1617 : (scan->flags == EXACTL) \
1618 ? (utf8_target ? trie_utf8l : trie_plain) \
1619 : (scan->flags == EXACTFA) \
1620 ? (utf8_target \
1621 ? trie_utf8_exactfa_fold \
1622 : trie_latin_utf8_exactfa_fold) \
1623 : (scan->flags == EXACTFLU8 \
1624 ? trie_flu8 \
1625 : (utf8_target \
1626 ? trie_utf8_fold \
1627 : trie_latin_utf8_fold)))
fab2782b 1628
fd3249ee 1629#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1630STMT_START { \
fab2782b 1631 STRLEN skiplen; \
baa60164 1632 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1633 switch (trie_type) { \
a4525e78 1634 case trie_flu8: \
780fcc9f 1635 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1636 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1637 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1638 } \
a4525e78 1639 goto do_trie_utf8_fold; \
31f05a37 1640 case trie_utf8_exactfa_fold: \
baa60164 1641 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1642 /* FALLTHROUGH */ \
fab2782b 1643 case trie_utf8_fold: \
a4525e78 1644 do_trie_utf8_fold: \
fab2782b 1645 if ( foldlen>0 ) { \
c80e42f3 1646 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1647 foldlen -= len; \
1648 uscan += len; \
1649 len=0; \
1650 } else { \
fab2782b 1651 len = UTF8SKIP(uc); \
a1a5ec35
KW
1652 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen, \
1653 flags); \
5f560d8a 1654 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1655 foldlen -= skiplen; \
1656 uscan = foldbuf + skiplen; \
1657 } \
1658 break; \
baa60164
KW
1659 case trie_latin_utf8_exactfa_fold: \
1660 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1661 /* FALLTHROUGH */ \
fab2782b
YO
1662 case trie_latin_utf8_fold: \
1663 if ( foldlen>0 ) { \
c80e42f3 1664 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1665 foldlen -= len; \
1666 uscan += len; \
1667 len=0; \
1668 } else { \
1669 len = 1; \
31f05a37 1670 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1671 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1672 foldlen -= skiplen; \
1673 uscan = foldbuf + skiplen; \
1674 } \
1675 break; \
a4525e78 1676 case trie_utf8l: \
780fcc9f 1677 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1678 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1679 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1680 } \
780fcc9f 1681 /* FALLTHROUGH */ \
fab2782b 1682 case trie_utf8: \
c80e42f3 1683 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1684 break; \
1685 case trie_plain: \
1686 uvc = (UV)*uc; \
1687 len = 1; \
1688 } \
1689 if (uvc < 256) { \
1690 charid = trie->charmap[ uvc ]; \
1691 } \
1692 else { \
1693 charid = 0; \
1694 if (widecharmap) { \
1695 SV** const svpp = hv_fetch(widecharmap, \
1696 (char*)&uvc, sizeof(UV), 0); \
1697 if (svpp) \
1698 charid = (U16)SvIV(*svpp); \
1699 } \
1700 } \
4cadc6a9
YO
1701} STMT_END
1702
cb41e5d6 1703#define DUMP_EXEC_POS(li,s,doutf8,depth) \
ae7c5b9b 1704 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
cb41e5d6 1705 startpos, doutf8, depth)
ae7c5b9b 1706
c84a03c5 1707#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1708STMT_START { \
1709 while (s <= e) { \
c84a03c5 1710 if ( (COND) \
fac1af77 1711 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1712 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1713 goto got_it; \
1714 s++; \
1715 } \
1716} STMT_END
1717
c84a03c5 1718#define REXEC_FBC_UTF8_SCAN(CODE) \
4cadc6a9 1719STMT_START { \
9a902117 1720 while (s < strend) { \
c84a03c5 1721 CODE \
9a902117 1722 s += UTF8SKIP(s); \
4cadc6a9
YO
1723 } \
1724} STMT_END
1725
c84a03c5 1726#define REXEC_FBC_SCAN(CODE) \
4cadc6a9
YO
1727STMT_START { \
1728 while (s < strend) { \
c84a03c5 1729 CODE \
4cadc6a9
YO
1730 s++; \
1731 } \
1732} STMT_END
1733
05bd126c
KW
1734#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1735REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1736 if (COND) { \
1737 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1738 goto got_it; \
1739 else \
1740 tmp = doevery; \
1741 } \
1742 else \
1743 tmp = 1; \
4cadc6a9
YO
1744)
1745
05bd126c
KW
1746#define REXEC_FBC_CLASS_SCAN(COND) \
1747REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1748 if (COND) { \
1749 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1750 goto got_it; \
1751 else \
1752 tmp = doevery; \
1753 } \
1754 else \
1755 tmp = 1; \
4cadc6a9
YO
1756)
1757
c84a03c5 1758#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1759 if (utf8_target) { \
c84a03c5 1760 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
e1d1eefb
YO
1761 } \
1762 else { \
c84a03c5 1763 REXEC_FBC_CLASS_SCAN(COND); \
d981ef24 1764 }
05bd126c 1765
05bd126c
KW
1766/* The three macros below are slightly different versions of the same logic.
1767 *
1768 * The first is for /a and /aa when the target string is UTF-8. This can only
1769 * match ascii, but it must advance based on UTF-8. The other two handle the
1770 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1771 * for the boundary (or non-boundary) between a word and non-word character.
1772 * The utf8 and non-utf8 cases have the same logic, but the details must be
1773 * different. Find the "wordness" of the character just prior to this one, and
1774 * compare it with the wordness of this one. If they differ, we have a
1775 * boundary. At the beginning of the string, pretend that the previous
1776 * character was a new-line.
1777 *
1778 * All these macros uncleanly have side-effects with each other and outside
1779 * variables. So far it's been too much trouble to clean-up
1780 *
1781 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1782 * a word character or not.
1783 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1784 * word/non-word
1785 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1786 *
1787 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1788 * are looking for a boundary or for a non-boundary. If we are looking for a
1789 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1790 * see if this tentative match actually works, and if so, to quit the loop
1791 * here. And vice-versa if we are looking for a non-boundary.
1792 *
1793 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1794 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1795 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1796 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1797 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1798 * complement. But in that branch we complement tmp, meaning that at the
1799 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1800 * which means at the top of the loop in the next iteration, it is
1801 * TEST_NON_UTF8(s-1) */
b2f4e957 1802#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1803 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1804 tmp = TEST_NON_UTF8(tmp); \
1805 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1806 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1807 tmp = !tmp; \
1808 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1809 } \
1810 else { \
1811 IF_FAIL; \
1812 } \
1813 ); \
1814
1815/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1816 * TEST_UTF8 is a macro that for the same input code points returns identically
1817 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 1818#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1819 if (s == reginfo->strbeg) { \
1820 tmp = '\n'; \
1821 } \
1822 else { /* Back-up to the start of the previous character */ \
1823 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1824 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1825 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1826 } \
1827 tmp = TEST_UV(tmp); \
1828 LOAD_UTF8_CHARCLASS_ALNUM(); \
1829 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
7a207065 1830 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
05bd126c
KW
1831 tmp = !tmp; \
1832 IF_SUCCESS; \
1833 } \
1834 else { \
1835 IF_FAIL; \
1836 } \
1837 );
cfaf538b 1838
05bd126c
KW
1839/* Like the above two macros. UTF8_CODE is the complete code for handling
1840 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1841 * macros below */
baa60164 1842#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1843 if (utf8_target) { \
05bd126c 1844 UTF8_CODE \
63ac0dad
KW
1845 } \
1846 else { /* Not utf8 */ \
9d9163fb 1847 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1848 tmp = TEST_NON_UTF8(tmp); \
05bd126c 1849 REXEC_FBC_SCAN( /* advances s while s < strend */ \
63ac0dad 1850 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1851 IF_SUCCESS; \
760cfa8e 1852 tmp = !tmp; \
63ac0dad
KW
1853 } \
1854 else { \
1855 IF_FAIL; \
1856 } \
1857 ); \
1858 } \
c8519dc7
KW
1859 /* Here, things have been set up by the previous code so that tmp is the \
1860 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1861 * utf8ness of the target). We also have to check if this matches against \
1862 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1863 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1864 * string */ \
1865 if (tmp == ! TEST_NON_UTF8('\n')) { \
1866 IF_SUCCESS; \
1867 } \
1868 else { \
1869 IF_FAIL; \
1870 }
63ac0dad 1871
ae7c5b9b
KW
1872/* This is the macro to use when we want to see if something that looks like it
1873 * could match, actually does, and if so exits the loop */
1874#define REXEC_FBC_TRYIT \
1875 if ((reginfo->intuit || regtry(reginfo, &s))) \
1876 goto got_it
1877
1878/* The only difference between the BOUND and NBOUND cases is that
1879 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1880 * NBOUND. This is accomplished by passing it as either the if or else clause,
1881 * with the other one being empty (PLACEHOLDER is defined as empty).
1882 *
1883 * The TEST_FOO parameters are for operating on different forms of input, but
1884 * all should be ones that return identically for the same underlying code
1885 * points */
1886#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1887 FBC_BOUND_COMMON( \
1888 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1889 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1890
44129e46 1891#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1892 FBC_BOUND_COMMON( \
1893 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1894 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1895
1896#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1897 FBC_BOUND_COMMON( \
1898 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1899 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1900
44129e46 1901#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1902 FBC_BOUND_COMMON( \
1903 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1904 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1905
8bde5eaf
JH
1906#ifdef DEBUGGING
1907static IV
1908S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1909 IV cp_out = Perl__invlist_search(invlist, cp_in);
1910 assert(cp_out >= 0);
1911 return cp_out;
1912}
1913# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1914 invmap[S_get_break_val_cp_checked(invlist, cp)]
1915#else
1916# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1917 invmap[_invlist_search(invlist, cp)]
1918#endif
1919
64935bc6
KW
1920/* Takes a pointer to an inversion list, a pointer to its corresponding
1921 * inversion map, and a code point, and returns the code point's value
1922 * according to the two arrays. It assumes that all code points have a value.
1923 * This is used as the base macro for macros for particular properties */
1924#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
8bde5eaf 1925 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
64935bc6
KW
1926
1927/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1928 * of a code point, returning the value for the first code point in the string.
1929 * And it takes the particular macro name that finds the desired value given a
1930 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1931#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1932 (__ASSERT_(pos < strend) \
1933 /* Note assumes is valid UTF-8 */ \
1934 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1935
1936/* Returns the GCB value for the input code point */
1937#define getGCB_VAL_CP(cp) \
1938 _generic_GET_BREAK_VAL_CP( \
1939 PL_GCB_invlist, \
02f811dd 1940 _Perl_GCB_invmap, \
64935bc6
KW
1941 (cp))
1942
1943/* Returns the GCB value for the first code point in the UTF-8 encoded string
1944 * bounded by pos and strend */
1945#define getGCB_VAL_UTF8(pos, strend) \
1946 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 1947
6b659339
KW
1948/* Returns the LB value for the input code point */
1949#define getLB_VAL_CP(cp) \
1950 _generic_GET_BREAK_VAL_CP( \
1951 PL_LB_invlist, \
1952 _Perl_LB_invmap, \
1953 (cp))
1954
1955/* Returns the LB value for the first code point in the UTF-8 encoded string
1956 * bounded by pos and strend */
1957#define getLB_VAL_UTF8(pos, strend) \
1958 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1959
06ae2722
KW
1960
1961/* Returns the SB value for the input code point */
1962#define getSB_VAL_CP(cp) \
1963 _generic_GET_BREAK_VAL_CP( \
1964 PL_SB_invlist, \
bf4268fa 1965 _Perl_SB_invmap, \
06ae2722
KW
1966 (cp))
1967
1968/* Returns the SB value for the first code point in the UTF-8 encoded string
1969 * bounded by pos and strend */
1970#define getSB_VAL_UTF8(pos, strend) \
1971 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1972
ae3bb8ea
KW
1973/* Returns the WB value for the input code point */
1974#define getWB_VAL_CP(cp) \
1975 _generic_GET_BREAK_VAL_CP( \
1976 PL_WB_invlist, \
bf4268fa 1977 _Perl_WB_invmap, \
ae3bb8ea
KW
1978 (cp))
1979
1980/* Returns the WB value for the first code point in the UTF-8 encoded string
1981 * bounded by pos and strend */
1982#define getWB_VAL_UTF8(pos, strend) \
1983 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1984
786e8c11 1985/* We know what class REx starts with. Try to find this position... */
02d5137b 1986/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1987/* annoyingly all the vars in this routine have different names from their counterparts
1988 in regmatch. /grrr */
3c3eec57 1989STATIC char *
07be1b83 1990S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1991 const char *strend, regmatch_info *reginfo)
a687059c 1992{
73104a1b
KW
1993 dVAR;
1994 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1995 char *pat_string; /* The pattern's exactish string */
1996 char *pat_end; /* ptr to end char of pat_string */
1997 re_fold_t folder; /* Function for computing non-utf8 folds */
1998 const U8 *fold_array; /* array for folding ords < 256 */
1999 STRLEN ln;
2000 STRLEN lnc;
73104a1b
KW
2001 U8 c1;
2002 U8 c2;
2003 char *e;
2004 I32 tmp = 1; /* Scratch variable? */
ba44c216 2005 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 2006 UV utf8_fold_flags = 0;
f9176b44 2007 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
2008 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2009 with a result inverts that result, as 0^1 =
2010 1 and 1^1 = 0 */
2011 _char_class_number classnum;
2012
73104a1b 2013 RXi_GET_DECL(prog,progi);
2f7f8cb1 2014
73104a1b 2015 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 2016
73104a1b
KW
2017 /* We know what class it must start with. */
2018 switch (OP(c)) {
a4525e78 2019 case ANYOFL:
780fcc9f 2020 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 2021
d1c40ef5 2022 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
a0bd1a30
KW
2023 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2024 }
2025
780fcc9f 2026 /* FALLTHROUGH */
ac44c12e 2027 case ANYOFD:
73104a1b
KW
2028 case ANYOF:
2029 if (utf8_target) {
2030 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 2031 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b 2032 }
1451f692
DM
2033 else if (ANYOF_FLAGS(c)) {
2034 REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2035 }
73104a1b 2036 else {
1451f692 2037 REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
73104a1b
KW
2038 }
2039 break;
73104a1b 2040
098b07d5
KW
2041 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2042 assert(! is_utf8_pat);
924ba076 2043 /* FALLTHROUGH */
73104a1b 2044 case EXACTFA:
984e6dd1 2045 if (is_utf8_pat || utf8_target) {
73104a1b
KW
2046 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2047 goto do_exactf_utf8;
2048 }
2049 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
2050 folder = foldEQ_latin1; /* /a, except the sharp s one which */
2051 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 2052
2fdb7295
KW
2053 case EXACTF: /* This node only generated for non-utf8 patterns */
2054 assert(! is_utf8_pat);
73104a1b 2055 if (utf8_target) {
73104a1b
KW
2056 utf8_fold_flags = 0;
2057 goto do_exactf_utf8;
2058 }
2059 fold_array = PL_fold;
2060 folder = foldEQ;
2061 goto do_exactf_non_utf8;
2062
2063 case EXACTFL:
780fcc9f 2064 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 2065 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 2066 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
2067 goto do_exactf_utf8;
2068 }
2069 fold_array = PL_fold_locale;
2070 folder = foldEQ_locale;
2071 goto do_exactf_non_utf8;
3c760661 2072
73104a1b 2073 case EXACTFU_SS:
984e6dd1 2074 if (is_utf8_pat) {
73104a1b
KW
2075 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2076 }
2077 goto do_exactf_utf8;
16d951b7 2078
a4525e78
KW
2079 case EXACTFLU8:
2080 if (! utf8_target) { /* All code points in this node require
2081 UTF-8 to express. */
2082 break;
2083 }
613abc6d
KW
2084 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2085 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
2086 goto do_exactf_utf8;
2087
73104a1b 2088 case EXACTFU:
984e6dd1
DM
2089 if (is_utf8_pat || utf8_target) {
2090 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
2091 goto do_exactf_utf8;
2092 }
fac1af77 2093
73104a1b
KW
2094 /* Any 'ss' in the pattern should have been replaced by regcomp,
2095 * so we don't have to worry here about this single special case
2096 * in the Latin1 range */
2097 fold_array = PL_fold_latin1;
2098 folder = foldEQ_latin1;
2099
924ba076 2100 /* FALLTHROUGH */
73104a1b 2101
c52b8b12 2102 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
2103 are no glitches with fold-length differences
2104 between the target string and pattern */
2105
2106 /* The idea in the non-utf8 EXACTF* cases is to first find the
2107 * first character of the EXACTF* node and then, if necessary,
2108 * case-insensitively compare the full text of the node. c1 is the
2109 * first character. c2 is its fold. This logic will not work for
2110 * Unicode semantics and the german sharp ss, which hence should
2111 * not be compiled into a node that gets here. */
2112 pat_string = STRING(c);
2113 ln = STR_LEN(c); /* length to match in octets/bytes */
2114
2115 /* We know that we have to match at least 'ln' bytes (which is the
2116 * same as characters, since not utf8). If we have to match 3
2117 * characters, and there are only 2 availabe, we know without
2118 * trying that it will fail; so don't start a match past the
2119 * required minimum number from the far end */
ea3daa5d 2120 e = HOP3c(strend, -((SSize_t)ln), s);
dda01918
HS
2121 if (e < s)
2122 break;
fac1af77 2123
73104a1b
KW
2124 c1 = *pat_string;
2125 c2 = fold_array[c1];
2126 if (c1 == c2) { /* If char and fold are the same */
2127 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
2128 }
2129 else {
2130 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
2131 }
2132 break;
fac1af77 2133
c52b8b12
KW
2134 do_exactf_utf8:
2135 {
73104a1b
KW
2136 unsigned expansion;
2137
2138 /* If one of the operands is in utf8, we can't use the simpler folding
2139 * above, due to the fact that many different characters can have the
2140 * same fold, or portion of a fold, or different- length fold */
2141 pat_string = STRING(c);
2142 ln = STR_LEN(c); /* length to match in octets/bytes */
2143 pat_end = pat_string + ln;
984e6dd1 2144 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
2145 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2146 : ln;
2147
2148 /* We have 'lnc' characters to match in the pattern, but because of
2149 * multi-character folding, each character in the target can match
2150 * up to 3 characters (Unicode guarantees it will never exceed
2151 * this) if it is utf8-encoded; and up to 2 if not (based on the
2152 * fact that the Latin 1 folds are already determined, and the
2153 * only multi-char fold in that range is the sharp-s folding to
2154 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2155 * string character. Adjust lnc accordingly, rounding up, so that
2156 * if we need to match at least 4+1/3 chars, that really is 5. */
2157 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2158 lnc = (lnc + expansion - 1) / expansion;
2159
2160 /* As in the non-UTF8 case, if we have to match 3 characters, and
2161 * only 2 are left, it's guaranteed to fail, so don't start a
2162 * match that would require us to go beyond the end of the string
2163 */
ea3daa5d 2164 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 2165
73104a1b
KW
2166 /* XXX Note that we could recalculate e to stop the loop earlier,
2167 * as the worst case expansion above will rarely be met, and as we
2168 * go along we would usually find that e moves further to the left.
2169 * This would happen only after we reached the point in the loop
2170 * where if there were no expansion we should fail. Unclear if
2171 * worth the expense */
2172
2173 while (s <= e) {
2174 char *my_strend= (char *)strend;
2175 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 2176 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 2177 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
2178 {
2179 goto got_it;
2180 }
2181 s += (utf8_target) ? UTF8SKIP(s) : 1;
2182 }
2183 break;
2184 }
236d82fd 2185
73104a1b 2186 case BOUNDL:
780fcc9f 2187 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2188 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2189 if (! IN_UTF8_CTYPE_LOCALE) {
2190 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2191 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2192 }
64935bc6
KW
2193 goto do_boundu;
2194 }
2195
7a207065 2196 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2197 break;
64935bc6 2198
73104a1b 2199 case NBOUNDL:
780fcc9f 2200 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2201 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2202 if (! IN_UTF8_CTYPE_LOCALE) {
2203 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2204 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2205 }
64935bc6
KW
2206 goto do_nboundu;
2207 }
2208
7a207065 2209 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2210 break;
64935bc6
KW
2211
2212 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2213 meaning */
2214 assert(FLAGS(c) == TRADITIONAL_BOUND);
2215
7a207065 2216 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2217 break;
64935bc6
KW
2218
2219 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2220 meaning */
2221 assert(FLAGS(c) == TRADITIONAL_BOUND);
2222
44129e46 2223 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2224 break;
64935bc6
KW
2225
2226 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2227 meaning */
2228 assert(FLAGS(c) == TRADITIONAL_BOUND);
2229
7a207065 2230 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2231 break;
64935bc6
KW
2232
2233 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2234 meaning */
2235 assert(FLAGS(c) == TRADITIONAL_BOUND);
2236
44129e46 2237 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2238 break;
64935bc6 2239
73104a1b 2240 case NBOUNDU:
64935bc6 2241 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
7a207065 2242 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2243 break;
2244 }
2245
2246 do_nboundu:
2247
2248 to_complement = 1;
2249 /* FALLTHROUGH */
2250
2251 case BOUNDU:
2252 do_boundu:
2253 switch((bound_type) FLAGS(c)) {
2254 case TRADITIONAL_BOUND:
7a207065 2255 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2256 break;
2257 case GCB_BOUND:
a7a8bd1e 2258 if (s == reginfo->strbeg) {
67481c39 2259 if (reginfo->intuit || regtry(reginfo, &s))
64935bc6
KW
2260 {
2261 goto got_it;
2262 }
a7a8bd1e
KW
2263
2264 /* Didn't match. Try at the next position (if there is one) */
64935bc6 2265 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2266 if (UNLIKELY(s >= reginfo->strend)) {
2267 break;
2268 }
64935bc6
KW
2269 }
2270
2271 if (utf8_target) {
85e5f08b 2272 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2273 reghop3((U8*)s, -1,
2274 (U8*)(reginfo->strbeg)),
2275 (U8*) reginfo->strend);
2276 while (s < strend) {
85e5f08b 2277 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6 2278 (U8*) reginfo->strend);
b0e24409
KW
2279 if ( (to_complement ^ isGCB(before,
2280 after,
2281 (U8*) reginfo->strbeg,
2282 (U8*) s,
2283 utf8_target))
00e3344b
KW
2284 && (reginfo->intuit || regtry(reginfo, &s)))
2285 {
2286 goto got_it;
64935bc6 2287 }
43a7bd62 2288 before = after;
64935bc6
KW
2289 s += UTF8SKIP(s);
2290 }
2291 }
2292 else { /* Not utf8. Everything is a GCB except between CR and
2293 LF */
2294 while (s < strend) {
00e3344b
KW
2295 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2296 || UCHARAT(s) != '\n'))
2297 && (reginfo->intuit || regtry(reginfo, &s)))
64935bc6 2298 {
00e3344b 2299 goto got_it;
64935bc6 2300 }
43a7bd62 2301 s++;
64935bc6
KW
2302 }
2303 }
2304
6de80efc
KW
2305 /* And, since this is a bound, it can match after the final
2306 * character in the string */
67481c39 2307 if ((reginfo->intuit || regtry(reginfo, &s))) {
64935bc6
KW
2308 goto got_it;
2309 }
2310 break;
ae3bb8ea 2311
6b659339
KW
2312 case LB_BOUND:
2313 if (s == reginfo->strbeg) {
2314 if (reginfo->intuit || regtry(reginfo, &s)) {
2315 goto got_it;
2316 }
2317 s += (utf8_target) ? UTF8SKIP(s) : 1;
2318 if (UNLIKELY(s >= reginfo->strend)) {
2319 break;
2320 }
2321 }
2322
2323 if (utf8_target) {
2324 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2325 -1,
2326 (U8*)(reginfo->strbeg)),
2327 (U8*) reginfo->strend);
2328 while (s < strend) {
2329 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2330 if (to_complement ^ isLB(before,
2331 after,
2332 (U8*) reginfo->strbeg,
2333 (U8*) s,
2334 (U8*) reginfo->strend,
2335 utf8_target)
2336 && (reginfo->intuit || regtry(reginfo, &s)))
2337 {
2338 goto got_it;
2339 }
2340 before = after;
2341 s += UTF8SKIP(s);
2342 }
2343 }
2344 else { /* Not utf8. */
2345 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2346 while (s < strend) {
2347 LB_enum after = getLB_VAL_CP((U8) *s);
2348 if (to_complement ^ isLB(before,
2349 after,
2350 (U8*) reginfo->strbeg,
2351 (U8*) s,
2352 (U8*) reginfo->strend,
2353 utf8_target)
2354 && (reginfo->intuit || regtry(reginfo, &s)))
2355 {
2356 goto got_it;
2357 }
2358 before = after;
2359 s++;
2360 }
2361 }
2362
2363 if (reginfo->intuit || regtry(reginfo, &s)) {
2364 goto got_it;
2365 }
2366
2367 break;
2368
06ae2722 2369 case SB_BOUND:
a7a8bd1e 2370 if (s == reginfo->strbeg) {
67481c39 2371 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2372 goto got_it;
2373 }
06ae2722 2374 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2375 if (UNLIKELY(s >= reginfo->strend)) {
2376 break;
2377 }
06ae2722
KW
2378 }
2379
2380 if (utf8_target) {
85e5f08b 2381 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2382 -1,
2383 (U8*)(reginfo->strbeg)),
2384 (U8*) reginfo->strend);
2385 while (s < strend) {
85e5f08b 2386 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722 2387 (U8*) reginfo->strend);
00e3344b
KW
2388 if ((to_complement ^ isSB(before,
2389 after,
2390 (U8*) reginfo->strbeg,
2391 (U8*) s,
2392 (U8*) reginfo->strend,
2393 utf8_target))
2394 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2395 {
00e3344b 2396 goto got_it;
06ae2722 2397 }
43a7bd62 2398 before = after;
06ae2722
KW
2399 s += UTF8SKIP(s);
2400 }
2401 }
2402 else { /* Not utf8. */
85e5f08b 2403 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2404 while (s < strend) {
85e5f08b 2405 SB_enum after = getSB_VAL_CP((U8) *s);
00e3344b
KW
2406 if ((to_complement ^ isSB(before,
2407 after,
2408 (U8*) reginfo->strbeg,
2409 (U8*) s,
2410 (U8*) reginfo->strend,
2411 utf8_target))
2412 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2413 {
00e3344b 2414 goto got_it;
06ae2722 2415 }
43a7bd62 2416 before = after;
06ae2722
KW
2417 s++;
2418 }
2419 }
2420
2421 /* Here are at the final position in the target string. The SB
2422 * value is always true here, so matches, depending on other
2423 * constraints */
67481c39 2424 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2425 goto got_it;
2426 }
2427
2428 break;
2429
ae3bb8ea
KW
2430 case WB_BOUND:
2431 if (s == reginfo->strbeg) {
67481c39 2432 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2433 goto got_it;
2434 }
2435 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2436 if (UNLIKELY(s >= reginfo->strend)) {
2437 break;
2438 }
ae3bb8ea
KW
2439 }
2440
2441 if (utf8_target) {
2442 /* We are at a boundary between char_sub_0 and char_sub_1.
2443 * We also keep track of the value for char_sub_-1 as we
2444 * loop through the line. Context may be needed to make a
2445 * determination, and if so, this can save having to
2446 * recalculate it */
85e5f08b
KW
2447 WB_enum previous = WB_UNKNOWN;
2448 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2449 reghop3((U8*)s,
2450 -1,
2451 (U8*)(reginfo->strbeg)),
2452 (U8*) reginfo->strend);
2453 while (s < strend) {
85e5f08b 2454 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea 2455 (U8*) reginfo->strend);
00e3344b
KW
2456 if ((to_complement ^ isWB(previous,
2457 before,
2458 after,
2459 (U8*) reginfo->strbeg,
2460 (U8*) s,
2461 (U8*) reginfo->strend,
2462 utf8_target))
2463 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2464 {
00e3344b 2465 goto got_it;
ae3bb8ea 2466 }
43a7bd62
KW
2467 previous = before;
2468 before = after;
ae3bb8ea
KW
2469 s += UTF8SKIP(s);
2470 }
2471 }
2472 else { /* Not utf8. */
85e5f08b
KW
2473 WB_enum previous = WB_UNKNOWN;
2474 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2475 while (s < strend) {
85e5f08b 2476 WB_enum after = getWB_VAL_CP((U8) *s);
00e3344b
KW
2477 if ((to_complement ^ isWB(previous,
2478 before,
2479 after,
2480 (U8*) reginfo->strbeg,
2481 (U8*) s,
2482 (U8*) reginfo->strend,
2483 utf8_target))
2484 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2485 {
00e3344b 2486 goto got_it;
ae3bb8ea 2487 }
43a7bd62
KW
2488 previous = before;
2489 before = after;
ae3bb8ea
KW
2490 s++;
2491 }
2492 }
2493
67481c39 2494 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2495 goto got_it;
2496 }
64935bc6 2497 }
73104a1b 2498 break;
64935bc6 2499
73104a1b
KW
2500 case LNBREAK:
2501 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2502 is_LNBREAK_latin1_safe(s, strend)
2503 );
2504 break;
3018b823 2505
aff4cafe
KW
2506 case ASCII:
2507 s = find_next_ascii(s, strend, utf8_target);
2508 if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
2509 goto got_it;
2510 }
2511
2512 break;
2513
2514 case NASCII:
2515 s = find_next_non_ascii(s, strend, utf8_target);
2516 if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
2517 goto got_it;
2518 }
2519
2520 break;
2521
3018b823
KW
2522 /* The argument to all the POSIX node types is the class number to pass to
2523 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2524
2525 case NPOSIXL:
2526 to_complement = 1;
2527 /* FALLTHROUGH */
2528
2529 case POSIXL:
780fcc9f 2530 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
2531 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2532 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2533 break;
3018b823
KW
2534
2535 case NPOSIXD:
2536 to_complement = 1;
2537 /* FALLTHROUGH */
2538
2539 case POSIXD:
2540 if (utf8_target) {
2541 goto posix_utf8;
2542 }
2543 goto posixa;
2544
2545 case NPOSIXA:
2546 if (utf8_target) {
2547 /* The complement of something that matches only ASCII matches all
837226c8 2548 * non-ASCII, plus everything in ASCII that isn't in the class. */
7a207065 2549 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3018b823
KW
2550 || ! _generic_isCC_A(*s, FLAGS(c)));
2551 break;
2552 }
2553
2554 to_complement = 1;
4a6c6db5 2555 goto posixa;
3018b823 2556
73104a1b
KW
2557 case POSIXA:
2558 /* Don't need to worry about utf8, as it can match only a single
4a6c6db5
KW
2559 * byte invariant character. But we do anyway for performance reasons,
2560 * as otherwise we would have to examine all the continuation
2561 * characters */
2562 if (utf8_target) {
2563 REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
2564 break;
2565 }
2566
2567 posixa:
3018b823
KW
2568 REXEC_FBC_CLASS_SCAN(
2569 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2570 break;
3018b823
KW
2571
2572 case NPOSIXU:
2573 to_complement = 1;
2574 /* FALLTHROUGH */
2575
2576 case POSIXU:
2577 if (! utf8_target) {
2578 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2579 FLAGS(c))));
2580 }
2581 else {
2582
c52b8b12 2583 posix_utf8:
3018b823
KW
2584 classnum = (_char_class_number) FLAGS(c);
2585 if (classnum < _FIRST_NON_SWASH_CC) {
2586 while (s < strend) {
2587
2588 /* We avoid loading in the swash as long as possible, but
2589 * should we have to, we jump to a separate loop. This
2590 * extra 'if' statement is what keeps this code from being
2591 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2592 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2593 goto found_above_latin1;
2594 }
2595 if ((UTF8_IS_INVARIANT(*s)
2596 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2597 classnum)))
042d9e50 2598 || ( UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
3018b823 2599 && to_complement ^ cBOOL(
a62b247b 2600 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
94bb8c36 2601 *(s + 1)),
3018b823
KW
2602 classnum))))
2603 {
02d5137b 2604 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
2605 goto got_it;
2606 else {
2607 tmp = doevery;
2608 }
2609 }
2610 else {
2611 tmp = 1;
2612 }
2613 s += UTF8SKIP(s);
2614 }
2615 }
2616 else switch (classnum) { /* These classes are implemented as
2617 macros */
779cf272 2618 case _CC_ENUM_SPACE:
3018b823 2619 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2620 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3018b823
KW
2621 break;
2622
2623 case _CC_ENUM_BLANK:
2624 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2625 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3018b823
KW
2626 break;
2627
2628 case _CC_ENUM_XDIGIT:
2629 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2630 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3018b823
KW
2631 break;
2632
2633 case _CC_ENUM_VERTSPACE:
2634 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2635 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3018b823
KW
2636 break;
2637
2638 case _CC_ENUM_CNTRL:
2639 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2640 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3018b823
KW
2641 break;
2642
2643 default:
2644 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
e5964223 2645 NOT_REACHED; /* NOTREACHED */
3018b823
KW
2646 }
2647 }
2648 break;
2649
2650 found_above_latin1: /* Here we have to load a swash to get the result
2651 for the current code point */
2652 if (! PL_utf8_swash_ptrs[classnum]) {
2653 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2654 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2655 _core_swash_init("utf8",
2656 "",
2657 &PL_sv_undef, 1, 0,
2658 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2659 }
2660
2661 /* This is a copy of the loop above for swash classes, though using the
2662 * FBC macro instead of being expanded out. Since we've loaded the
2663 * swash, we don't have to check for that each time through the loop */
2664 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2665 to_complement ^ cBOOL(_generic_utf8_safe(
3018b823
KW
2666 classnum,
2667 s,
7a207065 2668 strend,
3018b823
KW
2669 swash_fetch(PL_utf8_swash_ptrs[classnum],
2670 (U8 *) s, TRUE))));
73104a1b
KW
2671 break;
2672
2673 case AHOCORASICKC:
2674 case AHOCORASICK:
2675 {
2676 DECL_TRIE_TYPE(c);
2677 /* what trie are we using right now */
2678 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2679 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2680 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2681
2682 const char *last_start = strend - trie->minlen;
6148ee25 2683#ifdef DEBUGGING
73104a1b 2684 const char *real_start = s;
6148ee25 2685#endif
73104a1b
KW
2686 STRLEN maxlen = trie->maxlen;
2687 SV *sv_points;
2688 U8 **points; /* map of where we were in the input string
2689 when reading a given char. For ASCII this
2690 is unnecessary overhead as the relationship
2691 is always 1:1, but for Unicode, especially
2692 case folded Unicode this is not true. */
2693 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2694 U8 *bitmap=NULL;
2695
2696
2697 GET_RE_DEBUG_FLAGS_DECL;
2698
2699 /* We can't just allocate points here. We need to wrap it in
2700 * an SV so it gets freed properly if there is a croak while
2701 * running the match */
2702 ENTER;
2703 SAVETMPS;
2704 sv_points=newSV(maxlen * sizeof(U8 *));
2705 SvCUR_set(sv_points,
2706 maxlen * sizeof(U8 *));
2707 SvPOK_on(sv_points);
2708 sv_2mortal(sv_points);
2709 points=(U8**)SvPV_nolen(sv_points );
2710 if ( trie_type != trie_utf8_fold
2711 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2712 {
2713 if (trie->bitmap)
2714 bitmap=(U8*)trie->bitmap;
2715 else
2716 bitmap=(U8*)ANYOF_BITMAP(c);
2717 }
2718 /* this is the Aho-Corasick algorithm modified a touch
2719 to include special handling for long "unknown char" sequences.
2720 The basic idea being that we use AC as long as we are dealing
2721 with a possible matching char, when we encounter an unknown char
2722 (and we have not encountered an accepting state) we scan forward
2723 until we find a legal starting char.
2724 AC matching is basically that of trie matching, except that when
2725 we encounter a failing transition, we fall back to the current
2726 states "fail state", and try the current char again, a process
2727 we repeat until we reach the root state, state 1, or a legal
2728 transition. If we fail on the root state then we can either
2729 terminate if we have reached an accepting state previously, or
2730 restart the entire process from the beginning if we have not.
2731
2732 */
2733 while (s <= last_start) {
2734 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2735 U8 *uc = (U8*)s;
2736 U16 charid = 0;
2737 U32 base = 1;
2738 U32 state = 1;
2739 UV uvc = 0;
2740 STRLEN len = 0;
2741 STRLEN foldlen = 0;
2742 U8 *uscan = (U8*)NULL;
2743 U8 *leftmost = NULL;
2744#ifdef DEBUGGING
2745 U32 accepted_word= 0;
786e8c11 2746#endif
73104a1b
KW
2747 U32 pointpos = 0;
2748
2749 while ( state && uc <= (U8*)strend ) {
2750 int failed=0;
2751 U32 word = aho->states[ state ].wordnum;
2752
2753 if( state==1 ) {
2754 if ( bitmap ) {
2755 DEBUG_TRIE_EXECUTE_r(
2756 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2757 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2758 (char *)uc, utf8_target, 0 );
6ad9a8ab 2759 Perl_re_printf( aTHX_
73104a1b
KW
2760 " Scanning for legal start char...\n");
2761 }
2762 );
2763 if (utf8_target) {
2764 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2765 uc += UTF8SKIP(uc);
2766 }
2767 } else {
2768 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2769 uc++;
2770 }
786e8c11 2771 }
73104a1b 2772 s= (char *)uc;
07be1b83 2773 }
73104a1b
KW
2774 if (uc >(U8*)last_start) break;
2775 }
2776
2777 if ( word ) {
2778 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2779 if (!leftmost || lpos < leftmost) {
2780 DEBUG_r(accepted_word=word);
2781 leftmost= lpos;
7016d6eb 2782 }
73104a1b 2783 if (base==0) break;
7016d6eb 2784
73104a1b
KW
2785 }
2786 points[pointpos++ % maxlen]= uc;
2787 if (foldlen || uc < (U8*)strend) {
2788 REXEC_TRIE_READ_CHAR(trie_type, trie,
2789 widecharmap, uc,
2790 uscan, len, uvc, charid, foldlen,
2791 foldbuf, uniflags);
2792 DEBUG_TRIE_EXECUTE_r({
2793 dump_exec_pos( (char *)uc, c, strend,
cb41e5d6 2794 real_start, s, utf8_target, 0);
6ad9a8ab 2795 Perl_re_printf( aTHX_
147e3846 2796 " Charid:%3u CP:%4" UVxf " ",
73104a1b
KW
2797 charid, uvc);
2798 });
2799 }
2800 else {
2801 len = 0;
2802 charid = 0;
2803 }
07be1b83 2804
73104a1b
KW
2805
2806 do {
6148ee25 2807#ifdef DEBUGGING
73104a1b 2808 word = aho->states[ state ].wordnum;
6148ee25 2809#endif
73104a1b
KW
2810 base = aho->states[ state ].trans.base;
2811
2812 DEBUG_TRIE_EXECUTE_r({
2813 if (failed)
2814 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2815 s, utf8_target, 0 );
6ad9a8ab 2816 Perl_re_printf( aTHX_
147e3846 2817 "%sState: %4" UVxf ", word=%" UVxf,
73104a1b
KW
2818 failed ? " Fail transition to " : "",
2819 (UV)state, (UV)word);
2820 });
2821 if ( base ) {
2822 U32 tmp;
2823 I32 offset;
2824 if (charid &&
2825 ( ((offset = base + charid
2826 - 1 - trie->uniquecharcount)) >= 0)
2827 && ((U32)offset < trie->lasttrans)
2828 && trie->trans[offset].check == state
2829 && (tmp=trie->trans[offset].next))
2830 {
2831 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2832 Perl_re_printf( aTHX_ " - legal\n"));
73104a1b
KW
2833 state = tmp;
2834 break;
07be1b83
YO
2835 }
2836 else {
786e8c11 2837 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2838 Perl_re_printf( aTHX_ " - fail\n"));
786e8c11 2839 failed = 1;
73104a1b 2840 state = aho->fail[state];
07be1b83 2841 }
07be1b83 2842 }
73104a1b
KW
2843 else {
2844 /* we must be accepting here */
2845 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2846 Perl_re_printf( aTHX_ " - accepting\n"));
73104a1b
KW
2847 failed = 1;
2848 break;
786e8c11 2849 }
73104a1b
KW
2850 } while(state);
2851 uc += len;
2852 if (failed) {
2853 if (leftmost)
2854 break;
2855 if (!state) state = 1;
07be1b83 2856 }
73104a1b
KW
2857 }
2858 if ( aho->states[ state ].wordnum ) {
2859 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2860 if (!leftmost || lpos < leftmost) {
2861 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2862 leftmost = lpos;
07be1b83
YO
2863 }
2864 }
73104a1b
KW
2865 if (leftmost) {
2866 s = (char*)leftmost;
2867 DEBUG_TRIE_EXECUTE_r({
147e3846 2868 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
73104a1b
KW
2869 (UV)accepted_word, (IV)(s - real_start)
2870 );
2871 });
02d5137b 2872 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2873 FREETMPS;
2874 LEAVE;
2875 goto got_it;
2876 }
2877 s = HOPc(s,1);
2878 DEBUG_TRIE_EXECUTE_r({
6ad9a8ab 2879 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
73104a1b
KW
2880 });
2881 } else {
2882 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2883 Perl_re_printf( aTHX_ "No match.\n"));
73104a1b
KW
2884 break;
2885 }
2886 }
2887 FREETMPS;
2888 LEAVE;
2889 }
2890 break;
2891 default:
2892 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
2893 }
2894 return 0;
2895 got_it:
2896 return s;
6eb5f6b9
JH
2897}
2898
60165aa4
DM
2899/* set RX_SAVED_COPY, RX_SUBBEG etc.
2900 * flags have same meanings as with regexec_flags() */
2901
749f4950
DM
2902static void
2903S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2904 char *strbeg,
2905 char *strend,
2906 SV *sv,
2907 U32 flags,
2908 bool utf8_target)
2909{
2910 struct regexp *const prog = ReANY(rx);
2911
60165aa4
DM
2912 if (flags & REXEC_COPY_STR) {
2913#ifdef PERL_ANY_COW
2914 if (SvCANCOW(sv)) {
eb8fc9fe 2915 DEBUG_C(Perl_re_printf( aTHX_
60165aa4 2916 "Copy on write: regexp capture, type %d\n",
eb8fc9fe 2917 (int) SvTYPE(sv)));
5411a0e5
DM
2918 /* Create a new COW SV to share the match string and store
2919 * in saved_copy, unless the current COW SV in saved_copy
2920 * is valid and suitable for our purpose */
2921 if (( prog->saved_copy
2922 && SvIsCOW(prog->saved_copy)
2923 && SvPOKp(prog->saved_copy)
2924 && SvIsCOW(sv)
2925 && SvPOKp(sv)
2926 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2927 {
5411a0e5
DM
2928 /* just reuse saved_copy SV */
2929 if (RXp_MATCH_COPIED(prog)) {
2930 Safefree(prog->subbeg);
2931 RXp_MATCH_COPIED_off(prog);
2932 }
2933 }
2934 else {
2935 /* create new COW SV to share string */
196a02af 2936 RXp_MATCH_COPY_FREE(prog);
a76b0e90 2937 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2938 }
5411a0e5
DM
2939 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2940 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2941 prog->sublen = strend - strbeg;
2942 prog->suboffset = 0;
2943 prog->subcoffset = 0;
2944 } else
2945#endif
2946 {
99a90e59
FC
2947 SSize_t min = 0;
2948 SSize_t max = strend - strbeg;
ea3daa5d 2949 SSize_t sublen;
60165aa4
DM
2950
2951 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2952 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2953 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2954 ) { /* don't copy $' part of string */
2955 U32 n = 0;
2956 max = -1;
2957 /* calculate the right-most part of the string covered
f67a5002 2958 * by a capture. Due to lookahead, this may be to
60165aa4
DM
2959 * the right of $&, so we have to scan all captures */
2960 while (n <= prog->lastparen) {
2961 if (prog->offs[n].end > max)
2962 max = prog->offs[n].end;
2963 n++;
2964 }
2965 if (max == -1)
2966 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2967 ? prog->offs[0].start
2968 : 0;
2969 assert(max >= 0 && max <= strend - strbeg);
2970 }
2971
2972 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2973 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2974 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2975 ) { /* don't copy $` part of string */
2976 U32 n = 0;
2977 min = max;
2978 /* calculate the left-most part of the string covered
f67a5002 2979 * by a capture. Due to lookbehind, this may be to
60165aa4
DM
2980 * the left of $&, so we have to scan all captures */
2981 while (min && n <= prog->lastparen) {
2982 if ( prog->offs[n].start != -1
2983 && prog->offs[n].start < min)
2984 {
2985 min = prog->offs[n].start;
2986 }
2987 n++;
2988 }
2989 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2990 && min > prog->offs[0].end
2991 )
2992 min = prog->offs[0].end;
2993
2994 }
2995
2996 assert(min >= 0 && min <= max && min <= strend - strbeg);
2997 sublen = max - min;
2998
196a02af 2999 if (RXp_MATCH_COPIED(prog)) {
60165aa4
DM
3000 if (sublen > prog->sublen)
3001 prog->subbeg =
3002 (char*)saferealloc(prog->subbeg, sublen+1);
3003 }
3004 else
3005 prog->subbeg = (char*)safemalloc(sublen+1);
3006 Copy(strbeg + min, prog->subbeg, sublen, char);
3007 prog->subbeg[sublen] = '\0';
3008 prog->suboffset = min;
3009 prog->sublen = sublen;
196a02af 3010 RXp_MATCH_COPIED_on(prog);
60165aa4
DM
3011 }
3012 prog->subcoffset = prog->suboffset;
3013 if (prog->suboffset && utf8_target) {
3014 /* Convert byte offset to chars.
3015 * XXX ideally should only compute this if @-/@+
3016 * has been seen, a la PL_sawampersand ??? */
3017
3018 /* If there's a direct correspondence between the
3019 * string which we're matching and the original SV,
3020 * then we can use the utf8 len cache associated with
3021 * the SV. In particular, it means that under //g,
3022 * sv_pos_b2u() will use the previously cached
3023 * position to speed up working out the new length of
3024 * subcoffset, rather than counting from the start of
3025 * the string each time. This stops
3026 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3027 * from going quadratic */
3028 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
3029 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3030 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
3031 else
3032 prog->subcoffset = utf8_length((U8*)strbeg,
3033 (U8*)(strbeg+prog->suboffset));
3034 }
3035 }
3036 else {
196a02af 3037 RXp_MATCH_COPY_FREE(prog);
60165aa4
DM
3038 prog->subbeg = strbeg;
3039 prog->suboffset = 0;
3040 prog->subcoffset = 0;
3041 prog->sublen = strend - strbeg;
3042 }
3043}
3044
3045
3046
fae667d5 3047
6eb5f6b9
JH
3048/*
3049 - regexec_flags - match a regexp against a string
3050 */
3051I32
5aaab254 3052Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 3053 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
3054/* stringarg: the point in the string at which to begin matching */
3055/* strend: pointer to null at end of string */
3056/* strbeg: real beginning of string */
3057/* minend: end of match must be >= minend bytes after stringarg. */
3058/* sv: SV being matched: only used for utf8 flag, pos() etc; string
3059 * itself is accessed via the pointers above */
3060/* data: May be used for some additional optimizations.
d058ec57 3061 Currently unused. */
a340edde 3062/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 3063
6eb5f6b9 3064{
8d919b0a 3065 struct regexp *const prog = ReANY(rx);
5aaab254 3066 char *s;
eb578fdb 3067 regnode *c;
03c83e26 3068 char *startpos;
ea3daa5d
FC
3069 SSize_t minlen; /* must match at least this many chars */
3070 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 3071 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 3072 I32 multiline;
f8fc2ecf 3073 RXi_GET_DECL(prog,progi);
02d5137b
DM
3074 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3075 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 3076 regexp_paren_pair *swap = NULL;
006f26b2 3077 I32 oldsave;
a3621e74
YO
3078 GET_RE_DEBUG_FLAGS_DECL;
3079
7918f24d 3080 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 3081 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
3082
3083 /* Be paranoid... */
3dc78631 3084 if (prog == NULL) {
6eb5f6b9 3085 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
3086 }
3087
6c3fea77 3088 DEBUG_EXECUTE_r(
03c83e26 3089 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
3090 "Matching");
3091 );
8adc0f72 3092
b342a604
DM
3093 startpos = stringarg;
3094
4cf1a867
DM
3095 /* set these early as they may be used by the HOP macros below */
3096 reginfo->strbeg = strbeg;
3097 reginfo->strend = strend;
3098 reginfo->is_utf8_target = cBOOL(utf8_target);
3099
58430ea8 3100 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
3101 MAGIC *mg;
3102
fef7148b
DM
3103 /* set reginfo->ganch, the position where \G can match */
3104
3105 reginfo->ganch =
3106 (flags & REXEC_IGNOREPOS)
3107 ? stringarg /* use start pos rather than pos() */
3dc78631 3108 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
3109 /* Defined pos(): */
3110 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
3111 : strbeg; /* pos() not defined; use start of string */
3112
6ad9a8ab 3113 DEBUG_GPOS_r(Perl_re_printf( aTHX_
147e3846 3114 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 3115
03c83e26
DM
3116 /* in the presence of \G, we may need to start looking earlier in
3117 * the string than the suggested start point of stringarg:
0b2c2a84 3118 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
3119 * offset, such as
3120 * /..\G/: gofs = 2
3121 * /ab|c\G/: gofs = 1
3122 * or if the minimum offset isn't known, then we have to go back
3123 * to the start of the string, e.g. /w+\G/
3124 */
2bfbe302 3125
8e1490ee 3126 if (prog->intflags & PREGf_ANCH_GPOS) {
4cf1a867
DM
3127 if (prog->gofs) {
3128 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3129 if (!startpos ||
3130 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3131 {
6ad9a8ab 3132 DEBUG_r(Perl_re_printf( aTHX_
4cf1a867
DM
3133 "fail: ganch-gofs before earliest possible start\n"));
3134 return 0;
3135 }
2bfbe302 3136 }
4cf1a867
DM
3137 else
3138 startpos = reginfo->ganch;
2bfbe302
DM
3139 }
3140 else if (prog->gofs) {
4cf1a867
DM
3141 startpos = HOPBACKc(startpos, prog->gofs);
3142 if (!startpos)
b342a604 3143 startpos = strbeg;
03c83e26 3144 }
58430ea8 3145 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 3146 startpos = strbeg;
03c83e26
DM
3147 }
3148
3149 minlen = prog->minlen;
b342a604 3150 if ((startpos + minlen) > strend || startpos < strbeg) {
6ad9a8ab 3151 DEBUG_r(Perl_re_printf( aTHX_
03c83e26
DM
3152 "Regex match can't succeed, so not even tried\n"));
3153 return 0;
3154 }
3155
63a3746a
DM
3156 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3157 * which will call destuctors to reset PL_regmatch_state, free higher
3158 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3159 * regmatch_info_aux_eval */
3160
3161 oldsave = PL_savestack_ix;
3162
dfa77d06
DM
3163 s = startpos;
3164
e322109a 3165 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
3166 && !(flags & REXEC_CHECKED))
3167 {
dfa77d06 3168 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 3169 flags, NULL);
dfa77d06 3170 if (!s)
7fadf4a7
DM
3171 return 0;
3172
e322109a 3173 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
3174 /* we can match based purely on the result of INTUIT.
3175 * Set up captures etc just for $& and $-[0]
3176 * (an intuit-only match wont have $1,$2,..) */
3177 assert(!prog->nparens);
d5e7783a
DM
3178
3179 /* s/// doesn't like it if $& is earlier than where we asked it to
3180 * start searching (which can happen on something like /.\G/) */
3181 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3182 && (s < stringarg))
3183 {
3184 /* this should only be possible under \G */
58430ea8 3185 assert(prog->intflags & PREGf_GPOS_SEEN);
6ad9a8ab 3186 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d5e7783a
DM
3187 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3188 goto phooey;
3189 }
3190
7fadf4a7
DM
3191 /* match via INTUIT shouldn't have any captures.
3192 * Let @-, @+, $^N know */
3193 prog->lastparen = prog->lastcloseparen = 0;
196a02af 3194 RXp_MATCH_UTF8_set(prog, utf8_target);
3ff69bd6
DM
3195 prog->offs[0].start = s - strbeg;
3196 prog->offs[0].end = utf8_target
3197 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3198 : s - strbeg + prog->minlenret;
7fadf4a7 3199 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3200 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
3201 strbeg, strend,
3202 sv, flags, utf8_target);
3203
7fadf4a7
DM
3204 return 1;
3205 }
3206 }
3207
6c3fea77 3208 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 3209
dfa77d06 3210 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
6ad9a8ab 3211 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a72c7584
JH
3212 "String too short [regexec_flags]...\n"));
3213 goto phooey;
1aa99e6b 3214 }
1de06328 3215
6eb5f6b9 3216 /* Check validity of program. */
f8fc2ecf 3217 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
3218 Perl_croak(aTHX_ "corrupted regexp program");
3219 }
3220
196a02af
DM
3221 RXp_MATCH_TAINTED_off(prog);
3222 RXp_MATCH_UTF8_set(prog, utf8_target);
1738e041 3223
6c3fea77
DM
3224 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3225 reginfo->intuit = 0;
02d5137b
DM
3226 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3227 reginfo->warned = FALSE;
02d5137b 3228 reginfo->sv = sv;
1cb48e53 3229 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
6eb5f6b9 3230 /* see how far we have to get to not match where we matched before */
fe3974be 3231 reginfo->till = stringarg + minend;
6eb5f6b9 3232
60779a30 3233 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
3234 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3235 S_cleanup_regmatch_info_aux has executed (registered by
3236 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3237 magic belonging to this SV.
3238 Not newSVsv, either, as it does not COW.
3239 */
3240 reginfo->sv = newSV(0);
4cba5ac0 3241 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
3242 SAVEFREESV(reginfo->sv);
3243 }
3244
331b2dcc
DM
3245 /* reserve next 2 or 3 slots in PL_regmatch_state:
3246 * slot N+0: may currently be in use: skip it
3247 * slot N+1: use for regmatch_info_aux struct
3248 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3249 * slot N+3: ready for use by regmatch()
3250 */
bf2039a9 3251
331b2dcc
DM
3252 {
3253 regmatch_state *old_regmatch_state;
3254 regmatch_slab *old_regmatch_slab;
3255 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3256
3257 /* on first ever match, allocate first slab */
3258 if (!PL_regmatch_slab) {
3259 Newx(PL_regmatch_slab, 1, regmatch_slab);
3260 PL_regmatch_slab->prev = NULL;
3261 PL_regmatch_slab->next = NULL;
3262 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3263 }
bf2039a9 3264
331b2dcc
DM
3265 old_regmatch_state = PL_regmatch_state;
3266 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 3267
331b2dcc
DM
3268 for (i=0; i <= max; i++) {
3269 if (i == 1)
3270 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3271 else if (i ==2)
3272 reginfo->info_aux_eval =
3273 reginfo->info_aux->info_aux_eval =
3274 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3275
331b2dcc
DM
3276 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3277 PL_regmatch_state = S_push_slab(aTHX);
3278 }
bf2039a9 3279
331b2dcc
DM
3280 /* note initial PL_regmatch_state position; at end of match we'll
3281 * pop back to there and free any higher slabs */
bf2039a9 3282
331b2dcc
DM
3283 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3284 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3285 reginfo->info_aux->poscache = NULL;
bf2039a9 3286
331b2dcc 3287 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 3288
331b2dcc
DM
3289 if ((prog->extflags & RXf_EVAL_SEEN))
3290 S_setup_eval_state(aTHX_ reginfo);
3291 else
3292 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 3293 }
d3aa529c 3294
6eb5f6b9 3295 /* If there is a "must appear" string, look for it. */
6eb5f6b9 3296
288b8c02 3297 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
3298 /* We have to be careful. If the previous successful match
3299 was from this regex we don't want a subsequent partially
3300 successful match to clobber the old results.
3301 So when we detect this possibility we add a swap buffer
d8da0584
KW
3302 to the re, and switch the buffer each match. If we fail,
3303 we switch it back; otherwise we leave it swapped.
e9105d30
GG
3304 */
3305 swap = prog->offs;
3306 /* do we need a save destructor here for eval dies? */
3307 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2b1a3689 3308 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 3309 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
2b1a3689
YO
3310 0,
3311 PTR2UV(prog),
495f47a5
DM
3312 PTR2UV(swap),
3313 PTR2UV(prog->offs)
3314 ));
c74340f9 3315 }
6eb5f6b9 3316
ba6840fb
YO
3317 if (prog->recurse_locinput)
3318 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3319
0fa70a06
DM
3320 /* Simplest case: anchored match need be tried only once, or with
3321 * MBOL, only at the beginning of each line.
3322 *
3323 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3324 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3325 * match at the start of the string then it won't match anywhere else
3326 * either; while with /.*.../, if it doesn't match at the beginning,
3327 * the earliest it could match is at the start of the next line */
3328
8e1490ee 3329 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
0fa70a06
DM
3330 char *end;
3331
3332 if (regtry(reginfo, &s))
6eb5f6b9 3333 goto got_it;
0fa70a06
DM
3334
3335 if (!(prog->intflags & PREGf_ANCH_MBOL))
3336 goto phooey;
3337
3338 /* didn't match at start, try at other newline positions */
3339
3340 if (minlen)
3341 dontbother = minlen - 1;
3342 end = HOP3c(strend, -dontbother, strbeg) - 1;
3343
3344 /* skip to next newline */
3345
3346 while (s <= end) { /* note it could be possible to match at the end of the string */
3347 /* NB: newlines are the same in unicode as they are in latin */
3348 if (*s++ != '\n')
3349 continue;
3350 if (prog->check_substr || prog->check_utf8) {
3351 /* note that with PREGf_IMPLICIT, intuit can only fail
3352 * or return the start position, so it's of limited utility.
3353 * Nevertheless, I made the decision that the potential for
3354 * quick fail was still worth it - DAPM */
3355 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3356 if (!s)
3357 goto phooey;
3358 }
3359 if (regtry(reginfo, &s))
3360 goto got_it;
3361 }
3362 goto phooey;
3363 } /* end anchored search */
3364
3365 if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 3366 {
a8430a8b
YO
3367 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3368 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
3369 /* For anchored \G, the only position it can match from is
3370 * (ganch-gofs); we already set startpos to this above; if intuit
3371 * moved us on from there, we can't possibly succeed */
4cf1a867 3372 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
2bfbe302 3373 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
3374 goto got_it;
3375 goto phooey;
3376 }
3377
3378 /* Messy cases: unanchored match. */
bbe252da 3379 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 3380 /* we have /x+whatever/ */
984e6dd1 3381 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 3382 char ch;
bf93d4cc
GS
3383#ifdef DEBUGGING
3384 int did_match = 0;
3385#endif
f2ed9b32 3386 if (utf8_target) {
7e0d5ad7
KW
3387 if (! prog->anchored_utf8) {
3388 to_utf8_substr(prog);
3389 }
3390 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 3391 REXEC_FBC_SCAN(
6eb5f6b9 3392 if (*s == ch) {
a3621e74 3393 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3394 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3395 s += UTF8SKIP(s);
3396 while (s < strend && *s == ch)
3397 s += UTF8SKIP(s);
3398 }
4cadc6a9 3399 );
7e0d5ad7 3400
6eb5f6b9
JH
3401 }
3402 else {
7e0d5ad7
KW
3403 if (! prog->anchored_substr) {
3404 if (! to_byte_substr(prog)) {
6b54ddc5 3405 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3406 }
3407 }
3408 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 3409 REXEC_FBC_SCAN(
6eb5f6b9 3410 if (*s == ch) {
a3621e74 3411 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3412 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3413 s++;
3414 while (s < strend && *s == ch)
3415 s++;
3416 }
4cadc6a9 3417 );
6eb5f6b9 3418 }
a3621e74 3419 DEBUG_EXECUTE_r(if (!did_match)
6ad9a8ab 3420 Perl_re_printf( aTHX_
b7953727
JH
3421 "Did not find anchored character...\n")
3422 );
6eb5f6b9 3423 }
a0714e2c
SS
3424 else if (prog->anchored_substr != NULL
3425 || prog->anchored_utf8 != NULL
3426 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
3427 && prog->float_max_offset < strend - s)) {
3428 SV *must;
ea3daa5d
FC
3429 SSize_t back_max;
3430 SSize_t back_min;
33b8afdf 3431 char *last;
6eb5f6b9 3432 char *last1; /* Last position checked before */
bf93d4cc
GS
3433#ifdef DEBUGGING
3434 int did_match = 0;
3435#endif
33b8afdf 3436 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
3437 if (utf8_target) {
3438 if (! prog->anchored_utf8) {
3439 to_utf8_substr(prog);
3440 }
3441 must = prog->anchored_utf8;
3442 }
3443 else {
3444 if (! prog->anchored_substr) {
3445 if (! to_byte_substr(prog)) {
6b54ddc5 3446 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3447 }
3448 }
3449 must = prog->anchored_substr;
3450 }
33b8afdf
JH
3451 back_max = back_min = prog->anchored_offset;
3452 } else {
7e0d5ad7
KW
3453 if (utf8_target) {
3454 if (! prog->float_utf8) {
3455 to_utf8_substr(prog);
3456 }
3457 must = prog->float_utf8;
3458 }
3459 else {
3460 if (! prog->float_substr) {
3461 if (! to_byte_substr(prog)) {
6b54ddc5 3462 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3463 }
3464 }
3465 must = prog->float_substr;
3466 }
33b8afdf
JH
3467 back_max = prog->float_max_offset;
3468 back_min = prog->float_min_offset;
3469 }
1de06328 3470
1de06328
YO
3471 if (back_min<0) {
3472 last = strend;
3473 } else {
3474 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 3475 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
3476 - (SvTAIL(must) != 0) + back_min), strbeg);
3477 }
9d9163fb 3478 if (s > reginfo->strbeg)
6eb5f6b9
JH
3479 last1 = HOPc(s, -1);
3480 else
3481 last1 = s - 1; /* bogus */
3482
a0288114 3483 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 3484 check_substr==must. */
bf05793b 3485 dontbother = 0;
6eb5f6b9
JH
3486 strend = HOPc(strend, -dontbother);
3487 while ( (s <= last) &&
e50d57d4 3488 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 3489 (unsigned char*)strend, must,
c33e64f0 3490 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 3491 DEBUG_EXECUTE_r( did_match = 1 );