This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Actually remove stray ';'
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
95static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
7631e439
KW
99/* Returns a boolean as to whether the input unsigned number is a power of 2
100 * (2**0, 2**1, etc). In other words if it has just a single bit set.
101 * If not, subtracting 1 would leave the uppermost bit set, so the & would
102 * yield non-zero */
103#define isPOWER_OF_2(n) ((n & (n-1)) == 0)
104
7b031478 105#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
6ad9a8ab 106 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
7b031478 107 goto target; \
e1cf74e3
CB
108} STMT_END
109
c74f6de9
KW
110#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
111
a687059c
LW
112#ifndef STATIC
113#define STATIC static
114#endif
115
451c6e0b
KW
116/* Valid only if 'c', the character being looke-up, is an invariant under
117 * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
118 * everything matchable is straight forward in the bitmap */
119#define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
120 ? reginclass(prog,p,c,c+1,u) \
121 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 122
c277df42
IZ
123/*
124 * Forwards.
125 */
126
f2ed9b32 127#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 128
3dab1dad 129#define HOPc(pos,off) \
ba44c216 130 (char *)(reginfo->is_utf8_target \
220db18a 131 ? reghop3((U8*)pos, off, \
9d9163fb 132 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 133 : (U8*)(pos + off))
557f47af 134
bb152a4b
DM
135/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
136#define HOPBACK3(pos, off, lim) \
137 (reginfo->is_utf8_target \
138 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
139 : (pos - off >= lim) \
140 ? (U8*)pos - off \
3dab1dad 141 : NULL)
efb30f32 142
bb152a4b
DM
143#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
144
ba44c216 145#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 146#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 147
557f47af
DM
148/* lim must be +ve. Returns NULL on overshoot */
149#define HOPMAYBE3(pos,off,lim) \
150 (reginfo->is_utf8_target \
151 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
152 : ((U8*)pos + off <= lim) \
153 ? (U8*)pos + off \
154 : NULL)
155
8e9f2289
DM
156/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
157 * off must be >=0; args should be vars rather than expressions */
158#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
159 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
160 : (U8*)((pos + off) > lim ? lim : (pos + off)))
67853908 161#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
8e9f2289 162
2974eaec
DM
163#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
164 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
165 : (U8*)(pos + off))
166#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
167
168#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
169#define NEXTCHR_IS_EOS (nextchr < 0)
170
171#define SET_nextchr \
220db18a 172 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
173
174#define SET_locinput(p) \
175 locinput = (p); \
176 SET_nextchr
177
178
2a16ac92 179#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
180 if (!swash_ptr) { \
181 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 182 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 183 1, 0, invlist, &flags); \
c7304fe2
KW
184 assert(swash_ptr); \
185 } \
186 } STMT_END
187
188/* If in debug mode, we test that a known character properly matches */
189#ifdef DEBUGGING
190# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
191 property_name, \
2a16ac92 192 invlist, \
c7304fe2 193 utf8_char_in_property) \
2a16ac92 194 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
195 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
196#else
197# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
198 property_name, \
2a16ac92 199 invlist, \
c7304fe2 200 utf8_char_in_property) \
2a16ac92 201 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 202#endif
d1eb3177 203
c7304fe2
KW
204#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
205 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
206 "", \
207 PL_XPosix_ptrs[_CC_WORDCHAR], \
0766489e 208 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
c7304fe2 209
c7304fe2 210#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
211/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
212
5f80c4cf 213/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
214/* it would be nice to rework regcomp.sym to generate this stuff. sigh
215 *
216 * NOTE that *nothing* that affects backtracking should be in here, specifically
217 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
218 * node that is in between two EXACT like nodes when ascertaining what the required
219 * "follow" character is. This should probably be moved to regex compile time
220 * although it may be done at run time beause of the REF possibility - more
221 * investigation required. -- demerphq
222*/
baa60164
KW
223#define JUMPABLE(rn) ( \
224 OP(rn) == OPEN || \
24be3102
YO
225 (OP(rn) == CLOSE && \
226 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
baa60164
KW
227 OP(rn) == EVAL || \
228 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
229 OP(rn) == PLUS || OP(rn) == MINMOD || \
230 OP(rn) == KEEPS || \
231 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 232)
ee9b8eae 233#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 234
ee9b8eae
YO
235#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
236
237#if 0
238/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
a4525e78 239 we don't need this definition. XXX These are now out-of-sync*/
ee9b8eae 240#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 241#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
242#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
243
244#else
245/* ... so we use this as its faster. */
a4525e78
KW
246#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
247#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
248#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
249#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
250
251#endif
e2d8ce26 252
a84d97b6
HS
253/*
254 Search for mandatory following text node; for lookahead, the text must
255 follow but for lookbehind (rn->flags != 0) we skip to the next step.
256*/
baa60164 257#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
258 while (JUMPABLE(rn)) { \
259 const OPCODE type = OP(rn); \
260 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 261 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 262 else if (type == PLUS) \
cca55fe3 263 rn = NEXTOPER(rn); \
3dab1dad 264 else if (type == IFMATCH) \
a84d97b6 265 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 266 else rn += NEXT_OFF(rn); \
3dab1dad 267 } \
5f80c4cf 268} STMT_END
74750237 269
006f26b2
DM
270#define SLAB_FIRST(s) (&(s)->states[0])
271#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
272
a75351a1 273static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 274static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 275static regmatch_state * S_push_slab(pTHX);
51371543 276
87c0511b 277#define REGCP_PAREN_ELEMS 3
f067efbf 278#define REGCP_OTHER_ELEMS 3
e0fa7e2b 279#define REGCP_FRAME_ELEMS 1
620d5b66
NC
280/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
281 * are needed for the regexp context stack bookkeeping. */
282
76e3520e 283STATIC CHECKPOINT
21553840 284S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
a0d0e21e 285{
a3b680e6 286 const int retval = PL_savestack_ix;
92da3157
DM
287 const int paren_elems_to_push =
288 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
289 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
290 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 291 I32 p;
40a82448 292 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 293
b93070ed
DM
294 PERL_ARGS_ASSERT_REGCPPUSH;
295
e49a9654 296 if (paren_elems_to_push < 0)
e8a85d26
JH
297 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
298 (int)paren_elems_to_push, (int)maxopenparen,
299 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 300
e0fa7e2b 301 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
147e3846 302 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
5df417d0 303 " out of range (%lu-%ld)",
92da3157
DM
304 total_elems,
305 (unsigned long)maxopenparen,
306 (long)parenfloor);
e0fa7e2b 307
620d5b66 308 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 309
495f47a5 310 DEBUG_BUFFERS_r(
92da3157 311 if ((int)maxopenparen > (int)parenfloor)
2b1a3689 312 Perl_re_exec_indentf( aTHX_
147e3846 313 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
2b1a3689
YO
314 depth,
315 PTR2UV(rex),
495f47a5
DM
316 PTR2UV(rex->offs)
317 );
318 );
92da3157 319 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 320/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
321 SSPUSHIV(rex->offs[p].end);
322 SSPUSHIV(rex->offs[p].start);
1ca2007e 323 SSPUSHINT(rex->offs[p].start_tmp);
2b1a3689 324 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 325 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
2b1a3689
YO
326 depth,
327 (UV)p,
495f47a5
DM
328 (IV)rex->offs[p].start,
329 (IV)rex->offs[p].start_tmp,
330 (IV)rex->offs[p].end
40a82448 331 ));
a0d0e21e 332 }
b1ce53c5 333/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 334 SSPUSHINT(maxopenparen);
b93070ed
DM
335 SSPUSHINT(rex->lastparen);
336 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 337 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 338
a0d0e21e
LW
339 return retval;
340}
341
c277df42 342/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
343#define REGCP_SET(cp) \
344 DEBUG_STATE_r( \
147e3846
KW
345 Perl_re_exec_indentf( aTHX_ \
346 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
cb41e5d6
YO
347 depth, (IV)PL_savestack_ix \
348 ) \
349 ); \
ab3bbdeb 350 cp = PL_savestack_ix
c3464db5 351
ab3bbdeb 352#define REGCP_UNWIND(cp) \
e4f74956 353 DEBUG_STATE_r( \
cb41e5d6 354 if (cp != PL_savestack_ix) \
147e3846
KW
355 Perl_re_exec_indentf( aTHX_ \
356 "Clearing an EVAL scope, savestack=%" \
357 IVdf "..%" IVdf "\n", \
cb41e5d6
YO
358 depth, (IV)(cp), (IV)PL_savestack_ix \
359 ) \
360 ); \
ab3bbdeb 361 regcpblow(cp)
c277df42 362
a8d1f4b4
DM
363#define UNWIND_PAREN(lp, lcp) \
364 for (n = rex->lastparen; n > lp; n--) \
365 rex->offs[n].end = -1; \
366 rex->lastparen = n; \
367 rex->lastcloseparen = lcp;
368
369
f067efbf 370STATIC void
21553840 371S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
a0d0e21e 372{
e0fa7e2b 373 UV i;
87c0511b 374 U32 paren;
a3621e74
YO
375 GET_RE_DEBUG_FLAGS_DECL;
376
7918f24d
NC
377 PERL_ARGS_ASSERT_REGCPPOP;
378
b1ce53c5 379 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 380 i = SSPOPUV;
e0fa7e2b
NC
381 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
382 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
383 rex->lastcloseparen = SSPOPINT;
384 rex->lastparen = SSPOPINT;
92da3157 385 *maxopenparen_p = SSPOPINT;
b1ce53c5 386
620d5b66 387 i -= REGCP_OTHER_ELEMS;
b1ce53c5 388 /* Now restore the parentheses context. */
495f47a5
DM
389 DEBUG_BUFFERS_r(
390 if (i || rex->lastparen + 1 <= rex->nparens)
2b1a3689 391 Perl_re_exec_indentf( aTHX_
147e3846 392 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
2b1a3689
YO
393 depth,
394 PTR2UV(rex),
495f47a5
DM
395 PTR2UV(rex->offs)
396 );
397 );
92da3157 398 paren = *maxopenparen_p;
620d5b66 399 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 400 SSize_t tmps;
1ca2007e 401 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
402 rex->offs[paren].start = SSPOPIV;
403 tmps = SSPOPIV;
b93070ed
DM
404 if (paren <= rex->lastparen)
405 rex->offs[paren].end = tmps;
2b1a3689 406 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 407 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
2b1a3689
YO
408 depth,
409 (UV)paren,
495f47a5
DM
410 (IV)rex->offs[paren].start,
411 (IV)rex->offs[paren].start_tmp,
412 (IV)rex->offs[paren].end,
413 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 414 );
87c0511b 415 paren--;
a0d0e21e 416 }
daf18116 417#if 1
dafc8851
JH
418 /* It would seem that the similar code in regtry()
419 * already takes care of this, and in fact it is in
420 * a better location to since this code can #if 0-ed out
421 * but the code in regtry() is needed or otherwise tests
422 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
423 * (as of patchlevel 7877) will fail. Then again,
424 * this code seems to be necessary or otherwise
225593e1
DM
425 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
426 * --jhi updated by dapm */
b93070ed 427 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 428 if (i > *maxopenparen_p)
b93070ed
DM
429 rex->offs[i].start = -1;
430 rex->offs[i].end = -1;
2b1a3689 431 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 432 " \\%" UVuf ": %s ..-1 undeffing\n",
2b1a3689
YO
433 depth,
434 (UV)i,
92da3157 435 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 436 ));
a0d0e21e 437 }
dafc8851 438#endif
a0d0e21e
LW
439}
440
74088413
DM
441/* restore the parens and associated vars at savestack position ix,
442 * but without popping the stack */
443
444STATIC void
21553840 445S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
74088413
DM
446{
447 I32 tmpix = PL_savestack_ix;
85882954
YO
448 PERL_ARGS_ASSERT_REGCP_RESTORE;
449
74088413 450 PL_savestack_ix = ix;
21553840 451 regcppop(rex, maxopenparen_p);
74088413
DM
452 PL_savestack_ix = tmpix;
453}
454
02db2b7b 455#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 456
9637d2a5
CB
457#ifndef PERL_IN_XSUB_RE
458
24e16d7b
KW
459bool
460Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
31c7f561
KW
461{
462 /* Returns a boolean as to whether or not 'character' is a member of the
463 * Posix character class given by 'classnum' that should be equivalent to a
464 * value in the typedef '_char_class_number'.
465 *
466 * Ideally this could be replaced by a just an array of function pointers
467 * to the C library functions that implement the macros this calls.
468 * However, to compile, the precise function signatures are required, and
469 * these may vary from platform to to platform. To avoid having to figure
470 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
471 * which adds an extra layer of function call overhead (unless the C
472 * optimizer strips it away). But we don't particularly care about
473 * performance with locales anyway. */
31c7f561
KW
474
475 switch ((_char_class_number) classnum) {
15861f94 476 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 477 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
478 case _CC_ENUM_ASCII: return isASCII_LC(character);
479 case _CC_ENUM_BLANK: return isBLANK_LC(character);
cee69f79 480 case _CC_ENUM_CASED: return isLOWER_LC(character)
b0d691b2 481 || isUPPER_LC(character);
e8d596e0 482 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
483 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
484 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
485 case _CC_ENUM_LOWER: return isLOWER_LC(character);
486 case _CC_ENUM_PRINT: return isPRINT_LC(character);
487 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 488 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
489 case _CC_ENUM_UPPER: return isUPPER_LC(character);
490 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 491 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
492 default: /* VERTSPACE should never occur in locales */
493 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
494 }
495
e5964223 496 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
497 return FALSE;
498}
499
9637d2a5
CB
500#endif
501
3018b823
KW
502STATIC bool
503S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
504{
505 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
506 * 'character' is a member of the Posix character class given by 'classnum'
507 * that should be equivalent to a value in the typedef
508 * '_char_class_number'.
509 *
510 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 511 * the range 0-255. Outside that range, all characters use Unicode
3018b823
KW
512 * rules, ignoring any locale. So use the Unicode function if this class
513 * requires a swash, and use the Unicode macro otherwise. */
514
515 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
516
517 if (UTF8_IS_INVARIANT(*character)) {
518 return isFOO_lc(classnum, *character);
519 }
520 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
521 return isFOO_lc(classnum,
a62b247b 522 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
523 }
524
613abc6d
KW
525 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
526
3018b823
KW
527 if (classnum < _FIRST_NON_SWASH_CC) {
528
529 /* Initialize the swash unless done already */
530 if (! PL_utf8_swash_ptrs[classnum]) {
531 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
532 PL_utf8_swash_ptrs[classnum] =
533 _core_swash_init("utf8",
534 "",
535 &PL_sv_undef, 1, 0,
536 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
537 }
538
92a2046b
KW
539 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
540 character,
541 TRUE /* is UTF */ ));
3018b823
KW
542 }
543
544 switch ((_char_class_number) classnum) {
779cf272 545 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
546 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
547 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
548 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 549 default: break;
3018b823
KW
550 }
551
e1ee3960 552 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
553}
554
aff4cafe
KW
555STATIC char *
556S_find_next_ascii(char * s, const char * send, const bool utf8_target)
557{
558 /* Returns the position of the first ASCII byte in the sequence between 's'
559 * and 'send-1' inclusive; returns 'send' if none found */
560
561 PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
562
563#ifndef EBCDIC
564
565 if ((STRLEN) (send - s) >= PERL_WORDSIZE
566
567 /* This term is wordsize if subword; 0 if not */
568 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
569
570 /* 'offset' */
571 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
572 {
573
574 /* Process per-byte until reach word boundary. XXX This loop could be
575 * eliminated if we knew that this platform had fast unaligned reads */
576 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
577 if (isASCII(*s)) {
578 return s;
579 }
580 s++; /* khw didn't bother creating a separate loop for
581 utf8_target */
582 }
583
584 /* Here, we know we have at least one full word to process. Process
585 * per-word as long as we have at least a full word left */
586 do {
587 if ((* (PERL_UINTMAX_T *) s) & ~ PERL_VARIANTS_WORD_MASK) {
588 break;
589 }
590 s += PERL_WORDSIZE;
591 } while (s + PERL_WORDSIZE <= send);
592 }
593
594#endif
595
596 /* Process per-character */
597 if (utf8_target) {
598 while (s < send) {
599 if (isASCII(*s)) {
600 return s;
601 }
602 s += UTF8SKIP(s);
603 }
604 }
605 else {
606 while (s < send) {
607 if (isASCII(*s)) {
608 return s;
609 }
610 s++;
611 }
612 }
613
614 return s;
615}
616
617STATIC char *
618S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
619{
620 /* Returns the position of the first non-ASCII byte in the sequence between
621 * 's' and 'send-1' inclusive; returns 'send' if none found */
622
623#ifdef EBCDIC
624
625 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
626
627 if (utf8_target) {
628 while (s < send) {
629 if ( ! isASCII(*s)) {
630 return s;
631 }
632 s += UTF8SKIP(s);
633 }
634 }
635 else {
636 while (s < send) {
637 if ( ! isASCII(*s)) {
638 return s;
639 }
640 s++;
641 }
642 }
643
644 return s;
645
646#else
647
648 const U8 * next_non_ascii = NULL;
649
650 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
651 PERL_UNUSED_ARG(utf8_target);
652
653 /* On ASCII platforms invariants and ASCII are identical, so if the string
654 * is entirely invariants, there is no non-ASCII character */
655 return (is_utf8_invariant_string_loc((U8 *) s,
656 (STRLEN) (send - s),
657 &next_non_ascii))
658 ? (char *) send
659 : (char *) next_non_ascii;
660
661#endif
662
663}
664
a687059c 665/*
e50aee73 666 * pregexec and friends
a687059c
LW
667 */
668
76234dfb 669#ifndef PERL_IN_XSUB_RE
a687059c 670/*
c277df42 671 - pregexec - match a regexp against a string
a687059c 672 */
c277df42 673I32
5aaab254 674Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 675 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
676/* stringarg: the point in the string at which to begin matching */
677/* strend: pointer to null at end of string */
678/* strbeg: real beginning of string */
679/* minend: end of match must be >= minend bytes after stringarg. */
680/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
681 * itself is accessed via the pointers above */
682/* nosave: For optimizations. */
c277df42 683{
7918f24d
NC
684 PERL_ARGS_ASSERT_PREGEXEC;
685
c277df42 686 return
9041c2e3 687 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
688 nosave ? 0 : REXEC_COPY_STR);
689}
76234dfb 690#endif
22e551b9 691
cad2e5aa 692
6eb5f6b9 693
1a4edc3c
DM
694/* re_intuit_start():
695 *
696 * Based on some optimiser hints, try to find the earliest position in the
697 * string where the regex could match.
698 *
699 * rx: the regex to match against
700 * sv: the SV being matched: only used for utf8 flag; the string
701 * itself is accessed via the pointers below. Note that on
702 * something like an overloaded SV, SvPOK(sv) may be false
703 * and the string pointers may point to something unrelated to
704 * the SV itself.
705 * strbeg: real beginning of string
706 * strpos: the point in the string at which to begin matching
707 * strend: pointer to the byte following the last char of the string
708 * flags currently unused; set to 0
709 * data: currently unused; set to NULL
710 *
711 * The basic idea of re_intuit_start() is to use some known information
712 * about the pattern, namely:
713 *
714 * a) the longest known anchored substring (i.e. one that's at a
715 * constant offset from the beginning of the pattern; but not
716 * necessarily at a fixed offset from the beginning of the
717 * string);
718 * b) the longest floating substring (i.e. one that's not at a constant
719 * offset from the beginning of the pattern);
720 * c) Whether the pattern is anchored to the string; either
721 * an absolute anchor: /^../, or anchored to \n: /^.../m,
722 * or anchored to pos(): /\G/;
723 * d) A start class: a real or synthetic character class which
724 * represents which characters are legal at the start of the pattern;
725 *
726 * to either quickly reject the match, or to find the earliest position
727 * within the string at which the pattern might match, thus avoiding
728 * running the full NFA engine at those earlier locations, only to
729 * eventually fail and retry further along.
730 *
731 * Returns NULL if the pattern can't match, or returns the address within
732 * the string which is the earliest place the match could occur.
733 *
734 * The longest of the anchored and floating substrings is called 'check'
735 * and is checked first. The other is called 'other' and is checked
736 * second. The 'other' substring may not be present. For example,
737 *
738 * /(abc|xyz)ABC\d{0,3}DEFG/
739 *
740 * will have
741 *
742 * check substr (float) = "DEFG", offset 6..9 chars
743 * other substr (anchored) = "ABC", offset 3..3 chars
744 * stclass = [ax]
745 *
746 * Be aware that during the course of this function, sometimes 'anchored'
747 * refers to a substring being anchored relative to the start of the
748 * pattern, and sometimes to the pattern itself being anchored relative to
749 * the string. For example:
750 *
751 * /\dabc/: "abc" is anchored to the pattern;
752 * /^\dabc/: "abc" is anchored to the pattern and the string;
753 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
754 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
755 * but the pattern is anchored to the string.
52a21eb3
DM
756 */
757
cad2e5aa 758char *
52a21eb3
DM
759Perl_re_intuit_start(pTHX_
760 REGEXP * const rx,
761 SV *sv,
762 const char * const strbeg,
763 char *strpos,
764 char *strend,
765 const U32 flags,
766 re_scream_pos_data *data)
cad2e5aa 767{
8d919b0a 768 struct regexp *const prog = ReANY(rx);
b2ad2123
DM
769 SSize_t start_shift = prog->check_offset_min;
770 /* Should be nonnegative! */
771 SSize_t end_shift = 0;
0fc004dd
DM
772 /* current lowest pos in string where the regex can start matching */
773 char *rx_origin = strpos;
eb578fdb 774 SV *check;
f2ed9b32 775 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 776 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 777 bool ml_anch = 0;
8f4bf5fc 778 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 779 char *check_at = NULL; /* check substr found at this pos */
bbe252da 780 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 781 RXi_GET_DECL(prog,progi);
02d5137b
DM
782 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
783 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
784 GET_RE_DEBUG_FLAGS_DECL;
785
7918f24d 786 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
787 PERL_UNUSED_ARG(flags);
788 PERL_UNUSED_ARG(data);
7918f24d 789
6ad9a8ab 790 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0
DM
791 "Intuit: trying to determine minimum start position...\n"));
792
fb9bbddb 793 /* for now, assume that all substr offsets are positive. If at some point
f67a5002 794 * in the future someone wants to do clever things with lookbehind and
fb9bbddb
DM
795 * -ve offsets, they'll need to fix up any code in this function
796 * which uses these offsets. See the thread beginning
797 * <20140113145929.GF27210@iabyn.com>
798 */
799 assert(prog->substrs->data[0].min_offset >= 0);
800 assert(prog->substrs->data[0].max_offset >= 0);
801 assert(prog->substrs->data[1].min_offset >= 0);
802 assert(prog->substrs->data[1].max_offset >= 0);
803 assert(prog->substrs->data[2].min_offset >= 0);
804 assert(prog->substrs->data[2].max_offset >= 0);
805
f7022b5a 806 /* for now, assume that if both present, that the floating substring
83f2232d 807 * doesn't start before the anchored substring.
f7022b5a
DM
808 * If you break this assumption (e.g. doing better optimisations
809 * with lookahead/behind), then you'll need to audit the code in this
810 * function carefully first
811 */
812 assert(
813 ! ( (prog->anchored_utf8 || prog->anchored_substr)
814 && (prog->float_utf8 || prog->float_substr))
815 || (prog->float_min_offset >= prog->anchored_offset));
816
1a4edc3c
DM
817 /* byte rather than char calculation for efficiency. It fails
818 * to quickly reject some cases that can't match, but will reject
819 * them later after doing full char arithmetic */
c344f387 820 if (prog->minlen > strend - strpos) {
6ad9a8ab 821 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 822 " String too short...\n"));
cad2e5aa 823 goto fail;
2c2d71f5 824 }
d8da0584 825
196a02af 826 RXp_MATCH_UTF8_set(prog, utf8_target);
6c3fea77 827 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 828 reginfo->info_aux = NULL;
9d9163fb 829 reginfo->strbeg = strbeg;
220db18a 830 reginfo->strend = strend;
aed7b151 831 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 832 reginfo->intuit = 1;
1cb48e53
DM
833 /* not actually used within intuit, but zero for safety anyway */
834 reginfo->poscache_maxiter = 0;
02d5137b 835
f2ed9b32 836 if (utf8_target) {
2814f4b3
HS
837 if ((!prog->anchored_utf8 && prog->anchored_substr)
838 || (!prog->float_utf8 && prog->float_substr))
33b8afdf
JH
839 to_utf8_substr(prog);
840 check = prog->check_utf8;
841 } else {
7e0d5ad7
KW
842 if (!prog->check_substr && prog->check_utf8) {
843 if (! to_byte_substr(prog)) {
6b54ddc5 844 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
845 }
846 }
33b8afdf
JH
847 check = prog->check_substr;
848 }
274cd312 849
1dc475d0
DM
850 /* dump the various substring data */
851 DEBUG_OPTIMISE_MORE_r({
852 int i;
853 for (i=0; i<=2; i++) {
854 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
855 : prog->substrs->data[i].substr);
856 if (!sv)
857 continue;
858
6ad9a8ab 859 Perl_re_printf( aTHX_
147e3846
KW
860 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
861 " useful=%" IVdf " utf8=%d [%s]\n",
1dc475d0
DM
862 i,
863 (IV)prog->substrs->data[i].min_offset,
864 (IV)prog->substrs->data[i].max_offset,
865 (IV)prog->substrs->data[i].end_shift,
866 BmUSEFUL(sv),
867 utf8_target ? 1 : 0,
868 SvPEEK(sv));
869 }
870 });
871
8e1490ee 872 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
873
874 /* ml_anch: check after \n?
875 *
0fa70a06 876 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
877 * with /.*.../, these flags will have been added by the
878 * compiler:
879 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
880 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
881 */
7d2d37f5
DM
882 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
883 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 884
343c8a29 885 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
886 /* we are only allowed to match at BOS or \G */
887
57fcbfa7 888 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
889 *
890 * Note that we don't try to do a similar quick reject for
891 * \G, since generally the caller will have calculated strpos
892 * based on pos() and gofs, so the string is already correctly
893 * anchored by definition; and handling the exceptions would
894 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 895 */
7bb3b9eb 896 if ( strpos != strbeg
d3d47aac 897 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8 898 {
6ad9a8ab 899 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 900 " Not at start...\n"));
c889ccc8
DM
901 goto fail;
902 }
903
a5d12a4b
DM
904 /* in the presence of an anchor, the anchored (relative to the
905 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
906 * to strpos. So quickly reject if substr isn't found there.
907 * This works for \G too, because the caller will already have
908 * subtracted gofs from pos, and gofs is the offset from the
909 * \G to the start of the regex. For example, in /.abc\Gdef/,
910 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
911 * caller will have set strpos=pos()-4; we look for the substr
912 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 913
33c28ab2 914 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 915 /* Substring at constant offset from beg-of-str... */
b2ad2123 916 SSize_t slen = SvCUR(check);
343c8a29 917 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 918
6ad9a8ab 919 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 920 " Looking for check substr at fixed offset %" IVdf "...\n",
1dc475d0
DM
921 (IV)prog->check_offset_min));
922
7742aa66
DM
923 if (SvTAIL(check)) {
924 /* In this case, the regex is anchored at the end too.
925 * Unless it's a multiline match, the lengths must match
b2ad2123 926 * exactly, give or take a \n. NB: slen >= 1 since
7742aa66
DM
927 * the last char of check is \n */
928 if (!multiline
b2ad2123
DM
929 && ( strend - s > slen
930 || strend - s < slen - 1
931 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8 932 {
6ad9a8ab 933 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 934 " String too long...\n"));
c889ccc8
DM
935 goto fail_finish;
936 }
b2ad2123
DM
937 /* Now should match s[0..slen-2] */
938 slen--;
c889ccc8 939 }
b2ad2123 940 if (slen && (strend - s < slen
26fb2318 941 || *SvPVX_const(check) != *s
b2ad2123 942 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
d307bf57 943 {
6ad9a8ab 944 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 945 " String not equal...\n"));
d307bf57
DM
946 goto fail_finish;
947 }
c889ccc8
DM
948
949 check_at = s;
950 goto success_at_start;
cad2e5aa 951 }
cad2e5aa 952 }
cad2e5aa 953 }
0fc004dd 954
b2ad2123 955 end_shift = prog->check_end_shift;
cad2e5aa 956
19188028 957#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
b2ad2123 958 if (end_shift < 0)
147e3846 959 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
b2ad2123 960 (IV)end_shift, RX_PRECOMP(rx));
2c2d71f5
JH
961#endif
962
2c2d71f5 963 restart:
1de06328 964
66b7ec5c
DM
965 /* This is the (re)entry point of the main loop in this function.
966 * The goal of this loop is to:
967 * 1) find the "check" substring in the region rx_origin..strend
b2ad2123 968 * (adjusted by start_shift / end_shift). If not found, reject
66b7ec5c
DM
969 * immediately.
970 * 2) If it exists, look for the "other" substr too if defined; for
971 * example, if the check substr maps to the anchored substr, then
972 * check the floating substr, and vice-versa. If not found, go
973 * back to (1) with rx_origin suitably incremented.
974 * 3) If we find an rx_origin position that doesn't contradict
975 * either of the substrings, then check the possible additional
976 * constraints on rx_origin of /^.../m or a known start class.
977 * If these fail, then depending on which constraints fail, jump
978 * back to here, or to various other re-entry points further along
979 * that skip some of the first steps.
980 * 4) If we pass all those tests, update the BmUSEFUL() count on the
981 * substring. If the start position was determined to be at the
982 * beginning of the string - so, not rejected, but not optimised,
983 * since we have to run regmatch from position 0 - decrement the
984 * BmUSEFUL() count. Otherwise increment it.
985 */
986
1a4edc3c
DM
987
988 /* first, look for the 'check' substring */
989
1de06328 990 {
c33e64f0
FC
991 U8* start_point;
992 U8* end_point;
c889ccc8 993
c889ccc8 994 DEBUG_OPTIMISE_MORE_r({
6ad9a8ab 995 Perl_re_printf( aTHX_
147e3846
KW
996 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
997 " Start shift: %" IVdf " End shift %" IVdf
998 " Real end Shift: %" IVdf "\n",
675e93ee 999 (IV)(rx_origin - strbeg),
c889ccc8 1000 (IV)prog->check_offset_min,
b2ad2123
DM
1001 (IV)start_shift,
1002 (IV)end_shift,
c889ccc8
DM
1003 (IV)prog->check_end_shift);
1004 });
1de06328 1005
b2ad2123 1006 end_point = HOPBACK3(strend, end_shift, rx_origin);
bb152a4b
DM
1007 if (!end_point)
1008 goto fail_finish;
b2ad2123 1009 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
33c28ab2
DM
1010 if (!start_point)
1011 goto fail_finish;
c889ccc8 1012
557f47af 1013
e0362b86 1014 /* If the regex is absolutely anchored to either the start of the
d3d47aac 1015 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
1016 * check_offset_max represents an upper bound on the string where
1017 * the substr could start. For the ANCH_GPOS case, we assume that
1018 * the caller of intuit will have already set strpos to
1019 * pos()-gofs, so in this case strpos + offset_max will still be
1020 * an upper bound on the substr.
1021 */
c19c836a
DM
1022 if (!ml_anch
1023 && prog->intflags & PREGf_ANCH
e0362b86 1024 && prog->check_offset_max != SSize_t_MAX)
c19c836a 1025 {
b2ad2123 1026 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
1027 const char * const anchor =
1028 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
b2ad2123 1029 SSize_t targ_len = (char*)end_point - anchor;
2ce94a86 1030
b2ad2123 1031 if (check_len > targ_len) {
2ce94a86
DM
1032 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033 "Anchored string too short...\n"));
1034 goto fail_finish;
1035 }
e0362b86
DM
1036
1037 /* do a bytes rather than chars comparison. It's conservative;
1038 * so it skips doing the HOP if the result can't possibly end
1039 * up earlier than the old value of end_point.
1040 */
b2ad2123
DM
1041 assert(anchor + check_len <= (char *)end_point);
1042 if (prog->check_offset_max + check_len < targ_len) {
e0362b86
DM
1043 end_point = HOP3lim((U8*)anchor,
1044 prog->check_offset_max,
b2ad2123 1045 end_point - check_len
2ce94a86 1046 )
b2ad2123 1047 + check_len;
e0362b86 1048 }
d6ef1678
DM
1049 }
1050
ae5d4331 1051 check_at = fbm_instr( start_point, end_point,
7fba1cd6 1052 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 1053
6ad9a8ab 1054 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1055 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
675e93ee
DM
1056 (IV)((char*)start_point - strbeg),
1057 (IV)((char*)end_point - strbeg),
1058 (IV)(check_at ? check_at - strbeg : -1)
1059 ));
1060
8fd34720
DM
1061 /* Update the count-of-usability, remove useless subpatterns,
1062 unshift s. */
1063
1064 DEBUG_EXECUTE_r({
1065 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1066 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
6ad9a8ab 1067 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
8fd34720
DM
1068 (check_at ? "Found" : "Did not find"),
1069 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1070 ? "anchored" : "floating"),
1071 quoted,
1072 RE_SV_TAIL(check),
1073 (check_at ? " at offset " : "...\n") );
1074 });
2c2d71f5 1075
8fd34720
DM
1076 if (!check_at)
1077 goto fail_finish;
8fd34720
DM
1078 /* set rx_origin to the minimum position where the regex could start
1079 * matching, given the constraint of the just-matched check substring.
1080 * But don't set it lower than previously.
1081 */
fdc003fd 1082
8fd34720
DM
1083 if (check_at - rx_origin > prog->check_offset_max)
1084 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee 1085 /* Finish the diagnostic message */
6ad9a8ab 1086 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1087 "%ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1088 (long)(check_at - strbeg),
1089 (IV)(rx_origin - strbeg)
1090 ));
8fd34720 1091 }
fdc003fd
DM
1092
1093
1a4edc3c 1094 /* now look for the 'other' substring if defined */
2c2d71f5 1095
6480a6c4
DM
1096 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1097 : prog->substrs->data[other_ix].substr)
1de06328 1098 {
30944b6d 1099 /* Take into account the "other" substring. */
6c3343a6
DM
1100 char *last, *last1;
1101 char *s;
1102 SV* must;
1103 struct reg_substr_datum *other;
1104
1105 do_other_substr:
1106 other = &prog->substrs->data[other_ix];
1107
1108 /* if "other" is anchored:
1109 * we've previously found a floating substr starting at check_at.
1110 * This means that the regex origin must lie somewhere
1111 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1112 * and max: HOP3(check_at, -check_offset_min)
1113 * (except that min will be >= strpos)
1114 * So the fixed substr must lie somewhere between
1115 * HOP3(min, anchored_offset)
1116 * HOP3(max, anchored_offset) + SvCUR(substr)
1117 */
1118
1119 /* if "other" is floating
1120 * Calculate last1, the absolute latest point where the
1121 * floating substr could start in the string, ignoring any
1122 * constraints from the earlier fixed match. It is calculated
1123 * as follows:
1124 *
1125 * strend - prog->minlen (in chars) is the absolute latest
1126 * position within the string where the origin of the regex
1127 * could appear. The latest start point for the floating
1128 * substr is float_min_offset(*) on from the start of the
1129 * regex. last1 simply combines thee two offsets.
1130 *
1131 * (*) You might think the latest start point should be
1132 * float_max_offset from the regex origin, and technically
1133 * you'd be correct. However, consider
1134 * /a\d{2,4}bcd\w/
1135 * Here, float min, max are 3,5 and minlen is 7.
1136 * This can match either
1137 * /a\d\dbcd\w/
1138 * /a\d\d\dbcd\w/
1139 * /a\d\d\d\dbcd\w/
1140 * In the first case, the regex matches minlen chars; in the
1141 * second, minlen+1, in the third, minlen+2.
1142 * In the first case, the floating offset is 3 (which equals
1143 * float_min), in the second, 4, and in the third, 5 (which
1144 * equals float_max). In all cases, the floating string bcd
1145 * can never start more than 4 chars from the end of the
1146 * string, which equals minlen - float_min. As the substring
1147 * starts to match more than float_min from the start of the
1148 * regex, it makes the regex match more than minlen chars,
1149 * and the two cancel each other out. So we can always use
1150 * float_min - minlen, rather than float_max - minlen for the
1151 * latest position in the string.
1152 *
1153 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1154 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1155 */
1156
e7a14a9c 1157 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1158 last1 = HOP3c(strend,
1159 other->min_offset - prog->minlen, strbeg);
1160
4d006249 1161 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1162 /* last is the latest point where the floating substr could
1163 * start, *given* any constraints from the earlier fixed
1164 * match. This constraint is that the floating string starts
1165 * <= float_max_offset chars from the regex origin (rx_origin).
1166 * If this value is less than last1, use it instead.
eb3831ce 1167 */
6c3343a6
DM
1168 assert(rx_origin <= last1);
1169 last =
1170 /* this condition handles the offset==infinity case, and
1171 * is a short-cut otherwise. Although it's comparing a
1172 * byte offset to a char length, it does so in a safe way,
1173 * since 1 char always occupies 1 or more bytes,
1174 * so if a string range is (last1 - rx_origin) bytes,
1175 * it will be less than or equal to (last1 - rx_origin)
1176 * chars; meaning it errs towards doing the accurate HOP3
1177 * rather than just using last1 as a short-cut */
1178 (last1 - rx_origin) < other->max_offset
1179 ? last1
1180 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1181 }
1182 else {
b2ad2123
DM
1183 assert(strpos + start_shift <= check_at);
1184 last = HOP4c(check_at, other->min_offset - start_shift,
6c3343a6
DM
1185 strbeg, strend);
1186 }
ead917d0 1187
6c3343a6
DM
1188 s = HOP3c(rx_origin, other->min_offset, strend);
1189 if (s < other_last) /* These positions already checked */
1190 s = other_last;
1191
1192 must = utf8_target ? other->utf8_substr : other->substr;
1193 assert(SvPOK(must));
675e93ee
DM
1194 {
1195 char *from = s;
1196 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1197
71a9d105
DM
1198 if (to > strend)
1199 to = strend;
88203927
DM
1200 if (from > to) {
1201 s = NULL;
6ad9a8ab 1202 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1203 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
88203927
DM
1204 (IV)(from - strbeg),
1205 (IV)(to - strbeg)
1206 ));
1207 }
1208 else {
1209 s = fbm_instr(
1210 (unsigned char*)from,
1211 (unsigned char*)to,
1212 must,
1213 multiline ? FBMrf_MULTILINE : 0
1214 );
6ad9a8ab 1215 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1216 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
88203927
DM
1217 (IV)(from - strbeg),
1218 (IV)(to - strbeg),
1219 (IV)(s ? s - strbeg : -1)
1220 ));
1221 }
675e93ee
DM
1222 }
1223
6c3343a6
DM
1224 DEBUG_EXECUTE_r({
1225 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1226 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 1227 Perl_re_printf( aTHX_ " %s %s substr %s%s",
6c3343a6
DM
1228 s ? "Found" : "Contradicts",
1229 other_ix ? "floating" : "anchored",
1230 quoted, RE_SV_TAIL(must));
1231 });
ead917d0 1232
ead917d0 1233
6c3343a6
DM
1234 if (!s) {
1235 /* last1 is latest possible substr location. If we didn't
1236 * find it before there, we never will */
1237 if (last >= last1) {
6ad9a8ab 1238 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee 1239 "; giving up...\n"));
6c3343a6 1240 goto fail_finish;
ead917d0
DM
1241 }
1242
6c3343a6
DM
1243 /* try to find the check substr again at a later
1244 * position. Maybe next time we'll find the "other" substr
1245 * in range too */
6c3343a6
DM
1246 other_last = HOP3c(last, 1, strend) /* highest failure */;
1247 rx_origin =
4d006249 1248 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1249 ? HOP3c(rx_origin, 1, strend)
1250 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
6ad9a8ab 1251 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1252 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1253 (other_ix ? "floating" : "anchored"),
1254 (long)(HOP3c(check_at, 1, strend) - strbeg),
1255 (IV)(rx_origin - strbeg)
1256 ));
6c3343a6
DM
1257 goto restart;
1258 }
1259 else {
4d006249 1260 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1261 /* other_last is set to s, not s+1, since its possible for
1262 * a floating substr to fail first time, then succeed
1263 * second time at the same floating position; e.g.:
1264 * "-AB--AABZ" =~ /\wAB\d*Z/
1265 * The first time round, anchored and float match at
1266 * "-(AB)--AAB(Z)" then fail on the initial \w character
1267 * class. Second time round, they match at "-AB--A(AB)(Z)".
1268 */
1269 other_last = s;
ead917d0
DM
1270 }
1271 else {
6c3343a6
DM
1272 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1273 other_last = HOP3c(s, 1, strend);
ead917d0 1274 }
6ad9a8ab 1275 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1276 " at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1277 (long)(s - strbeg),
1278 (IV)(rx_origin - strbeg)
1279 ));
1280
6c3343a6 1281 }
cad2e5aa 1282 }
acba93e8
DM
1283 else {
1284 DEBUG_OPTIMISE_MORE_r(
6ad9a8ab 1285 Perl_re_printf( aTHX_
147e3846
KW
1286 " Check-only match: offset min:%" IVdf " max:%" IVdf
1287 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1288 " strend:%" IVdf "\n",
acba93e8
DM
1289 (IV)prog->check_offset_min,
1290 (IV)prog->check_offset_max,
675e93ee
DM
1291 (IV)(check_at-strbeg),
1292 (IV)(rx_origin-strbeg),
1c1c599d 1293 (IV)(rx_origin-check_at),
675e93ee 1294 (IV)(strend-strbeg)
acba93e8
DM
1295 )
1296 );
1297 }
2c2d71f5 1298
acba93e8 1299 postprocess_substr_matches:
0991020e 1300
1a4edc3c 1301 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1302
7d2d37f5 1303 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1304 char *s;
1305
6ad9a8ab 1306 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a62659bd 1307 " looking for /^/m anchor"));
d0880ea7
DM
1308
1309 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1310 * Find the next \n, if any, even if it's beyond the current
1311 * anchored and/or floating substrings. Whether we should be
1312 * scanning ahead for the next \n or the next substr is debatable.
1313 * On the one hand you'd expect rare substrings to appear less
1314 * often than \n's. On the other hand, searching for \n means
675e93ee 1315 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1316 * iteration as the current "rarest" string candidate, which
1317 * means for example that we'll quickly reject the whole string if
1318 * hasn't got a \n, rather than trying every substr position
1319 * first
1320 */
d0880ea7 1321
4620cb61
DM
1322 s = HOP3c(strend, - prog->minlen, strpos);
1323 if (s <= rx_origin ||
1324 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1325 {
6ad9a8ab 1326 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d0880ea7
DM
1327 " Did not find /%s^%s/m...\n",
1328 PL_colors[0], PL_colors[1]));
a62659bd
DM
1329 goto fail_finish;
1330 }
d0880ea7 1331
4ada1233
DM
1332 /* earliest possible origin is 1 char after the \n.
1333 * (since *rx_origin == '\n', it's safe to ++ here rather than
1334 * HOP(rx_origin, 1)) */
1335 rx_origin++;
d0880ea7 1336
f4f115de 1337 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1338 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1339 {
d0880ea7
DM
1340 /* Position contradicts check-string; either because
1341 * check was anchored (and thus has no wiggle room),
4ada1233 1342 * or check was float and rx_origin is above the float range */
6ad9a8ab 1343 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1344 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1345 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1346 goto restart;
1347 }
1348
1349 /* if we get here, the check substr must have been float,
2e759faa 1350 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1351 * "other" substr which still contradicts */
1352 assert(prog->substrs->check_ix); /* check is float */
1353
1354 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1355 /* whoops, the anchored "other" substr exists, so we still
1356 * contradict. On the other hand, the float "check" substr
1357 * didn't contradict, so just retry the anchored "other"
1358 * substr */
6ad9a8ab 1359 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1360 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
d0880ea7 1361 PL_colors[0], PL_colors[1],
73e8ff00
DM
1362 (IV)(rx_origin - strbeg + prog->anchored_offset),
1363 (IV)(rx_origin - strbeg)
675e93ee 1364 ));
d0880ea7
DM
1365 goto do_other_substr;
1366 }
1367
1368 /* success: we don't contradict the found floating substring
1369 * (and there's no anchored substr). */
6ad9a8ab 1370 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1371 " Found /%s^%s/m with rx_origin %ld...\n",
1372 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1373 }
1374 else {
6ad9a8ab 1375 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
fe4f3442 1376 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1377 }
1378
ffad1e6a 1379 success_at_start:
e3c6feb0 1380
cad2e5aa 1381
dd170ff5
DM
1382 /* if we have a starting character class, then test that extra constraint.
1383 * (trie stclasses are too expensive to use here, we are better off to
1384 * leave it to regmatch itself) */
1385
f8fc2ecf 1386 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1387 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1388
b2ad2123
DM
1389 /* XXX this value could be pre-computed */
1390 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1391 ? (reginfo->is_utf8_pat
1392 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1393 : STR_LEN(progi->regstclass))
66e933ab 1394 : 1);
1de06328 1395 char * endpos;
fa3bb21d 1396 char *s;
000dfd2d
DM
1397 /* latest pos that a matching float substr constrains rx start to */
1398 char *rx_max_float = NULL;
1399
c75a3985
DM
1400 /* if the current rx_origin is anchored, either by satisfying an
1401 * anchored substring constraint, or a /^.../m constraint, then we
1402 * can reject the current origin if the start class isn't found
1403 * at the current position. If we have a float-only match, then
1404 * rx_origin is constrained to a range; so look for the start class
1405 * in that range. if neither, then look for the start class in the
1406 * whole rest of the string */
1407
dd170ff5
DM
1408 /* XXX DAPM it's not clear what the minlen test is for, and why
1409 * it's not used in the floating case. Nothing in the test suite
1410 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1411 * Here are some old comments, which may or may not be correct:
1412 *
1413 * minlen == 0 is possible if regstclass is \b or \B,
1414 * and the fixed substr is ''$.
1415 * Since minlen is already taken into account, rx_origin+1 is
1416 * before strend; accidentally, minlen >= 1 guaranties no false
1417 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1418 * 0) below assumes that regstclass does not come from lookahead...
1419 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1420 * This leaves EXACTF-ish only, which are dealt with in
1421 * find_byclass().
1422 */
1423
7d2d37f5 1424 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
b2ad2123 1425 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d 1426 else if (prog->float_substr || prog->float_utf8) {
b2ad2123
DM
1427 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1428 endpos = HOP3clim(rx_max_float, cl_l, strend);
000dfd2d 1429 }
1de06328
YO
1430 else
1431 endpos= strend;
1432
6ad9a8ab 1433 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
b2ad2123 1434 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
147e3846 1435 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
b2ad2123 1436 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1437 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1438
c43b5520 1439 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1440 reginfo);
be778b1a 1441 if (!s) {
6eb5f6b9 1442 if (endpos == strend) {
6ad9a8ab 1443 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1444 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1445 goto fail;
1446 }
6ad9a8ab 1447 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1448 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1449 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1450 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1451 goto fail;
9fed8d02 1452
6eb5f6b9 1453 /* Contradict one of substrings */
97136c8a
DM
1454 if (prog->anchored_substr || prog->anchored_utf8) {
1455 if (prog->substrs->check_ix == 1) { /* check is float */
1456 /* Have both, check_string is floating */
b2ad2123
DM
1457 assert(rx_origin + start_shift <= check_at);
1458 if (rx_origin + start_shift != check_at) {
97136c8a 1459 /* not at latest position float substr could match:
c75a3985
DM
1460 * Recheck anchored substring, but not floating.
1461 * The condition above is in bytes rather than
1462 * chars for efficiency. It's conservative, in
1463 * that it errs on the side of doing 'goto
88203927
DM
1464 * do_other_substr'. In this case, at worst,
1465 * an extra anchored search may get done, but in
1466 * practice the extra fbm_instr() is likely to
1467 * get skipped anyway. */
6ad9a8ab 1468 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1469 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1470 (long)(other_last - strbeg),
1471 (IV)(rx_origin - strbeg)
1472 ));
97136c8a 1473 goto do_other_substr;
3369914b 1474 }
3369914b
DM
1475 }
1476 }
97136c8a 1477 else {
9fed8d02
DM
1478 /* float-only */
1479
7d2d37f5 1480 if (ml_anch) {
c75a3985
DM
1481 /* In the presence of ml_anch, we might be able to
1482 * find another \n without breaking the current float
1483 * constraint. */
1484
1485 /* strictly speaking this should be HOP3c(..., 1, ...),
1486 * but since we goto a block of code that's going to
1487 * search for the next \n if any, its safe here */
9fed8d02 1488 rx_origin++;
6ad9a8ab 1489 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
675e93ee 1490 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1491 PL_colors[0], PL_colors[1],
675e93ee 1492 (long)(rx_origin - strbeg)) );
9fed8d02 1493 goto postprocess_substr_matches;
ab60c45a 1494 }
c75a3985
DM
1495
1496 /* strictly speaking this can never be true; but might
1497 * be if we ever allow intuit without substrings */
1498 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1499 goto fail;
c75a3985 1500
000dfd2d 1501 rx_origin = rx_max_float;
9fed8d02
DM
1502 }
1503
c75a3985
DM
1504 /* at this point, any matching substrings have been
1505 * contradicted. Start again... */
1506
9fed8d02 1507 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1508
1509 /* uses bytes rather than char calculations for efficiency.
1510 * It's conservative: it errs on the side of doing 'goto restart',
1511 * where there is code that does a proper char-based test */
b2ad2123 1512 if (rx_origin + start_shift + end_shift > strend) {
6ad9a8ab 1513 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
9fed8d02
DM
1514 " Could not match STCLASS...\n") );
1515 goto fail;
1516 }
6ad9a8ab 1517 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1518 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
9fed8d02 1519 (prog->substrs->check_ix ? "floating" : "anchored"),
b2ad2123 1520 (long)(rx_origin + start_shift - strbeg),
675e93ee
DM
1521 (IV)(rx_origin - strbeg)
1522 ));
9fed8d02 1523 goto restart;
6eb5f6b9 1524 }
9fed8d02 1525
c75a3985
DM
1526 /* Success !!! */
1527
5f9c6575 1528 if (rx_origin != s) {
6ad9a8ab 1529 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1530 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1531 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1532 );
1533 }
1534 else {
6ad9a8ab 1535 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1536 " Does not contradict STCLASS...\n");
b7953727
JH
1537 );
1538 }
6eb5f6b9 1539 }
ffad1e6a
DM
1540
1541 /* Decide whether using the substrings helped */
1542
1543 if (rx_origin != strpos) {
1544 /* Fixed substring is found far enough so that the match
1545 cannot start at strpos. */
1546
6ad9a8ab 1547 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
ffad1e6a
DM
1548 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1549 }
1550 else {
70563e16
DM
1551 /* The found rx_origin position does not prohibit matching at
1552 * strpos, so calling intuit didn't gain us anything. Decrement
1553 * the BmUSEFUL() count on the check substring, and if we reach
1554 * zero, free it. */
1555 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1556 && (utf8_target ? (
1557 prog->check_utf8 /* Could be deleted already */
1558 && --BmUSEFUL(prog->check_utf8) < 0
1559 && (prog->check_utf8 == prog->float_utf8)
1560 ) : (
1561 prog->check_substr /* Could be deleted already */
1562 && --BmUSEFUL(prog->check_substr) < 0
1563 && (prog->check_substr == prog->float_substr)
1564 )))
1565 {
1566 /* If flags & SOMETHING - do not do it many times on the same match */
6ad9a8ab 1567 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
ffad1e6a
DM
1568 /* XXX Does the destruction order has to change with utf8_target? */
1569 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1570 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1571 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1572 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1573 check = NULL; /* abort */
ffad1e6a
DM
1574 /* XXXX This is a remnant of the old implementation. It
1575 looks wasteful, since now INTUIT can use many
1576 other heuristics. */
1577 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1578 }
1579 }
1580
6ad9a8ab 1581 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
ffad1e6a 1582 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1583 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1584
c765d6e0 1585 return rx_origin;
2c2d71f5
JH
1586
1587 fail_finish: /* Substring not found */
33b8afdf 1588 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1589 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1590 fail:
6ad9a8ab 1591 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
e4584336 1592 PL_colors[4], PL_colors[5]));
bd61b366 1593 return NULL;
cad2e5aa 1594}
9661b544 1595
70563e16 1596
a0a388a1 1597#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1598 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78
KW
1599 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1600 trie_utf8l, trie_flu8 } \
e7fd4aa1
KW
1601 trie_type = ((scan->flags == EXACT) \
1602 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1603 : (scan->flags == EXACTL) \
1604 ? (utf8_target ? trie_utf8l : trie_plain) \
1605 : (scan->flags == EXACTFA) \
1606 ? (utf8_target \
1607 ? trie_utf8_exactfa_fold \
1608 : trie_latin_utf8_exactfa_fold) \
1609 : (scan->flags == EXACTFLU8 \
1610 ? trie_flu8 \
1611 : (utf8_target \
1612 ? trie_utf8_fold \
1613 : trie_latin_utf8_fold)))
fab2782b 1614
fd3249ee 1615#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1616STMT_START { \
fab2782b 1617 STRLEN skiplen; \
baa60164 1618 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1619 switch (trie_type) { \
a4525e78 1620 case trie_flu8: \
780fcc9f 1621 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1622 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1623 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1624 } \
a4525e78 1625 goto do_trie_utf8_fold; \
31f05a37 1626 case trie_utf8_exactfa_fold: \
baa60164 1627 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1628 /* FALLTHROUGH */ \
fab2782b 1629 case trie_utf8_fold: \
a4525e78 1630 do_trie_utf8_fold: \
fab2782b 1631 if ( foldlen>0 ) { \
c80e42f3 1632 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1633 foldlen -= len; \
1634 uscan += len; \
1635 len=0; \
1636 } else { \
fab2782b 1637 len = UTF8SKIP(uc); \
a1a5ec35
KW
1638 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen, \
1639 flags); \
5f560d8a 1640 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1641 foldlen -= skiplen; \
1642 uscan = foldbuf + skiplen; \
1643 } \
1644 break; \
baa60164
KW
1645 case trie_latin_utf8_exactfa_fold: \
1646 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1647 /* FALLTHROUGH */ \
fab2782b
YO
1648 case trie_latin_utf8_fold: \
1649 if ( foldlen>0 ) { \
c80e42f3 1650 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1651 foldlen -= len; \
1652 uscan += len; \
1653 len=0; \
1654 } else { \
1655 len = 1; \
31f05a37 1656 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1657 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1658 foldlen -= skiplen; \
1659 uscan = foldbuf + skiplen; \
1660 } \
1661 break; \
a4525e78 1662 case trie_utf8l: \
780fcc9f 1663 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1664 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1665 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1666 } \
780fcc9f 1667 /* FALLTHROUGH */ \
fab2782b 1668 case trie_utf8: \
c80e42f3 1669 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1670 break; \
1671 case trie_plain: \
1672 uvc = (UV)*uc; \
1673 len = 1; \
1674 } \
1675 if (uvc < 256) { \
1676 charid = trie->charmap[ uvc ]; \
1677 } \
1678 else { \
1679 charid = 0; \
1680 if (widecharmap) { \
1681 SV** const svpp = hv_fetch(widecharmap, \
1682 (char*)&uvc, sizeof(UV), 0); \
1683 if (svpp) \
1684 charid = (U16)SvIV(*svpp); \
1685 } \
1686 } \
4cadc6a9
YO
1687} STMT_END
1688
cb41e5d6 1689#define DUMP_EXEC_POS(li,s,doutf8,depth) \
ae7c5b9b 1690 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
cb41e5d6 1691 startpos, doutf8, depth)
ae7c5b9b 1692
c84a03c5 1693#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1694STMT_START { \
1695 while (s <= e) { \
c84a03c5 1696 if ( (COND) \
fac1af77 1697 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1698 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1699 goto got_it; \
1700 s++; \
1701 } \
1702} STMT_END
1703
c84a03c5 1704#define REXEC_FBC_UTF8_SCAN(CODE) \
4cadc6a9 1705STMT_START { \
9a902117 1706 while (s < strend) { \
c84a03c5 1707 CODE \
9a902117 1708 s += UTF8SKIP(s); \
4cadc6a9
YO
1709 } \
1710} STMT_END
1711
c84a03c5 1712#define REXEC_FBC_SCAN(CODE) \
4cadc6a9
YO
1713STMT_START { \
1714 while (s < strend) { \
c84a03c5 1715 CODE \
4cadc6a9
YO
1716 s++; \
1717 } \
1718} STMT_END
1719
05bd126c
KW
1720#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1721REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1722 if (COND) { \
1723 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1724 goto got_it; \
1725 else \
1726 tmp = doevery; \
1727 } \
1728 else \
1729 tmp = 1; \
4cadc6a9
YO
1730)
1731
05bd126c
KW
1732#define REXEC_FBC_CLASS_SCAN(COND) \
1733REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1734 if (COND) { \
1735 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1736 goto got_it; \
1737 else \
1738 tmp = doevery; \
1739 } \
1740 else \
1741 tmp = 1; \
4cadc6a9
YO
1742)
1743
c84a03c5 1744#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1745 if (utf8_target) { \
c84a03c5 1746 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
e1d1eefb
YO
1747 } \
1748 else { \
c84a03c5 1749 REXEC_FBC_CLASS_SCAN(COND); \
d981ef24 1750 }
05bd126c 1751
05bd126c
KW
1752/* The three macros below are slightly different versions of the same logic.
1753 *
1754 * The first is for /a and /aa when the target string is UTF-8. This can only
1755 * match ascii, but it must advance based on UTF-8. The other two handle the
1756 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1757 * for the boundary (or non-boundary) between a word and non-word character.
1758 * The utf8 and non-utf8 cases have the same logic, but the details must be
1759 * different. Find the "wordness" of the character just prior to this one, and
1760 * compare it with the wordness of this one. If they differ, we have a
1761 * boundary. At the beginning of the string, pretend that the previous
1762 * character was a new-line.
1763 *
1764 * All these macros uncleanly have side-effects with each other and outside
1765 * variables. So far it's been too much trouble to clean-up
1766 *
1767 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1768 * a word character or not.
1769 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1770 * word/non-word
1771 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1772 *
1773 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1774 * are looking for a boundary or for a non-boundary. If we are looking for a
1775 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1776 * see if this tentative match actually works, and if so, to quit the loop
1777 * here. And vice-versa if we are looking for a non-boundary.
1778 *
1779 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1780 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1781 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1782 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1783 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1784 * complement. But in that branch we complement tmp, meaning that at the
1785 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1786 * which means at the top of the loop in the next iteration, it is
1787 * TEST_NON_UTF8(s-1) */
b2f4e957 1788#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1789 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1790 tmp = TEST_NON_UTF8(tmp); \
1791 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1792 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1793 tmp = !tmp; \
1794 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1795 } \
1796 else { \
1797 IF_FAIL; \
1798 } \
1799 ); \
1800
1801/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1802 * TEST_UTF8 is a macro that for the same input code points returns identically
1803 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 1804#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1805 if (s == reginfo->strbeg) { \
1806 tmp = '\n'; \
1807 } \
1808 else { /* Back-up to the start of the previous character */ \
1809 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1810 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1811 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1812 } \
1813 tmp = TEST_UV(tmp); \
1814 LOAD_UTF8_CHARCLASS_ALNUM(); \
1815 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
7a207065 1816 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
05bd126c
KW
1817 tmp = !tmp; \
1818 IF_SUCCESS; \
1819 } \
1820 else { \
1821 IF_FAIL; \
1822 } \
1823 );
cfaf538b 1824
05bd126c
KW
1825/* Like the above two macros. UTF8_CODE is the complete code for handling
1826 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1827 * macros below */
baa60164 1828#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1829 if (utf8_target) { \
05bd126c 1830 UTF8_CODE \
63ac0dad
KW
1831 } \
1832 else { /* Not utf8 */ \
9d9163fb 1833 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1834 tmp = TEST_NON_UTF8(tmp); \
05bd126c 1835 REXEC_FBC_SCAN( /* advances s while s < strend */ \
63ac0dad 1836 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1837 IF_SUCCESS; \
760cfa8e 1838 tmp = !tmp; \
63ac0dad
KW
1839 } \
1840 else { \
1841 IF_FAIL; \
1842 } \
1843 ); \
1844 } \
c8519dc7
KW
1845 /* Here, things have been set up by the previous code so that tmp is the \
1846 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1847 * utf8ness of the target). We also have to check if this matches against \
1848 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1849 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1850 * string */ \
1851 if (tmp == ! TEST_NON_UTF8('\n')) { \
1852 IF_SUCCESS; \
1853 } \
1854 else { \
1855 IF_FAIL; \
1856 }
63ac0dad 1857
ae7c5b9b
KW
1858/* This is the macro to use when we want to see if something that looks like it
1859 * could match, actually does, and if so exits the loop */
1860#define REXEC_FBC_TRYIT \
1861 if ((reginfo->intuit || regtry(reginfo, &s))) \
1862 goto got_it
1863
1864/* The only difference between the BOUND and NBOUND cases is that
1865 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1866 * NBOUND. This is accomplished by passing it as either the if or else clause,
1867 * with the other one being empty (PLACEHOLDER is defined as empty).
1868 *
1869 * The TEST_FOO parameters are for operating on different forms of input, but
1870 * all should be ones that return identically for the same underlying code
1871 * points */
1872#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1873 FBC_BOUND_COMMON( \
1874 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1875 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1876
44129e46 1877#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1878 FBC_BOUND_COMMON( \
1879 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1880 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1881
1882#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1883 FBC_BOUND_COMMON( \
1884 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1885 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1886
44129e46 1887#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1888 FBC_BOUND_COMMON( \
1889 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1890 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1891
8bde5eaf
JH
1892#ifdef DEBUGGING
1893static IV
1894S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1895 IV cp_out = Perl__invlist_search(invlist, cp_in);
1896 assert(cp_out >= 0);
1897 return cp_out;
1898}
1899# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1900 invmap[S_get_break_val_cp_checked(invlist, cp)]
1901#else
1902# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1903 invmap[_invlist_search(invlist, cp)]
1904#endif
1905
64935bc6
KW
1906/* Takes a pointer to an inversion list, a pointer to its corresponding
1907 * inversion map, and a code point, and returns the code point's value
1908 * according to the two arrays. It assumes that all code points have a value.
1909 * This is used as the base macro for macros for particular properties */
1910#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
8bde5eaf 1911 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
64935bc6
KW
1912
1913/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1914 * of a code point, returning the value for the first code point in the string.
1915 * And it takes the particular macro name that finds the desired value given a
1916 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1917#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1918 (__ASSERT_(pos < strend) \
1919 /* Note assumes is valid UTF-8 */ \
1920 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1921
1922/* Returns the GCB value for the input code point */
1923#define getGCB_VAL_CP(cp) \
1924 _generic_GET_BREAK_VAL_CP( \
1925 PL_GCB_invlist, \
02f811dd 1926 _Perl_GCB_invmap, \
64935bc6
KW
1927 (cp))
1928
1929/* Returns the GCB value for the first code point in the UTF-8 encoded string
1930 * bounded by pos and strend */
1931#define getGCB_VAL_UTF8(pos, strend) \
1932 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 1933
6b659339
KW
1934/* Returns the LB value for the input code point */
1935#define getLB_VAL_CP(cp) \
1936 _generic_GET_BREAK_VAL_CP( \
1937 PL_LB_invlist, \
1938 _Perl_LB_invmap, \
1939 (cp))
1940
1941/* Returns the LB value for the first code point in the UTF-8 encoded string
1942 * bounded by pos and strend */
1943#define getLB_VAL_UTF8(pos, strend) \
1944 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1945
06ae2722
KW
1946
1947/* Returns the SB value for the input code point */
1948#define getSB_VAL_CP(cp) \
1949 _generic_GET_BREAK_VAL_CP( \
1950 PL_SB_invlist, \
bf4268fa 1951 _Perl_SB_invmap, \
06ae2722
KW
1952 (cp))
1953
1954/* Returns the SB value for the first code point in the UTF-8 encoded string
1955 * bounded by pos and strend */
1956#define getSB_VAL_UTF8(pos, strend) \
1957 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1958
ae3bb8ea
KW
1959/* Returns the WB value for the input code point */
1960#define getWB_VAL_CP(cp) \
1961 _generic_GET_BREAK_VAL_CP( \
1962 PL_WB_invlist, \
bf4268fa 1963 _Perl_WB_invmap, \
ae3bb8ea
KW
1964 (cp))
1965
1966/* Returns the WB value for the first code point in the UTF-8 encoded string
1967 * bounded by pos and strend */
1968#define getWB_VAL_UTF8(pos, strend) \
1969 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1970
786e8c11 1971/* We know what class REx starts with. Try to find this position... */
02d5137b 1972/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1973/* annoyingly all the vars in this routine have different names from their counterparts
1974 in regmatch. /grrr */
3c3eec57 1975STATIC char *
07be1b83 1976S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1977 const char *strend, regmatch_info *reginfo)
a687059c 1978{
73104a1b
KW
1979 dVAR;
1980 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1981 char *pat_string; /* The pattern's exactish string */
1982 char *pat_end; /* ptr to end char of pat_string */
1983 re_fold_t folder; /* Function for computing non-utf8 folds */
1984 const U8 *fold_array; /* array for folding ords < 256 */
1985 STRLEN ln;
1986 STRLEN lnc;
73104a1b
KW
1987 U8 c1;
1988 U8 c2;
1989 char *e;
1990 I32 tmp = 1; /* Scratch variable? */
ba44c216 1991 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1992 UV utf8_fold_flags = 0;
f9176b44 1993 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1994 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1995 with a result inverts that result, as 0^1 =
1996 1 and 1^1 = 0 */
1997 _char_class_number classnum;
1998
73104a1b 1999 RXi_GET_DECL(prog,progi);
2f7f8cb1 2000
73104a1b 2001 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 2002
73104a1b
KW
2003 /* We know what class it must start with. */
2004 switch (OP(c)) {
a4525e78 2005 case ANYOFL:
780fcc9f 2006 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 2007
d1c40ef5 2008 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
a0bd1a30
KW
2009 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2010 }
2011
780fcc9f 2012 /* FALLTHROUGH */
ac44c12e 2013 case ANYOFD:
73104a1b
KW
2014 case ANYOF:
2015 if (utf8_target) {
2016 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 2017 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b 2018 }
1451f692
DM
2019 else if (ANYOF_FLAGS(c)) {
2020 REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2021 }
73104a1b 2022 else {
1451f692 2023 REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
73104a1b
KW
2024 }
2025 break;
73104a1b 2026
098b07d5
KW
2027 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2028 assert(! is_utf8_pat);
924ba076 2029 /* FALLTHROUGH */
73104a1b 2030 case EXACTFA:
984e6dd1 2031 if (is_utf8_pat || utf8_target) {
73104a1b
KW
2032 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2033 goto do_exactf_utf8;
2034 }
2035 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
2036 folder = foldEQ_latin1; /* /a, except the sharp s one which */
2037 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 2038
2fdb7295
KW
2039 case EXACTF: /* This node only generated for non-utf8 patterns */
2040 assert(! is_utf8_pat);
73104a1b 2041 if (utf8_target) {
73104a1b
KW
2042 utf8_fold_flags = 0;
2043 goto do_exactf_utf8;
2044 }
2045 fold_array = PL_fold;
2046 folder = foldEQ;
2047 goto do_exactf_non_utf8;
2048
2049 case EXACTFL:
780fcc9f 2050 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 2051 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 2052 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
2053 goto do_exactf_utf8;
2054 }
2055 fold_array = PL_fold_locale;
2056 folder = foldEQ_locale;
2057 goto do_exactf_non_utf8;
3c760661 2058
73104a1b 2059 case EXACTFU_SS:
984e6dd1 2060 if (is_utf8_pat) {
73104a1b
KW
2061 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2062 }
2063 goto do_exactf_utf8;
16d951b7 2064
a4525e78
KW
2065 case EXACTFLU8:
2066 if (! utf8_target) { /* All code points in this node require
2067 UTF-8 to express. */
2068 break;
2069 }
613abc6d
KW
2070 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2071 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
2072 goto do_exactf_utf8;
2073
73104a1b 2074 case EXACTFU:
984e6dd1
DM
2075 if (is_utf8_pat || utf8_target) {
2076 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
2077 goto do_exactf_utf8;
2078 }
fac1af77 2079
73104a1b
KW
2080 /* Any 'ss' in the pattern should have been replaced by regcomp,
2081 * so we don't have to worry here about this single special case
2082 * in the Latin1 range */
2083 fold_array = PL_fold_latin1;
2084 folder = foldEQ_latin1;
2085
924ba076 2086 /* FALLTHROUGH */
73104a1b 2087
c52b8b12 2088 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
2089 are no glitches with fold-length differences
2090 between the target string and pattern */
2091
2092 /* The idea in the non-utf8 EXACTF* cases is to first find the
2093 * first character of the EXACTF* node and then, if necessary,
2094 * case-insensitively compare the full text of the node. c1 is the
2095 * first character. c2 is its fold. This logic will not work for
2096 * Unicode semantics and the german sharp ss, which hence should
2097 * not be compiled into a node that gets here. */
2098 pat_string = STRING(c);
2099 ln = STR_LEN(c); /* length to match in octets/bytes */
2100
2101 /* We know that we have to match at least 'ln' bytes (which is the
2102 * same as characters, since not utf8). If we have to match 3
2103 * characters, and there are only 2 availabe, we know without
2104 * trying that it will fail; so don't start a match past the
2105 * required minimum number from the far end */
ea3daa5d 2106 e = HOP3c(strend, -((SSize_t)ln), s);
dda01918
HS
2107 if (e < s)
2108 break;
fac1af77 2109
73104a1b
KW
2110 c1 = *pat_string;
2111 c2 = fold_array[c1];
2112 if (c1 == c2) { /* If char and fold are the same */
2113 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
2114 }
2115 else {
2116 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
2117 }
2118 break;
fac1af77 2119
c52b8b12
KW
2120 do_exactf_utf8:
2121 {
73104a1b
KW
2122 unsigned expansion;
2123
2124 /* If one of the operands is in utf8, we can't use the simpler folding
2125 * above, due to the fact that many different characters can have the
2126 * same fold, or portion of a fold, or different- length fold */
2127 pat_string = STRING(c);
2128 ln = STR_LEN(c); /* length to match in octets/bytes */
2129 pat_end = pat_string + ln;
984e6dd1 2130 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
2131 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2132 : ln;
2133
2134 /* We have 'lnc' characters to match in the pattern, but because of
2135 * multi-character folding, each character in the target can match
2136 * up to 3 characters (Unicode guarantees it will never exceed
2137 * this) if it is utf8-encoded; and up to 2 if not (based on the
2138 * fact that the Latin 1 folds are already determined, and the
2139 * only multi-char fold in that range is the sharp-s folding to
2140 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2141 * string character. Adjust lnc accordingly, rounding up, so that
2142 * if we need to match at least 4+1/3 chars, that really is 5. */
2143 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2144 lnc = (lnc + expansion - 1) / expansion;
2145
2146 /* As in the non-UTF8 case, if we have to match 3 characters, and
2147 * only 2 are left, it's guaranteed to fail, so don't start a
2148 * match that would require us to go beyond the end of the string
2149 */
ea3daa5d 2150 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 2151
73104a1b
KW
2152 /* XXX Note that we could recalculate e to stop the loop earlier,
2153 * as the worst case expansion above will rarely be met, and as we
2154 * go along we would usually find that e moves further to the left.
2155 * This would happen only after we reached the point in the loop
2156 * where if there were no expansion we should fail. Unclear if
2157 * worth the expense */
2158
2159 while (s <= e) {
2160 char *my_strend= (char *)strend;
2161 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 2162 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 2163 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
2164 {
2165 goto got_it;
2166 }
2167 s += (utf8_target) ? UTF8SKIP(s) : 1;
2168 }
2169 break;
2170 }
236d82fd 2171
73104a1b 2172 case BOUNDL:
780fcc9f 2173 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2174 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2175 if (! IN_UTF8_CTYPE_LOCALE) {
2176 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2177 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2178 }
64935bc6
KW
2179 goto do_boundu;
2180 }
2181
7a207065 2182 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2183 break;
64935bc6 2184
73104a1b 2185 case NBOUNDL:
780fcc9f 2186 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2187 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2188 if (! IN_UTF8_CTYPE_LOCALE) {
2189 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2190 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2191 }
64935bc6
KW
2192 goto do_nboundu;
2193 }
2194
7a207065 2195 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2196 break;
64935bc6
KW
2197
2198 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2199 meaning */
2200 assert(FLAGS(c) == TRADITIONAL_BOUND);
2201
7a207065 2202 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2203 break;
64935bc6
KW
2204
2205 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2206 meaning */
2207 assert(FLAGS(c) == TRADITIONAL_BOUND);
2208
44129e46 2209 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2210 break;
64935bc6
KW
2211
2212 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2213 meaning */
2214 assert(FLAGS(c) == TRADITIONAL_BOUND);
2215
7a207065 2216 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2217 break;
64935bc6
KW
2218
2219 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2220 meaning */
2221 assert(FLAGS(c) == TRADITIONAL_BOUND);
2222
44129e46 2223 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2224 break;
64935bc6 2225
73104a1b 2226 case NBOUNDU:
64935bc6 2227 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
7a207065 2228 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2229 break;
2230 }
2231
2232 do_nboundu:
2233
2234 to_complement = 1;
2235 /* FALLTHROUGH */
2236
2237 case BOUNDU:
2238 do_boundu:
2239 switch((bound_type) FLAGS(c)) {
2240 case TRADITIONAL_BOUND:
7a207065 2241 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2242 break;
2243 case GCB_BOUND:
a7a8bd1e 2244 if (s == reginfo->strbeg) {
67481c39 2245 if (reginfo->intuit || regtry(reginfo, &s))
64935bc6
KW
2246 {
2247 goto got_it;
2248 }
a7a8bd1e
KW
2249
2250 /* Didn't match. Try at the next position (if there is one) */
64935bc6 2251 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2252 if (UNLIKELY(s >= reginfo->strend)) {
2253 break;
2254 }
64935bc6
KW
2255 }
2256
2257 if (utf8_target) {
85e5f08b 2258 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2259 reghop3((U8*)s, -1,
2260 (U8*)(reginfo->strbeg)),
2261 (U8*) reginfo->strend);
2262 while (s < strend) {
85e5f08b 2263 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6 2264 (U8*) reginfo->strend);
b0e24409
KW
2265 if ( (to_complement ^ isGCB(before,
2266 after,
2267 (U8*) reginfo->strbeg,
2268 (U8*) s,
2269 utf8_target))
00e3344b
KW
2270 && (reginfo->intuit || regtry(reginfo, &s)))
2271 {
2272 goto got_it;
64935bc6 2273 }
43a7bd62 2274 before = after;
64935bc6
KW
2275 s += UTF8SKIP(s);
2276 }
2277 }
2278 else { /* Not utf8. Everything is a GCB except between CR and
2279 LF */
2280 while (s < strend) {
00e3344b
KW
2281 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2282 || UCHARAT(s) != '\n'))
2283 && (reginfo->intuit || regtry(reginfo, &s)))
64935bc6 2284 {
00e3344b 2285 goto got_it;
64935bc6 2286 }
43a7bd62 2287 s++;
64935bc6
KW
2288 }
2289 }
2290
6de80efc
KW
2291 /* And, since this is a bound, it can match after the final
2292 * character in the string */
67481c39 2293 if ((reginfo->intuit || regtry(reginfo, &s))) {
64935bc6
KW
2294 goto got_it;
2295 }
2296 break;
ae3bb8ea 2297
6b659339
KW
2298 case LB_BOUND:
2299 if (s == reginfo->strbeg) {
2300 if (reginfo->intuit || regtry(reginfo, &s)) {
2301 goto got_it;
2302 }
2303 s += (utf8_target) ? UTF8SKIP(s) : 1;
2304 if (UNLIKELY(s >= reginfo->strend)) {
2305 break;
2306 }
2307 }
2308
2309 if (utf8_target) {
2310 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2311 -1,
2312 (U8*)(reginfo->strbeg)),
2313 (U8*) reginfo->strend);
2314 while (s < strend) {
2315 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2316 if (to_complement ^ isLB(before,
2317 after,
2318 (U8*) reginfo->strbeg,
2319 (U8*) s,
2320 (U8*) reginfo->strend,
2321 utf8_target)
2322 && (reginfo->intuit || regtry(reginfo, &s)))
2323 {
2324 goto got_it;
2325 }
2326 before = after;
2327 s += UTF8SKIP(s);
2328 }
2329 }
2330 else { /* Not utf8. */
2331 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2332 while (s < strend) {
2333 LB_enum after = getLB_VAL_CP((U8) *s);
2334 if (to_complement ^ isLB(before,
2335 after,
2336 (U8*) reginfo->strbeg,
2337 (U8*) s,
2338 (U8*) reginfo->strend,
2339 utf8_target)
2340 && (reginfo->intuit || regtry(reginfo, &s)))
2341 {
2342 goto got_it;
2343 }
2344 before = after;
2345 s++;
2346 }
2347 }
2348
2349 if (reginfo->intuit || regtry(reginfo, &s)) {
2350 goto got_it;
2351 }
2352
2353 break;
2354
06ae2722 2355 case SB_BOUND:
a7a8bd1e 2356 if (s == reginfo->strbeg) {
67481c39 2357 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2358 goto got_it;
2359 }
06ae2722 2360 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2361 if (UNLIKELY(s >= reginfo->strend)) {
2362 break;
2363 }
06ae2722
KW
2364 }
2365
2366 if (utf8_target) {
85e5f08b 2367 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2368 -1,
2369 (U8*)(reginfo->strbeg)),
2370 (U8*) reginfo->strend);
2371 while (s < strend) {
85e5f08b 2372 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722 2373 (U8*) reginfo->strend);
00e3344b
KW
2374 if ((to_complement ^ isSB(before,
2375 after,
2376 (U8*) reginfo->strbeg,
2377 (U8*) s,
2378 (U8*) reginfo->strend,
2379 utf8_target))
2380 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2381 {
00e3344b 2382 goto got_it;
06ae2722 2383 }
43a7bd62 2384 before = after;
06ae2722
KW
2385 s += UTF8SKIP(s);
2386 }
2387 }
2388 else { /* Not utf8. */
85e5f08b 2389 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2390 while (s < strend) {
85e5f08b 2391 SB_enum after = getSB_VAL_CP((U8) *s);
00e3344b
KW
2392 if ((to_complement ^ isSB(before,
2393 after,
2394 (U8*) reginfo->strbeg,
2395 (U8*) s,
2396 (U8*) reginfo->strend,
2397 utf8_target))
2398 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2399 {
00e3344b 2400 goto got_it;
06ae2722 2401 }
43a7bd62 2402 before = after;
06ae2722
KW
2403 s++;
2404 }
2405 }
2406
2407 /* Here are at the final position in the target string. The SB
2408 * value is always true here, so matches, depending on other
2409 * constraints */
67481c39 2410 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2411 goto got_it;
2412 }
2413
2414 break;
2415
ae3bb8ea
KW
2416 case WB_BOUND:
2417 if (s == reginfo->strbeg) {
67481c39 2418 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2419 goto got_it;
2420 }
2421 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2422 if (UNLIKELY(s >= reginfo->strend)) {
2423 break;
2424 }
ae3bb8ea
KW
2425 }
2426
2427 if (utf8_target) {
2428 /* We are at a boundary between char_sub_0 and char_sub_1.
2429 * We also keep track of the value for char_sub_-1 as we
2430 * loop through the line. Context may be needed to make a
2431 * determination, and if so, this can save having to
2432 * recalculate it */
85e5f08b
KW
2433 WB_enum previous = WB_UNKNOWN;
2434 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2435 reghop3((U8*)s,
2436 -1,
2437 (U8*)(reginfo->strbeg)),
2438 (U8*) reginfo->strend);
2439 while (s < strend) {
85e5f08b 2440 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea 2441 (U8*) reginfo->strend);
00e3344b
KW
2442 if ((to_complement ^ isWB(previous,
2443 before,
2444 after,
2445 (U8*) reginfo->strbeg,
2446 (U8*) s,
2447 (U8*) reginfo->strend,
2448 utf8_target))
2449 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2450 {
00e3344b 2451 goto got_it;
ae3bb8ea 2452 }
43a7bd62
KW
2453 previous = before;
2454 before = after;
ae3bb8ea
KW
2455 s += UTF8SKIP(s);
2456 }
2457 }
2458 else { /* Not utf8. */
85e5f08b
KW
2459 WB_enum previous = WB_UNKNOWN;
2460 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2461 while (s < strend) {
85e5f08b 2462 WB_enum after = getWB_VAL_CP((U8) *s);
00e3344b
KW
2463 if ((to_complement ^ isWB(previous,
2464 before,
2465 after,
2466 (U8*) reginfo->strbeg,
2467 (U8*) s,
2468 (U8*) reginfo->strend,
2469 utf8_target))
2470 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2471 {
00e3344b 2472 goto got_it;
ae3bb8ea 2473 }
43a7bd62
KW
2474 previous = before;
2475 before = after;
ae3bb8ea
KW
2476 s++;
2477 }
2478 }
2479
67481c39 2480 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2481 goto got_it;
2482 }
64935bc6 2483 }
73104a1b 2484 break;
64935bc6 2485
73104a1b
KW
2486 case LNBREAK:
2487 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2488 is_LNBREAK_latin1_safe(s, strend)
2489 );
2490 break;
3018b823 2491
aff4cafe
KW
2492 case ASCII:
2493 s = find_next_ascii(s, strend, utf8_target);
2494 if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
2495 goto got_it;
2496 }
2497
2498 break;
2499
2500 case NASCII:
2501 s = find_next_non_ascii(s, strend, utf8_target);
2502 if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
2503 goto got_it;
2504 }
2505
2506 break;
2507
3018b823
KW
2508 /* The argument to all the POSIX node types is the class number to pass to
2509 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2510
2511 case NPOSIXL:
2512 to_complement = 1;
2513 /* FALLTHROUGH */
2514
2515 case POSIXL:
780fcc9f 2516 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
2517 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2518 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2519 break;
3018b823
KW
2520
2521 case NPOSIXD:
2522 to_complement = 1;
2523 /* FALLTHROUGH */
2524
2525 case POSIXD:
2526 if (utf8_target) {
2527 goto posix_utf8;
2528 }
2529 goto posixa;
2530
2531 case NPOSIXA:
2532 if (utf8_target) {
2533 /* The complement of something that matches only ASCII matches all
837226c8 2534 * non-ASCII, plus everything in ASCII that isn't in the class. */
7a207065 2535 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3018b823
KW
2536 || ! _generic_isCC_A(*s, FLAGS(c)));
2537 break;
2538 }
2539
2540 to_complement = 1;
4a6c6db5 2541 goto posixa;
3018b823 2542
73104a1b
KW
2543 case POSIXA:
2544 /* Don't need to worry about utf8, as it can match only a single
4a6c6db5
KW
2545 * byte invariant character. But we do anyway for performance reasons,
2546 * as otherwise we would have to examine all the continuation
2547 * characters */
2548 if (utf8_target) {
2549 REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
2550 break;
2551 }
2552
2553 posixa:
3018b823
KW
2554 REXEC_FBC_CLASS_SCAN(
2555 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2556 break;
3018b823
KW
2557
2558 case NPOSIXU:
2559 to_complement = 1;
2560 /* FALLTHROUGH */
2561
2562 case POSIXU:
2563 if (! utf8_target) {
2564 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2565 FLAGS(c))));
2566 }
2567 else {
2568
c52b8b12 2569 posix_utf8:
3018b823
KW
2570 classnum = (_char_class_number) FLAGS(c);
2571 if (classnum < _FIRST_NON_SWASH_CC) {
2572 while (s < strend) {
2573
2574 /* We avoid loading in the swash as long as possible, but
2575 * should we have to, we jump to a separate loop. This
2576 * extra 'if' statement is what keeps this code from being
2577 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2578 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2579 goto found_above_latin1;
2580 }
2581 if ((UTF8_IS_INVARIANT(*s)
2582 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2583 classnum)))
042d9e50 2584 || ( UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
3018b823 2585 && to_complement ^ cBOOL(
a62b247b 2586 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
94bb8c36 2587 *(s + 1)),
3018b823
KW
2588 classnum))))
2589 {
02d5137b 2590 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
2591 goto got_it;
2592 else {
2593 tmp = doevery;
2594 }
2595 }
2596 else {
2597 tmp = 1;
2598 }
2599 s += UTF8SKIP(s);
2600 }
2601 }
2602 else switch (classnum) { /* These classes are implemented as
2603 macros */
779cf272 2604 case _CC_ENUM_SPACE:
3018b823 2605 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2606 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3018b823
KW
2607 break;
2608
2609 case _CC_ENUM_BLANK:
2610 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2611 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3018b823
KW
2612 break;
2613
2614 case _CC_ENUM_XDIGIT:
2615 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2616 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3018b823
KW
2617 break;
2618
2619 case _CC_ENUM_VERTSPACE:
2620 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2621 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3018b823
KW
2622 break;
2623
2624 case _CC_ENUM_CNTRL:
2625 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2626 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3018b823
KW
2627 break;
2628
2629 default:
2630 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
e5964223 2631 NOT_REACHED; /* NOTREACHED */
3018b823
KW
2632 }
2633 }
2634 break;
2635
2636 found_above_latin1: /* Here we have to load a swash to get the result
2637 for the current code point */
2638 if (! PL_utf8_swash_ptrs[classnum]) {
2639 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2640 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2641 _core_swash_init("utf8",
2642 "",
2643 &PL_sv_undef, 1, 0,
2644 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2645 }
2646
2647 /* This is a copy of the loop above for swash classes, though using the
2648 * FBC macro instead of being expanded out. Since we've loaded the
2649 * swash, we don't have to check for that each time through the loop */
2650 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2651 to_complement ^ cBOOL(_generic_utf8_safe(
3018b823
KW
2652 classnum,
2653 s,
7a207065 2654 strend,
3018b823
KW
2655 swash_fetch(PL_utf8_swash_ptrs[classnum],
2656 (U8 *) s, TRUE))));
73104a1b
KW
2657 break;
2658
2659 case AHOCORASICKC:
2660 case AHOCORASICK:
2661 {
2662 DECL_TRIE_TYPE(c);
2663 /* what trie are we using right now */
2664 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2665 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2666 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2667
2668 const char *last_start = strend - trie->minlen;
6148ee25 2669#ifdef DEBUGGING
73104a1b 2670 const char *real_start = s;
6148ee25 2671#endif
73104a1b
KW
2672 STRLEN maxlen = trie->maxlen;
2673 SV *sv_points;
2674 U8 **points; /* map of where we were in the input string
2675 when reading a given char. For ASCII this
2676 is unnecessary overhead as the relationship
2677 is always 1:1, but for Unicode, especially
2678 case folded Unicode this is not true. */
2679 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2680 U8 *bitmap=NULL;
2681
2682
2683 GET_RE_DEBUG_FLAGS_DECL;
2684
2685 /* We can't just allocate points here. We need to wrap it in
2686 * an SV so it gets freed properly if there is a croak while
2687 * running the match */
2688 ENTER;
2689 SAVETMPS;
2690 sv_points=newSV(maxlen * sizeof(U8 *));
2691 SvCUR_set(sv_points,
2692 maxlen * sizeof(U8 *));
2693 SvPOK_on(sv_points);
2694 sv_2mortal(sv_points);
2695 points=(U8**)SvPV_nolen(sv_points );
2696 if ( trie_type != trie_utf8_fold
2697 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2698 {
2699 if (trie->bitmap)
2700 bitmap=(U8*)trie->bitmap;
2701 else
2702 bitmap=(U8*)ANYOF_BITMAP(c);
2703 }
2704 /* this is the Aho-Corasick algorithm modified a touch
2705 to include special handling for long "unknown char" sequences.
2706 The basic idea being that we use AC as long as we are dealing
2707 with a possible matching char, when we encounter an unknown char
2708 (and we have not encountered an accepting state) we scan forward
2709 until we find a legal starting char.
2710 AC matching is basically that of trie matching, except that when
2711 we encounter a failing transition, we fall back to the current
2712 states "fail state", and try the current char again, a process
2713 we repeat until we reach the root state, state 1, or a legal
2714 transition. If we fail on the root state then we can either
2715 terminate if we have reached an accepting state previously, or
2716 restart the entire process from the beginning if we have not.
2717
2718 */
2719 while (s <= last_start) {
2720 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2721 U8 *uc = (U8*)s;
2722 U16 charid = 0;
2723 U32 base = 1;
2724 U32 state = 1;
2725 UV uvc = 0;
2726 STRLEN len = 0;
2727 STRLEN foldlen = 0;
2728 U8 *uscan = (U8*)NULL;
2729 U8 *leftmost = NULL;
2730#ifdef DEBUGGING
2731 U32 accepted_word= 0;
786e8c11 2732#endif
73104a1b
KW
2733 U32 pointpos = 0;
2734
2735 while ( state && uc <= (U8*)strend ) {
2736 int failed=0;
2737 U32 word = aho->states[ state ].wordnum;
2738
2739 if( state==1 ) {
2740 if ( bitmap ) {
2741 DEBUG_TRIE_EXECUTE_r(
2742 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2743 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2744 (char *)uc, utf8_target, 0 );
6ad9a8ab 2745 Perl_re_printf( aTHX_
73104a1b
KW
2746 " Scanning for legal start char...\n");
2747 }
2748 );
2749 if (utf8_target) {
2750 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2751 uc += UTF8SKIP(uc);
2752 }
2753 } else {
2754 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2755 uc++;
2756 }
786e8c11 2757 }
73104a1b 2758 s= (char *)uc;
07be1b83 2759 }
73104a1b
KW
2760 if (uc >(U8*)last_start) break;
2761 }
2762
2763 if ( word ) {
2764 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2765 if (!leftmost || lpos < leftmost) {
2766 DEBUG_r(accepted_word=word);
2767 leftmost= lpos;
7016d6eb 2768 }
73104a1b 2769 if (base==0) break;
7016d6eb 2770
73104a1b
KW
2771 }
2772 points[pointpos++ % maxlen]= uc;
2773 if (foldlen || uc < (U8*)strend) {
2774 REXEC_TRIE_READ_CHAR(trie_type, trie,
2775 widecharmap, uc,
2776 uscan, len, uvc, charid, foldlen,
2777 foldbuf, uniflags);
2778 DEBUG_TRIE_EXECUTE_r({
2779 dump_exec_pos( (char *)uc, c, strend,
cb41e5d6 2780 real_start, s, utf8_target, 0);
6ad9a8ab 2781 Perl_re_printf( aTHX_
147e3846 2782 " Charid:%3u CP:%4" UVxf " ",
73104a1b
KW
2783 charid, uvc);
2784 });
2785 }
2786 else {
2787 len = 0;
2788 charid = 0;
2789 }
07be1b83 2790
73104a1b
KW
2791
2792 do {
6148ee25 2793#ifdef DEBUGGING
73104a1b 2794 word = aho->states[ state ].wordnum;
6148ee25 2795#endif
73104a1b
KW
2796 base = aho->states[ state ].trans.base;
2797
2798 DEBUG_TRIE_EXECUTE_r({
2799 if (failed)
2800 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2801 s, utf8_target, 0 );
6ad9a8ab 2802 Perl_re_printf( aTHX_
147e3846 2803 "%sState: %4" UVxf ", word=%" UVxf,
73104a1b
KW
2804 failed ? " Fail transition to " : "",
2805 (UV)state, (UV)word);
2806 });
2807 if ( base ) {
2808 U32 tmp;
2809 I32 offset;
2810 if (charid &&
2811 ( ((offset = base + charid
2812 - 1 - trie->uniquecharcount)) >= 0)
2813 && ((U32)offset < trie->lasttrans)
2814 && trie->trans[offset].check == state
2815 && (tmp=trie->trans[offset].next))
2816 {
2817 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2818 Perl_re_printf( aTHX_ " - legal\n"));
73104a1b
KW
2819 state = tmp;
2820 break;
07be1b83
YO
2821 }
2822 else {
786e8c11 2823 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2824 Perl_re_printf( aTHX_ " - fail\n"));
786e8c11 2825 failed = 1;
73104a1b 2826 state = aho->fail[state];
07be1b83 2827 }
07be1b83 2828 }
73104a1b
KW
2829 else {
2830 /* we must be accepting here */
2831 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2832 Perl_re_printf( aTHX_ " - accepting\n"));
73104a1b
KW
2833 failed = 1;
2834 break;
786e8c11 2835 }
73104a1b
KW
2836 } while(state);
2837 uc += len;
2838 if (failed) {
2839 if (leftmost)
2840 break;
2841 if (!state) state = 1;
07be1b83 2842 }
73104a1b
KW
2843 }
2844 if ( aho->states[ state ].wordnum ) {
2845 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2846 if (!leftmost || lpos < leftmost) {
2847 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2848 leftmost = lpos;
07be1b83
YO
2849 }
2850 }
73104a1b
KW
2851 if (leftmost) {
2852 s = (char*)leftmost;
2853 DEBUG_TRIE_EXECUTE_r({
147e3846 2854 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
73104a1b
KW
2855 (UV)accepted_word, (IV)(s - real_start)
2856 );
2857 });
02d5137b 2858 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2859 FREETMPS;
2860 LEAVE;
2861 goto got_it;
2862 }
2863 s = HOPc(s,1);
2864 DEBUG_TRIE_EXECUTE_r({
6ad9a8ab 2865 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
73104a1b
KW
2866 });
2867 } else {
2868 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2869 Perl_re_printf( aTHX_ "No match.\n"));
73104a1b
KW
2870 break;
2871 }
2872 }
2873 FREETMPS;
2874 LEAVE;
2875 }
2876 break;
2877 default:
2878 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
2879 }
2880 return 0;
2881 got_it:
2882 return s;
6eb5f6b9
JH
2883}
2884
60165aa4
DM
2885/* set RX_SAVED_COPY, RX_SUBBEG etc.
2886 * flags have same meanings as with regexec_flags() */
2887
749f4950
DM
2888static void
2889S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2890 char *strbeg,
2891 char *strend,
2892 SV *sv,
2893 U32 flags,
2894 bool utf8_target)
2895{
2896 struct regexp *const prog = ReANY(rx);
2897
60165aa4
DM
2898 if (flags & REXEC_COPY_STR) {
2899#ifdef PERL_ANY_COW
2900 if (SvCANCOW(sv)) {
eb8fc9fe 2901 DEBUG_C(Perl_re_printf( aTHX_
60165aa4 2902 "Copy on write: regexp capture, type %d\n",
eb8fc9fe 2903 (int) SvTYPE(sv)));
5411a0e5
DM
2904 /* Create a new COW SV to share the match string and store
2905 * in saved_copy, unless the current COW SV in saved_copy
2906 * is valid and suitable for our purpose */
2907 if (( prog->saved_copy
2908 && SvIsCOW(prog->saved_copy)
2909 && SvPOKp(prog->saved_copy)
2910 && SvIsCOW(sv)
2911 && SvPOKp(sv)
2912 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2913 {
5411a0e5
DM
2914 /* just reuse saved_copy SV */
2915 if (RXp_MATCH_COPIED(prog)) {
2916 Safefree(prog->subbeg);
2917 RXp_MATCH_COPIED_off(prog);
2918 }
2919 }
2920 else {
2921 /* create new COW SV to share string */
196a02af 2922 RXp_MATCH_COPY_FREE(prog);
a76b0e90 2923 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2924 }
5411a0e5
DM
2925 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2926 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2927 prog->sublen = strend - strbeg;
2928 prog->suboffset = 0;
2929 prog->subcoffset = 0;
2930 } else
2931#endif
2932 {
99a90e59
FC
2933 SSize_t min = 0;
2934 SSize_t max = strend - strbeg;
ea3daa5d 2935 SSize_t sublen;
60165aa4
DM
2936
2937 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2938 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2939 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2940 ) { /* don't copy $' part of string */
2941 U32 n = 0;
2942 max = -1;
2943 /* calculate the right-most part of the string covered
f67a5002 2944 * by a capture. Due to lookahead, this may be to
60165aa4
DM
2945 * the right of $&, so we have to scan all captures */
2946 while (n <= prog->lastparen) {
2947 if (prog->offs[n].end > max)
2948 max = prog->offs[n].end;
2949 n++;
2950 }
2951 if (max == -1)
2952 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2953 ? prog->offs[0].start
2954 : 0;
2955 assert(max >= 0 && max <= strend - strbeg);
2956 }
2957
2958 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2959 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2960 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2961 ) { /* don't copy $` part of string */
2962 U32 n = 0;
2963 min = max;
2964 /* calculate the left-most part of the string covered
f67a5002 2965 * by a capture. Due to lookbehind, this may be to
60165aa4
DM
2966 * the left of $&, so we have to scan all captures */
2967 while (min && n <= prog->lastparen) {
2968 if ( prog->offs[n].start != -1
2969 && prog->offs[n].start < min)
2970 {
2971 min = prog->offs[n].start;
2972 }
2973 n++;
2974 }
2975 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2976 && min > prog->offs[0].end
2977 )
2978 min = prog->offs[0].end;
2979
2980 }
2981
2982 assert(min >= 0 && min <= max && min <= strend - strbeg);
2983 sublen = max - min;
2984
196a02af 2985 if (RXp_MATCH_COPIED(prog)) {
60165aa4
DM
2986 if (sublen > prog->sublen)
2987 prog->subbeg =
2988 (char*)saferealloc(prog->subbeg, sublen+1);
2989 }
2990 else
2991 prog->subbeg = (char*)safemalloc(sublen+1);
2992 Copy(strbeg + min, prog->subbeg, sublen, char);
2993 prog->subbeg[sublen] = '\0';
2994 prog->suboffset = min;
2995 prog->sublen = sublen;
196a02af 2996 RXp_MATCH_COPIED_on(prog);
60165aa4
DM
2997 }
2998 prog->subcoffset = prog->suboffset;
2999 if (prog->suboffset && utf8_target) {
3000 /* Convert byte offset to chars.
3001 * XXX ideally should only compute this if @-/@+
3002 * has been seen, a la PL_sawampersand ??? */
3003
3004 /* If there's a direct correspondence between the
3005 * string which we're matching and the original SV,
3006 * then we can use the utf8 len cache associated with
3007 * the SV. In particular, it means that under //g,
3008 * sv_pos_b2u() will use the previously cached
3009 * position to speed up working out the new length of
3010 * subcoffset, rather than counting from the start of
3011 * the string each time. This stops
3012 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3013 * from going quadratic */
3014 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
3015 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3016 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
3017 else
3018 prog->subcoffset = utf8_length((U8*)strbeg,
3019 (U8*)(strbeg+prog->suboffset));
3020 }
3021 }
3022 else {
196a02af 3023 RXp_MATCH_COPY_FREE(prog);
60165aa4
DM
3024 prog->subbeg = strbeg;
3025 prog->suboffset = 0;
3026 prog->subcoffset = 0;
3027 prog->sublen = strend - strbeg;
3028 }
3029}
3030
3031
3032
fae667d5 3033
6eb5f6b9
JH
3034/*
3035 - regexec_flags - match a regexp against a string
3036 */
3037I32
5aaab254 3038Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 3039 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
3040/* stringarg: the point in the string at which to begin matching */
3041/* strend: pointer to null at end of string */
3042/* strbeg: real beginning of string */
3043/* minend: end of match must be >= minend bytes after stringarg. */
3044/* sv: SV being matched: only used for utf8 flag, pos() etc; string
3045 * itself is accessed via the pointers above */
3046/* data: May be used for some additional optimizations.
d058ec57 3047 Currently unused. */
a340edde 3048/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 3049
6eb5f6b9 3050{
8d919b0a 3051 struct regexp *const prog = ReANY(rx);
5aaab254 3052 char *s;
eb578fdb 3053 regnode *c;
03c83e26 3054 char *startpos;
ea3daa5d
FC
3055 SSize_t minlen; /* must match at least this many chars */
3056 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 3057 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 3058 I32 multiline;
f8fc2ecf 3059 RXi_GET_DECL(prog,progi);
02d5137b
DM
3060 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3061 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 3062 regexp_paren_pair *swap = NULL;
006f26b2 3063 I32 oldsave;
a3621e74
YO
3064 GET_RE_DEBUG_FLAGS_DECL;
3065
7918f24d 3066 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 3067 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
3068
3069 /* Be paranoid... */
3dc78631 3070 if (prog == NULL) {
6eb5f6b9 3071 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
3072 }
3073
6c3fea77 3074 DEBUG_EXECUTE_r(
03c83e26 3075 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
3076 "Matching");
3077 );
8adc0f72 3078
b342a604
DM
3079 startpos = stringarg;
3080
4cf1a867
DM
3081 /* set these early as they may be used by the HOP macros below */
3082 reginfo->strbeg = strbeg;
3083 reginfo->strend = strend;
3084 reginfo->is_utf8_target = cBOOL(utf8_target);
3085
58430ea8 3086 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
3087 MAGIC *mg;
3088
fef7148b
DM
3089 /* set reginfo->ganch, the position where \G can match */
3090
3091 reginfo->ganch =
3092 (flags & REXEC_IGNOREPOS)
3093 ? stringarg /* use start pos rather than pos() */
3dc78631 3094 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
3095 /* Defined pos(): */
3096 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
3097 : strbeg; /* pos() not defined; use start of string */
3098
6ad9a8ab 3099 DEBUG_GPOS_r(Perl_re_printf( aTHX_
147e3846 3100 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 3101
03c83e26
DM
3102 /* in the presence of \G, we may need to start looking earlier in
3103 * the string than the suggested start point of stringarg:
0b2c2a84 3104 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
3105 * offset, such as
3106 * /..\G/: gofs = 2
3107 * /ab|c\G/: gofs = 1
3108 * or if the minimum offset isn't known, then we have to go back
3109 * to the start of the string, e.g. /w+\G/
3110 */
2bfbe302 3111
8e1490ee 3112 if (prog->intflags & PREGf_ANCH_GPOS) {
4cf1a867
DM
3113 if (prog->gofs) {
3114 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3115 if (!startpos ||
3116 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3117 {
6ad9a8ab 3118 DEBUG_r(Perl_re_printf( aTHX_
4cf1a867
DM
3119 "fail: ganch-gofs before earliest possible start\n"));
3120 return 0;
3121 }
2bfbe302 3122 }
4cf1a867
DM
3123 else
3124 startpos = reginfo->ganch;
2bfbe302
DM
3125 }
3126 else if (prog->gofs) {
4cf1a867
DM
3127 startpos = HOPBACKc(startpos, prog->gofs);
3128 if (!startpos)
b342a604 3129 startpos = strbeg;
03c83e26 3130 }
58430ea8 3131 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 3132 startpos = strbeg;
03c83e26
DM
3133 }
3134
3135 minlen = prog->minlen;
b342a604 3136 if ((startpos + minlen) > strend || startpos < strbeg) {
6ad9a8ab 3137 DEBUG_r(Perl_re_printf( aTHX_
03c83e26
DM
3138 "Regex match can't succeed, so not even tried\n"));
3139 return 0;
3140 }
3141
63a3746a
DM
3142 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3143 * which will call destuctors to reset PL_regmatch_state, free higher
3144 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3145 * regmatch_info_aux_eval */
3146
3147 oldsave = PL_savestack_ix;
3148
dfa77d06
DM
3149 s = startpos;
3150
e322109a 3151 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
3152 && !(flags & REXEC_CHECKED))
3153 {
dfa77d06 3154 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 3155 flags, NULL);
dfa77d06 3156 if (!s)
7fadf4a7
DM
3157 return 0;
3158
e322109a 3159 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
3160 /* we can match based purely on the result of INTUIT.
3161 * Set up captures etc just for $& and $-[0]
3162 * (an intuit-only match wont have $1,$2,..) */
3163 assert(!prog->nparens);
d5e7783a
DM
3164
3165 /* s/// doesn't like it if $& is earlier than where we asked it to
3166 * start searching (which can happen on something like /.\G/) */
3167 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3168 && (s < stringarg))
3169 {
3170 /* this should only be possible under \G */
58430ea8 3171 assert(prog->intflags & PREGf_GPOS_SEEN);
6ad9a8ab 3172 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d5e7783a
DM
3173 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3174 goto phooey;
3175 }
3176
7fadf4a7
DM
3177 /* match via INTUIT shouldn't have any captures.
3178 * Let @-, @+, $^N know */
3179 prog->lastparen = prog->lastcloseparen = 0;
196a02af 3180 RXp_MATCH_UTF8_set(prog, utf8_target);
3ff69bd6
DM
3181 prog->offs[0].start = s - strbeg;
3182 prog->offs[0].end = utf8_target
3183 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3184 : s - strbeg + prog->minlenret;
7fadf4a7 3185 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3186 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
3187 strbeg, strend,
3188 sv, flags, utf8_target);
3189
7fadf4a7
DM
3190 return 1;
3191 }
3192 }
3193
6c3fea77 3194 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 3195
dfa77d06 3196 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
6ad9a8ab 3197 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a72c7584
JH
3198 "String too short [regexec_flags]...\n"));
3199 goto phooey;
1aa99e6b 3200 }
1de06328 3201
6eb5f6b9 3202 /* Check validity of program. */
f8fc2ecf 3203 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
3204 Perl_croak(aTHX_ "corrupted regexp program");
3205 }
3206
196a02af
DM
3207 RXp_MATCH_TAINTED_off(prog);
3208 RXp_MATCH_UTF8_set(prog, utf8_target);
1738e041 3209
6c3fea77
DM
3210 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3211 reginfo->intuit = 0;
02d5137b
DM
3212 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3213 reginfo->warned = FALSE;
02d5137b 3214 reginfo->sv = sv;
1cb48e53 3215 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
6eb5f6b9 3216 /* see how far we have to get to not match where we matched before */
fe3974be 3217 reginfo->till = stringarg + minend;
6eb5f6b9 3218
60779a30 3219 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
3220 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3221 S_cleanup_regmatch_info_aux has executed (registered by
3222 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3223 magic belonging to this SV.
3224 Not newSVsv, either, as it does not COW.
3225 */
3226 reginfo->sv = newSV(0);
4cba5ac0 3227 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
3228 SAVEFREESV(reginfo->sv);
3229 }
3230
331b2dcc
DM
3231 /* reserve next 2 or 3 slots in PL_regmatch_state:
3232 * slot N+0: may currently be in use: skip it
3233 * slot N+1: use for regmatch_info_aux struct
3234 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3235 * slot N+3: ready for use by regmatch()
3236 */
bf2039a9 3237
331b2dcc
DM
3238 {
3239 regmatch_state *old_regmatch_state;
3240 regmatch_slab *old_regmatch_slab;
3241 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3242
3243 /* on first ever match, allocate first slab */
3244 if (!PL_regmatch_slab) {
3245 Newx(PL_regmatch_slab, 1, regmatch_slab);
3246 PL_regmatch_slab->prev = NULL;
3247 PL_regmatch_slab->next = NULL;
3248 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3249 }
bf2039a9 3250
331b2dcc
DM
3251 old_regmatch_state = PL_regmatch_state;
3252 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 3253
331b2dcc
DM
3254 for (i=0; i <= max; i++) {
3255 if (i == 1)
3256 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3257 else if (i ==2)
3258 reginfo->info_aux_eval =
3259 reginfo->info_aux->info_aux_eval =
3260 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3261
331b2dcc
DM
3262 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3263 PL_regmatch_state = S_push_slab(aTHX);
3264 }
bf2039a9 3265
331b2dcc
DM
3266 /* note initial PL_regmatch_state position; at end of match we'll
3267 * pop back to there and free any higher slabs */
bf2039a9 3268
331b2dcc
DM
3269 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3270 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3271 reginfo->info_aux->poscache = NULL;
bf2039a9 3272
331b2dcc 3273 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 3274
331b2dcc
DM
3275 if ((prog->extflags & RXf_EVAL_SEEN))
3276 S_setup_eval_state(aTHX_ reginfo);
3277 else
3278 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 3279 }
d3aa529c 3280
6eb5f6b9 3281 /* If there is a "must appear" string, look for it. */
6eb5f6b9 3282
288b8c02 3283 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
3284 /* We have to be careful. If the previous successful match
3285 was from this regex we don't want a subsequent partially
3286 successful match to clobber the old results.
3287 So when we detect this possibility we add a swap buffer
d8da0584
KW
3288 to the re, and switch the buffer each match. If we fail,
3289 we switch it back; otherwise we leave it swapped.
e9105d30
GG
3290 */
3291 swap = prog->offs;
3292 /* do we need a save destructor here for eval dies? */
3293 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2b1a3689 3294 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 3295 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
2b1a3689
YO
3296 0,
3297 PTR2UV(prog),
495f47a5
DM
3298 PTR2UV(swap),
3299 PTR2UV(prog->offs)
3300 ));
c74340f9 3301 }
6eb5f6b9 3302
ba6840fb
YO
3303 if (prog->recurse_locinput)
3304 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3305
0fa70a06
DM
3306 /* Simplest case: anchored match need be tried only once, or with
3307 * MBOL, only at the beginning of each line.
3308 *
3309 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3310 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3311 * match at the start of the string then it won't match anywhere else
3312 * either; while with /.*.../, if it doesn't match at the beginning,
3313 * the earliest it could match is at the start of the next line */
3314
8e1490ee 3315 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
0fa70a06
DM
3316 char *end;
3317
3318 if (regtry(reginfo, &s))
6eb5f6b9 3319 goto got_it;
0fa70a06
DM
3320
3321 if (!(prog->intflags & PREGf_ANCH_MBOL))
3322 goto phooey;
3323
3324 /* didn't match at start, try at other newline positions */
3325
3326 if (minlen)
3327 dontbother = minlen - 1;
3328 end = HOP3c(strend, -dontbother, strbeg) - 1;
3329
3330 /* skip to next newline */
3331
3332 while (s <= end) { /* note it could be possible to match at the end of the string */
3333 /* NB: newlines are the same in unicode as they are in latin */
3334 if (*s++ != '\n')
3335 continue;
3336 if (prog->check_substr || prog->check_utf8) {
3337 /* note that with PREGf_IMPLICIT, intuit can only fail
3338 * or return the start position, so it's of limited utility.
3339 * Nevertheless, I made the decision that the potential for
3340 * quick fail was still worth it - DAPM */
3341 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3342 if (!s)
3343 goto phooey;
3344 }
3345 if (regtry(reginfo, &s))
3346 goto got_it;
3347 }
3348 goto phooey;
3349 } /* end anchored search */
3350
3351 if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 3352 {
a8430a8b
YO
3353 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3354 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
3355 /* For anchored \G, the only position it can match from is
3356 * (ganch-gofs); we already set startpos to this above; if intuit
3357 * moved us on from there, we can't possibly succeed */
4cf1a867 3358 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
2bfbe302 3359 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
3360 goto got_it;
3361 goto phooey;
3362 }
3363
3364 /* Messy cases: unanchored match. */
bbe252da 3365 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 3366 /* we have /x+whatever/ */
984e6dd1 3367 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 3368 char ch;
bf93d4cc
GS
3369#ifdef DEBUGGING
3370 int did_match = 0;
3371#endif
f2ed9b32 3372 if (utf8_target) {
7e0d5ad7
KW
3373 if (! prog->anchored_utf8) {
3374 to_utf8_substr(prog);
3375 }
3376 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 3377 REXEC_FBC_SCAN(
6eb5f6b9 3378 if (*s == ch) {
a3621e74 3379 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3380 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3381 s += UTF8SKIP(s);
3382 while (s < strend && *s == ch)
3383 s += UTF8SKIP(s);
3384 }
4cadc6a9 3385 );
7e0d5ad7 3386
6eb5f6b9
JH
3387 }
3388 else {
7e0d5ad7
KW
3389 if (! prog->anchored_substr) {
3390 if (! to_byte_substr(prog)) {
6b54ddc5 3391 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3392 }
3393 }
3394 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 3395 REXEC_FBC_SCAN(
6eb5f6b9 3396 if (*s == ch) {
a3621e74 3397 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3398 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3399 s++;
3400 while (s < strend && *s == ch)
3401 s++;
3402 }
4cadc6a9 3403 );
6eb5f6b9 3404 }
a3621e74 3405 DEBUG_EXECUTE_r(if (!did_match)
6ad9a8ab 3406 Perl_re_printf( aTHX_
b7953727
JH
3407 "Did not find anchored character...\n")
3408 );
6eb5f6b9 3409 }
a0714e2c
SS
3410 else if (prog->anchored_substr != NULL
3411 || prog->anchored_utf8 != NULL
3412 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
3413 && prog->float_max_offset < strend - s)) {
3414 SV *must;
ea3daa5d
FC
3415 SSize_t back_max;
3416 SSize_t back_min;
33b8afdf 3417 char *last;
6eb5f6b9 3418 char *last1; /* Last position checked before */
bf93d4cc
GS
3419#ifdef DEBUGGING
3420 int did_match = 0;
3421#endif
33b8afdf 3422 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
3423 if (utf8_target) {
3424 if (! prog->anchored_utf8) {
3425 to_utf8_substr(prog);
3426 }
3427 must = prog->anchored_utf8;
3428 }
3429 else {
3430 if (! prog->anchored_substr) {
3431 if (! to_byte_substr(prog)) {
6b54ddc5 3432 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3433 }
3434 }
3435 must = prog->anchored_substr;
3436 }
33b8afdf
JH
3437 back_max = back_min = prog->anchored_offset;
3438 } else {
7e0d5ad7
KW
3439 if (utf8_target) {
3440 if (! prog->float_utf8) {
3441 to_utf8_substr(prog);
3442 }
3443 must = prog->float_utf8;
3444 }
3445 else {
3446 if (! prog->float_substr) {
3447 if (! to_byte_substr(prog)) {
6b54ddc5 3448 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3449 }
3450 }
3451 must = prog->float_substr;
3452 }
33b8afdf
JH
3453 back_max = prog->float_max_offset;
3454 back_min = prog->float_min_offset;
3455 }
1de06328 3456
1de06328
YO
3457 if (back_min<0) {
3458 last = strend;
3459 } else {
3460 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 3461 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
3462 - (SvTAIL(must) != 0) + back_min), strbeg);
3463 }
9d9163fb 3464 if (s > reginfo->strbeg)
6eb5f6b9
JH
3465 last1 = HOPc(s, -1);
3466 else
3467 last1 = s - 1; /* bogus */
3468
a0288114 3469 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 3470 check_substr==must. */
bf05793b 3471 dontbother = 0;
6eb5f6b9
JH
3472 strend = HOPc(strend, -dontbother);
3473 while ( (s <= last) &&
e50d57d4 3474 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 3475 (unsigned char*)strend, must,
c33e64f0 3476 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 3477 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
3478 if (HOPc(s, -back_max) > last1) {
3479 last1 = HOPc(s, -back_min);
3480 s = HOPc(s, -back_max);
3481 }
3482 else {
9d9163fb
DM
3483 char * const t = (last1 >= reginfo->strbeg)
3484 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
3485
3486 last1 = HOPc(s, -back_min);
52657f30 3487 s = t;
6eb5f6b9 3488 }
f2ed9b32 3489 if (utf8_target) {
6eb5f6b9 3490 while (s <= last1) {
02d5137b 3491 if (regtry(reginfo, &s))