This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Don't retest the same byte immediately
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
95static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
7631e439
KW
99/* Returns a boolean as to whether the input unsigned number is a power of 2
100 * (2**0, 2**1, etc). In other words if it has just a single bit set.
101 * If not, subtracting 1 would leave the uppermost bit set, so the & would
102 * yield non-zero */
103#define isPOWER_OF_2(n) ((n & (n-1)) == 0)
104
7b031478 105#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
6ad9a8ab 106 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
7b031478 107 goto target; \
e1cf74e3
CB
108} STMT_END
109
c74f6de9
KW
110#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
111
a687059c
LW
112#ifndef STATIC
113#define STATIC static
114#endif
115
451c6e0b
KW
116/* Valid only if 'c', the character being looke-up, is an invariant under
117 * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
118 * everything matchable is straight forward in the bitmap */
119#define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
120 ? reginclass(prog,p,c,c+1,u) \
121 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 122
c277df42
IZ
123/*
124 * Forwards.
125 */
126
f2ed9b32 127#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 128
3dab1dad 129#define HOPc(pos,off) \
ba44c216 130 (char *)(reginfo->is_utf8_target \
220db18a 131 ? reghop3((U8*)pos, off, \
9d9163fb 132 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 133 : (U8*)(pos + off))
557f47af 134
bb152a4b
DM
135/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
136#define HOPBACK3(pos, off, lim) \
137 (reginfo->is_utf8_target \
138 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
139 : (pos - off >= lim) \
140 ? (U8*)pos - off \
3dab1dad 141 : NULL)
efb30f32 142
bb152a4b
DM
143#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
144
ba44c216 145#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 146#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 147
557f47af
DM
148/* lim must be +ve. Returns NULL on overshoot */
149#define HOPMAYBE3(pos,off,lim) \
150 (reginfo->is_utf8_target \
151 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
152 : ((U8*)pos + off <= lim) \
153 ? (U8*)pos + off \
154 : NULL)
155
8e9f2289
DM
156/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
157 * off must be >=0; args should be vars rather than expressions */
158#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
159 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
160 : (U8*)((pos + off) > lim ? lim : (pos + off)))
67853908 161#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
8e9f2289 162
2974eaec
DM
163#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
164 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
165 : (U8*)(pos + off))
166#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
167
168#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
169#define NEXTCHR_IS_EOS (nextchr < 0)
170
171#define SET_nextchr \
220db18a 172 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
173
174#define SET_locinput(p) \
175 locinput = (p); \
176 SET_nextchr
177
178
2a16ac92 179#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
180 if (!swash_ptr) { \
181 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 182 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 183 1, 0, invlist, &flags); \
c7304fe2
KW
184 assert(swash_ptr); \
185 } \
186 } STMT_END
187
188/* If in debug mode, we test that a known character properly matches */
189#ifdef DEBUGGING
190# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
191 property_name, \
2a16ac92 192 invlist, \
c7304fe2 193 utf8_char_in_property) \
2a16ac92 194 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
195 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
196#else
197# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
198 property_name, \
2a16ac92 199 invlist, \
c7304fe2 200 utf8_char_in_property) \
2a16ac92 201 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 202#endif
d1eb3177 203
c7304fe2
KW
204#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
205 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
206 "", \
207 PL_XPosix_ptrs[_CC_WORDCHAR], \
0766489e 208 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
c7304fe2 209
c7304fe2 210#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
211/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
212
5f80c4cf 213/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
214/* it would be nice to rework regcomp.sym to generate this stuff. sigh
215 *
216 * NOTE that *nothing* that affects backtracking should be in here, specifically
217 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
218 * node that is in between two EXACT like nodes when ascertaining what the required
219 * "follow" character is. This should probably be moved to regex compile time
220 * although it may be done at run time beause of the REF possibility - more
221 * investigation required. -- demerphq
222*/
baa60164
KW
223#define JUMPABLE(rn) ( \
224 OP(rn) == OPEN || \
24be3102
YO
225 (OP(rn) == CLOSE && \
226 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
baa60164
KW
227 OP(rn) == EVAL || \
228 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
229 OP(rn) == PLUS || OP(rn) == MINMOD || \
230 OP(rn) == KEEPS || \
231 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 232)
ee9b8eae 233#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 234
ee9b8eae
YO
235#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
236
237#if 0
238/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
a4525e78 239 we don't need this definition. XXX These are now out-of-sync*/
ee9b8eae 240#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 241#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
242#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
243
244#else
245/* ... so we use this as its faster. */
a4525e78
KW
246#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
247#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
248#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
249#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
250
251#endif
e2d8ce26 252
a84d97b6
HS
253/*
254 Search for mandatory following text node; for lookahead, the text must
255 follow but for lookbehind (rn->flags != 0) we skip to the next step.
256*/
baa60164 257#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
258 while (JUMPABLE(rn)) { \
259 const OPCODE type = OP(rn); \
260 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 261 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 262 else if (type == PLUS) \
cca55fe3 263 rn = NEXTOPER(rn); \
3dab1dad 264 else if (type == IFMATCH) \
a84d97b6 265 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 266 else rn += NEXT_OFF(rn); \
3dab1dad 267 } \
5f80c4cf 268} STMT_END
74750237 269
006f26b2
DM
270#define SLAB_FIRST(s) (&(s)->states[0])
271#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
272
a75351a1 273static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 274static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 275static regmatch_state * S_push_slab(pTHX);
51371543 276
87c0511b 277#define REGCP_PAREN_ELEMS 3
f067efbf 278#define REGCP_OTHER_ELEMS 3
e0fa7e2b 279#define REGCP_FRAME_ELEMS 1
620d5b66
NC
280/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
281 * are needed for the regexp context stack bookkeeping. */
282
76e3520e 283STATIC CHECKPOINT
21553840 284S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
a0d0e21e 285{
a3b680e6 286 const int retval = PL_savestack_ix;
92da3157
DM
287 const int paren_elems_to_push =
288 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
289 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
290 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 291 I32 p;
40a82448 292 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 293
b93070ed
DM
294 PERL_ARGS_ASSERT_REGCPPUSH;
295
e49a9654 296 if (paren_elems_to_push < 0)
e8a85d26
JH
297 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
298 (int)paren_elems_to_push, (int)maxopenparen,
299 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 300
e0fa7e2b 301 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
147e3846 302 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
5df417d0 303 " out of range (%lu-%ld)",
92da3157
DM
304 total_elems,
305 (unsigned long)maxopenparen,
306 (long)parenfloor);
e0fa7e2b 307
620d5b66 308 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 309
495f47a5 310 DEBUG_BUFFERS_r(
92da3157 311 if ((int)maxopenparen > (int)parenfloor)
2b1a3689 312 Perl_re_exec_indentf( aTHX_
147e3846 313 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
2b1a3689
YO
314 depth,
315 PTR2UV(rex),
495f47a5
DM
316 PTR2UV(rex->offs)
317 );
318 );
92da3157 319 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 320/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
321 SSPUSHIV(rex->offs[p].end);
322 SSPUSHIV(rex->offs[p].start);
1ca2007e 323 SSPUSHINT(rex->offs[p].start_tmp);
2b1a3689 324 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 325 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
2b1a3689
YO
326 depth,
327 (UV)p,
495f47a5
DM
328 (IV)rex->offs[p].start,
329 (IV)rex->offs[p].start_tmp,
330 (IV)rex->offs[p].end
40a82448 331 ));
a0d0e21e 332 }
b1ce53c5 333/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 334 SSPUSHINT(maxopenparen);
b93070ed
DM
335 SSPUSHINT(rex->lastparen);
336 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 337 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 338
a0d0e21e
LW
339 return retval;
340}
341
c277df42 342/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
343#define REGCP_SET(cp) \
344 DEBUG_STATE_r( \
147e3846
KW
345 Perl_re_exec_indentf( aTHX_ \
346 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
cb41e5d6
YO
347 depth, (IV)PL_savestack_ix \
348 ) \
349 ); \
ab3bbdeb 350 cp = PL_savestack_ix
c3464db5 351
ab3bbdeb 352#define REGCP_UNWIND(cp) \
e4f74956 353 DEBUG_STATE_r( \
cb41e5d6 354 if (cp != PL_savestack_ix) \
147e3846
KW
355 Perl_re_exec_indentf( aTHX_ \
356 "Clearing an EVAL scope, savestack=%" \
357 IVdf "..%" IVdf "\n", \
cb41e5d6
YO
358 depth, (IV)(cp), (IV)PL_savestack_ix \
359 ) \
360 ); \
ab3bbdeb 361 regcpblow(cp)
c277df42 362
a8d1f4b4
DM
363#define UNWIND_PAREN(lp, lcp) \
364 for (n = rex->lastparen; n > lp; n--) \
365 rex->offs[n].end = -1; \
366 rex->lastparen = n; \
367 rex->lastcloseparen = lcp;
368
369
f067efbf 370STATIC void
21553840 371S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
a0d0e21e 372{
e0fa7e2b 373 UV i;
87c0511b 374 U32 paren;
a3621e74
YO
375 GET_RE_DEBUG_FLAGS_DECL;
376
7918f24d
NC
377 PERL_ARGS_ASSERT_REGCPPOP;
378
b1ce53c5 379 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 380 i = SSPOPUV;
e0fa7e2b
NC
381 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
382 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
383 rex->lastcloseparen = SSPOPINT;
384 rex->lastparen = SSPOPINT;
92da3157 385 *maxopenparen_p = SSPOPINT;
b1ce53c5 386
620d5b66 387 i -= REGCP_OTHER_ELEMS;
b1ce53c5 388 /* Now restore the parentheses context. */
495f47a5
DM
389 DEBUG_BUFFERS_r(
390 if (i || rex->lastparen + 1 <= rex->nparens)
2b1a3689 391 Perl_re_exec_indentf( aTHX_
147e3846 392 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
2b1a3689
YO
393 depth,
394 PTR2UV(rex),
495f47a5
DM
395 PTR2UV(rex->offs)
396 );
397 );
92da3157 398 paren = *maxopenparen_p;
620d5b66 399 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 400 SSize_t tmps;
1ca2007e 401 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
402 rex->offs[paren].start = SSPOPIV;
403 tmps = SSPOPIV;
b93070ed
DM
404 if (paren <= rex->lastparen)
405 rex->offs[paren].end = tmps;
2b1a3689 406 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 407 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
2b1a3689
YO
408 depth,
409 (UV)paren,
495f47a5
DM
410 (IV)rex->offs[paren].start,
411 (IV)rex->offs[paren].start_tmp,
412 (IV)rex->offs[paren].end,
413 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 414 );
87c0511b 415 paren--;
a0d0e21e 416 }
daf18116 417#if 1
dafc8851
JH
418 /* It would seem that the similar code in regtry()
419 * already takes care of this, and in fact it is in
420 * a better location to since this code can #if 0-ed out
421 * but the code in regtry() is needed or otherwise tests
422 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
423 * (as of patchlevel 7877) will fail. Then again,
424 * this code seems to be necessary or otherwise
225593e1
DM
425 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
426 * --jhi updated by dapm */
b93070ed 427 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 428 if (i > *maxopenparen_p)
b93070ed
DM
429 rex->offs[i].start = -1;
430 rex->offs[i].end = -1;
2b1a3689 431 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 432 " \\%" UVuf ": %s ..-1 undeffing\n",
2b1a3689
YO
433 depth,
434 (UV)i,
92da3157 435 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 436 ));
a0d0e21e 437 }
dafc8851 438#endif
a0d0e21e
LW
439}
440
74088413
DM
441/* restore the parens and associated vars at savestack position ix,
442 * but without popping the stack */
443
444STATIC void
21553840 445S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
74088413
DM
446{
447 I32 tmpix = PL_savestack_ix;
85882954
YO
448 PERL_ARGS_ASSERT_REGCP_RESTORE;
449
74088413 450 PL_savestack_ix = ix;
21553840 451 regcppop(rex, maxopenparen_p);
74088413
DM
452 PL_savestack_ix = tmpix;
453}
454
02db2b7b 455#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 456
9637d2a5
CB
457#ifndef PERL_IN_XSUB_RE
458
24e16d7b
KW
459bool
460Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
31c7f561
KW
461{
462 /* Returns a boolean as to whether or not 'character' is a member of the
463 * Posix character class given by 'classnum' that should be equivalent to a
464 * value in the typedef '_char_class_number'.
465 *
466 * Ideally this could be replaced by a just an array of function pointers
467 * to the C library functions that implement the macros this calls.
468 * However, to compile, the precise function signatures are required, and
469 * these may vary from platform to to platform. To avoid having to figure
470 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
471 * which adds an extra layer of function call overhead (unless the C
472 * optimizer strips it away). But we don't particularly care about
473 * performance with locales anyway. */
31c7f561
KW
474
475 switch ((_char_class_number) classnum) {
15861f94 476 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 477 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
478 case _CC_ENUM_ASCII: return isASCII_LC(character);
479 case _CC_ENUM_BLANK: return isBLANK_LC(character);
cee69f79 480 case _CC_ENUM_CASED: return isLOWER_LC(character)
b0d691b2 481 || isUPPER_LC(character);
e8d596e0 482 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
483 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
484 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
485 case _CC_ENUM_LOWER: return isLOWER_LC(character);
486 case _CC_ENUM_PRINT: return isPRINT_LC(character);
487 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 488 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
489 case _CC_ENUM_UPPER: return isUPPER_LC(character);
490 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 491 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
492 default: /* VERTSPACE should never occur in locales */
493 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
494 }
495
e5964223 496 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
497 return FALSE;
498}
499
9637d2a5
CB
500#endif
501
3018b823
KW
502STATIC bool
503S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
504{
505 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
506 * 'character' is a member of the Posix character class given by 'classnum'
507 * that should be equivalent to a value in the typedef
508 * '_char_class_number'.
509 *
510 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 511 * the range 0-255. Outside that range, all characters use Unicode
3018b823
KW
512 * rules, ignoring any locale. So use the Unicode function if this class
513 * requires a swash, and use the Unicode macro otherwise. */
514
515 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
516
517 if (UTF8_IS_INVARIANT(*character)) {
518 return isFOO_lc(classnum, *character);
519 }
520 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
521 return isFOO_lc(classnum,
a62b247b 522 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
523 }
524
613abc6d
KW
525 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
526
3018b823
KW
527 if (classnum < _FIRST_NON_SWASH_CC) {
528
529 /* Initialize the swash unless done already */
530 if (! PL_utf8_swash_ptrs[classnum]) {
531 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
532 PL_utf8_swash_ptrs[classnum] =
533 _core_swash_init("utf8",
534 "",
535 &PL_sv_undef, 1, 0,
536 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
537 }
538
92a2046b
KW
539 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
540 character,
541 TRUE /* is UTF */ ));
3018b823
KW
542 }
543
544 switch ((_char_class_number) classnum) {
779cf272 545 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
546 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
547 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
548 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 549 default: break;
3018b823
KW
550 }
551
e1ee3960 552 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
553}
554
aff4cafe
KW
555STATIC char *
556S_find_next_ascii(char * s, const char * send, const bool utf8_target)
557{
558 /* Returns the position of the first ASCII byte in the sequence between 's'
559 * and 'send-1' inclusive; returns 'send' if none found */
560
561 PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
562
597ee3f4 563#ifndef EBCDIC
aff4cafe
KW
564
565 if ((STRLEN) (send - s) >= PERL_WORDSIZE
566
567 /* This term is wordsize if subword; 0 if not */
568 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
569
570 /* 'offset' */
571 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
572 {
573
574 /* Process per-byte until reach word boundary. XXX This loop could be
575 * eliminated if we knew that this platform had fast unaligned reads */
576 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
577 if (isASCII(*s)) {
578 return s;
579 }
580 s++; /* khw didn't bother creating a separate loop for
581 utf8_target */
582 }
583
584 /* Here, we know we have at least one full word to process. Process
585 * per-word as long as we have at least a full word left */
586 do {
1d2af574
KW
587 PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
588 if (complemented & PERL_VARIANTS_WORD_MASK) {
589
590#if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
591 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
592
593 s += _variant_byte_number(complemented);
594 return s;
595
596#else /* If weird byte order, drop into next loop to do byte-at-a-time
597 checks. */
598
aff4cafe 599 break;
1d2af574 600#endif
aff4cafe 601 }
1d2af574 602
aff4cafe 603 s += PERL_WORDSIZE;
1d2af574 604
aff4cafe
KW
605 } while (s + PERL_WORDSIZE <= send);
606 }
607
608#endif
609
610 /* Process per-character */
611 if (utf8_target) {
612 while (s < send) {
613 if (isASCII(*s)) {
614 return s;
615 }
616 s += UTF8SKIP(s);
617 }
618 }
619 else {
620 while (s < send) {
621 if (isASCII(*s)) {
622 return s;
623 }
624 s++;
625 }
626 }
627
628 return s;
629}
630
631STATIC char *
632S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
633{
634 /* Returns the position of the first non-ASCII byte in the sequence between
635 * 's' and 'send-1' inclusive; returns 'send' if none found */
636
637#ifdef EBCDIC
638
639 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
640
641 if (utf8_target) {
642 while (s < send) {
643 if ( ! isASCII(*s)) {
644 return s;
645 }
646 s += UTF8SKIP(s);
647 }
648 }
649 else {
650 while (s < send) {
651 if ( ! isASCII(*s)) {
652 return s;
653 }
654 s++;
655 }
656 }
657
658 return s;
659
660#else
661
662 const U8 * next_non_ascii = NULL;
663
664 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
665 PERL_UNUSED_ARG(utf8_target);
666
667 /* On ASCII platforms invariants and ASCII are identical, so if the string
668 * is entirely invariants, there is no non-ASCII character */
669 return (is_utf8_invariant_string_loc((U8 *) s,
670 (STRLEN) (send - s),
671 &next_non_ascii))
672 ? (char *) send
673 : (char *) next_non_ascii;
674
675#endif
676
677}
678
ab1efbdc
KW
679STATIC char *
680S_find_span_end(char * s, const char * send, const char span_byte)
681{
682 /* Returns the position of the first byte in the sequence between 's' and
683 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
684 * */
685
686 PERL_ARGS_ASSERT_FIND_SPAN_END;
687
688 assert(send >= s);
689
690 if ((STRLEN) (send - s) >= PERL_WORDSIZE
691 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
692 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
693 {
694 PERL_UINTMAX_T span_word;
695
696 /* Process per-byte until reach word boundary. XXX This loop could be
697 * eliminated if we knew that this platform had fast unaligned reads */
698 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
699 if (*s != span_byte) {
700 return s;
701 }
702 s++;
703 }
704
705 /* Create a word filled with the bytes we are spanning */
706 span_word = PERL_COUNT_MULTIPLIER * span_byte;
707
708 /* Process per-word as long as we have at least a full word left */
709 do {
710
711 /* Keep going if the whole word is composed of 'span_byte's */
712 if ((* (PERL_UINTMAX_T *) s) == span_word) {
713 s += PERL_WORDSIZE;
714 continue;
715 }
716
717 /* Here, at least one byte in the word isn't 'span_byte'. This xor
718 * leaves 1 bits only in those non-matching bytes */
719 span_word ^= * (PERL_UINTMAX_T *) s;
720
721 /* Make sure the upper bit of each non-matching byte is set. This
722 * makes each such byte look like an ASCII platform variant byte */
723 span_word |= span_word << 1;
724 span_word |= span_word << 2;
725 span_word |= span_word << 4;
726
727 /* That reduces the problem to what this function solves */
728 return s + _variant_byte_number(span_word);
729
730 } while (s + PERL_WORDSIZE <= send);
731 }
732
733 /* Process the straggler bytes beyond the final word boundary */
734 while (s < send) {
735 if (*s != span_byte) {
736 return s;
737 }
738 s++;
739 }
740
741 return s;
742}
743
2813d4ad
KW
744STATIC char *
745S_find_next_masked(char * s, const char * send, const U8 byte, const U8 mask)
746{
747 /* Returns the position of the first byte in the sequence between 's'
748 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
749 * returns 'send' if none found. It uses word-level operations instead of
750 * byte to speed up the process */
751
752 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
753
754 assert(send >= s);
755 assert((byte & mask) == byte);
756
757 if ((STRLEN) (send - s) >= PERL_WORDSIZE
758 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
759 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
760 {
761 PERL_UINTMAX_T word_complemented, mask_word;
762
763 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
764 if (((* (U8 *) s) & mask) == byte) {
765 return s;
766 }
767 s++;
768 }
769
770 word_complemented = ~ (PERL_COUNT_MULTIPLIER * byte);
771 mask_word = PERL_COUNT_MULTIPLIER * mask;
772
773 do {
774 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
775
776 /* If 'masked' contains 'byte' within it, anding with the
777 * complement will leave those 8 bits 0 */
778 masked &= word_complemented;
779
780 /* This causes the most significant bit to be set to 1 for any
781 * bytes in the word that aren't completely 0 */
782 masked |= masked << 1;
783 masked |= masked << 2;
784 masked |= masked << 4;
785
786 /* The msbits are the same as what marks a byte as variant, so we
787 * can use this mask. If all msbits are 1, the word doesn't
788 * contain 'byte' */
789 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
790 s += PERL_WORDSIZE;
791 continue;
792 }
793
794 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
795 * and any that are, are 0. Complement and re-AND to swap that */
796 masked = ~ masked;
797 masked &= PERL_VARIANTS_WORD_MASK;
798
799 /* This reduces the problem to that solved by this function */
800 s += _variant_byte_number(masked);
801 return s;
802
803 } while (s + PERL_WORDSIZE <= send);
804 }
805
806 while (s < send) {
807 if (((* (U8 *) s) & mask) == byte) {
808 return s;
809 }
810 s++;
811 }
812
813 return s;
814}
815
070e8b2e
KW
816STATIC U8 *
817S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
818{
819 /* Returns the position of the first byte in the sequence between 's' and
820 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
821 * 'span_byte' should have been ANDed with 'mask' in the call of this
822 * function. Returns 'send' if none found. Works like find_span_end(),
823 * except for the AND */
824
825 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
826
827 assert(send >= s);
828 assert((span_byte & mask) == span_byte);
829
830 if ((STRLEN) (send - s) >= PERL_WORDSIZE
831 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
832 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
833 {
834 PERL_UINTMAX_T span_word, mask_word;
835
836 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
837 if (((* (U8 *) s) & mask) != span_byte) {
838 return s;
839 }
840 s++;
841 }
842
843 span_word = PERL_COUNT_MULTIPLIER * span_byte;
844 mask_word = PERL_COUNT_MULTIPLIER * mask;
845
846 do {
847 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
848
849 if (masked == span_word) {
850 s += PERL_WORDSIZE;
851 continue;
852 }
853
854 masked ^= span_word;
855 masked |= masked << 1;
856 masked |= masked << 2;
857 masked |= masked << 4;
858 return s + _variant_byte_number(masked);
859
860 } while (s + PERL_WORDSIZE <= send);
861 }
862
863 while (s < send) {
864 if (((* (U8 *) s) & mask) != span_byte) {
865 return s;
866 }
867 s++;
868 }
869
870 return s;
871}
872
a687059c 873/*
e50aee73 874 * pregexec and friends
a687059c
LW
875 */
876
76234dfb 877#ifndef PERL_IN_XSUB_RE
a687059c 878/*
c277df42 879 - pregexec - match a regexp against a string
a687059c 880 */
c277df42 881I32
5aaab254 882Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 883 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
884/* stringarg: the point in the string at which to begin matching */
885/* strend: pointer to null at end of string */
886/* strbeg: real beginning of string */
887/* minend: end of match must be >= minend bytes after stringarg. */
888/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
889 * itself is accessed via the pointers above */
890/* nosave: For optimizations. */
c277df42 891{
7918f24d
NC
892 PERL_ARGS_ASSERT_PREGEXEC;
893
c277df42 894 return
9041c2e3 895 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
896 nosave ? 0 : REXEC_COPY_STR);
897}
76234dfb 898#endif
22e551b9 899
cad2e5aa 900
6eb5f6b9 901
1a4edc3c
DM
902/* re_intuit_start():
903 *
904 * Based on some optimiser hints, try to find the earliest position in the
905 * string where the regex could match.
906 *
907 * rx: the regex to match against
908 * sv: the SV being matched: only used for utf8 flag; the string
909 * itself is accessed via the pointers below. Note that on
910 * something like an overloaded SV, SvPOK(sv) may be false
911 * and the string pointers may point to something unrelated to
912 * the SV itself.
913 * strbeg: real beginning of string
914 * strpos: the point in the string at which to begin matching
915 * strend: pointer to the byte following the last char of the string
916 * flags currently unused; set to 0
917 * data: currently unused; set to NULL
918 *
919 * The basic idea of re_intuit_start() is to use some known information
920 * about the pattern, namely:
921 *
922 * a) the longest known anchored substring (i.e. one that's at a
923 * constant offset from the beginning of the pattern; but not
924 * necessarily at a fixed offset from the beginning of the
925 * string);
926 * b) the longest floating substring (i.e. one that's not at a constant
927 * offset from the beginning of the pattern);
928 * c) Whether the pattern is anchored to the string; either
929 * an absolute anchor: /^../, or anchored to \n: /^.../m,
930 * or anchored to pos(): /\G/;
931 * d) A start class: a real or synthetic character class which
932 * represents which characters are legal at the start of the pattern;
933 *
934 * to either quickly reject the match, or to find the earliest position
935 * within the string at which the pattern might match, thus avoiding
936 * running the full NFA engine at those earlier locations, only to
937 * eventually fail and retry further along.
938 *
939 * Returns NULL if the pattern can't match, or returns the address within
940 * the string which is the earliest place the match could occur.
941 *
942 * The longest of the anchored and floating substrings is called 'check'
943 * and is checked first. The other is called 'other' and is checked
944 * second. The 'other' substring may not be present. For example,
945 *
946 * /(abc|xyz)ABC\d{0,3}DEFG/
947 *
948 * will have
949 *
950 * check substr (float) = "DEFG", offset 6..9 chars
951 * other substr (anchored) = "ABC", offset 3..3 chars
952 * stclass = [ax]
953 *
954 * Be aware that during the course of this function, sometimes 'anchored'
955 * refers to a substring being anchored relative to the start of the
956 * pattern, and sometimes to the pattern itself being anchored relative to
957 * the string. For example:
958 *
959 * /\dabc/: "abc" is anchored to the pattern;
960 * /^\dabc/: "abc" is anchored to the pattern and the string;
961 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
962 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
963 * but the pattern is anchored to the string.
52a21eb3
DM
964 */
965
cad2e5aa 966char *
52a21eb3
DM
967Perl_re_intuit_start(pTHX_
968 REGEXP * const rx,
969 SV *sv,
970 const char * const strbeg,
971 char *strpos,
972 char *strend,
973 const U32 flags,
974 re_scream_pos_data *data)
cad2e5aa 975{
8d919b0a 976 struct regexp *const prog = ReANY(rx);
b2ad2123
DM
977 SSize_t start_shift = prog->check_offset_min;
978 /* Should be nonnegative! */
979 SSize_t end_shift = 0;
0fc004dd
DM
980 /* current lowest pos in string where the regex can start matching */
981 char *rx_origin = strpos;
eb578fdb 982 SV *check;
f2ed9b32 983 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 984 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 985 bool ml_anch = 0;
8f4bf5fc 986 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 987 char *check_at = NULL; /* check substr found at this pos */
bbe252da 988 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 989 RXi_GET_DECL(prog,progi);
02d5137b
DM
990 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
991 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
992 GET_RE_DEBUG_FLAGS_DECL;
993
7918f24d 994 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
995 PERL_UNUSED_ARG(flags);
996 PERL_UNUSED_ARG(data);
7918f24d 997
6ad9a8ab 998 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0
DM
999 "Intuit: trying to determine minimum start position...\n"));
1000
fb9bbddb 1001 /* for now, assume that all substr offsets are positive. If at some point
f67a5002 1002 * in the future someone wants to do clever things with lookbehind and
fb9bbddb
DM
1003 * -ve offsets, they'll need to fix up any code in this function
1004 * which uses these offsets. See the thread beginning
1005 * <20140113145929.GF27210@iabyn.com>
1006 */
1007 assert(prog->substrs->data[0].min_offset >= 0);
1008 assert(prog->substrs->data[0].max_offset >= 0);
1009 assert(prog->substrs->data[1].min_offset >= 0);
1010 assert(prog->substrs->data[1].max_offset >= 0);
1011 assert(prog->substrs->data[2].min_offset >= 0);
1012 assert(prog->substrs->data[2].max_offset >= 0);
1013
f7022b5a 1014 /* for now, assume that if both present, that the floating substring
83f2232d 1015 * doesn't start before the anchored substring.
f7022b5a
DM
1016 * If you break this assumption (e.g. doing better optimisations
1017 * with lookahead/behind), then you'll need to audit the code in this
1018 * function carefully first
1019 */
1020 assert(
1021 ! ( (prog->anchored_utf8 || prog->anchored_substr)
1022 && (prog->float_utf8 || prog->float_substr))
1023 || (prog->float_min_offset >= prog->anchored_offset));
1024
1a4edc3c
DM
1025 /* byte rather than char calculation for efficiency. It fails
1026 * to quickly reject some cases that can't match, but will reject
1027 * them later after doing full char arithmetic */
c344f387 1028 if (prog->minlen > strend - strpos) {
6ad9a8ab 1029 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1030 " String too short...\n"));
cad2e5aa 1031 goto fail;
2c2d71f5 1032 }
d8da0584 1033
196a02af 1034 RXp_MATCH_UTF8_set(prog, utf8_target);
6c3fea77 1035 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 1036 reginfo->info_aux = NULL;
9d9163fb 1037 reginfo->strbeg = strbeg;
220db18a 1038 reginfo->strend = strend;
aed7b151 1039 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 1040 reginfo->intuit = 1;
1cb48e53
DM
1041 /* not actually used within intuit, but zero for safety anyway */
1042 reginfo->poscache_maxiter = 0;
02d5137b 1043
f2ed9b32 1044 if (utf8_target) {
2814f4b3
HS
1045 if ((!prog->anchored_utf8 && prog->anchored_substr)
1046 || (!prog->float_utf8 && prog->float_substr))
33b8afdf
JH
1047 to_utf8_substr(prog);
1048 check = prog->check_utf8;
1049 } else {
7e0d5ad7
KW
1050 if (!prog->check_substr && prog->check_utf8) {
1051 if (! to_byte_substr(prog)) {
6b54ddc5 1052 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
1053 }
1054 }
33b8afdf
JH
1055 check = prog->check_substr;
1056 }
274cd312 1057
1dc475d0
DM
1058 /* dump the various substring data */
1059 DEBUG_OPTIMISE_MORE_r({
1060 int i;
1061 for (i=0; i<=2; i++) {
1062 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1063 : prog->substrs->data[i].substr);
1064 if (!sv)
1065 continue;
1066
6ad9a8ab 1067 Perl_re_printf( aTHX_
147e3846
KW
1068 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1069 " useful=%" IVdf " utf8=%d [%s]\n",
1dc475d0
DM
1070 i,
1071 (IV)prog->substrs->data[i].min_offset,
1072 (IV)prog->substrs->data[i].max_offset,
1073 (IV)prog->substrs->data[i].end_shift,
1074 BmUSEFUL(sv),
1075 utf8_target ? 1 : 0,
1076 SvPEEK(sv));
1077 }
1078 });
1079
8e1490ee 1080 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
1081
1082 /* ml_anch: check after \n?
1083 *
0fa70a06 1084 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
1085 * with /.*.../, these flags will have been added by the
1086 * compiler:
1087 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
1088 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
1089 */
7d2d37f5
DM
1090 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
1091 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 1092
343c8a29 1093 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
1094 /* we are only allowed to match at BOS or \G */
1095
57fcbfa7 1096 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
1097 *
1098 * Note that we don't try to do a similar quick reject for
1099 * \G, since generally the caller will have calculated strpos
1100 * based on pos() and gofs, so the string is already correctly
1101 * anchored by definition; and handling the exceptions would
1102 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 1103 */
7bb3b9eb 1104 if ( strpos != strbeg
d3d47aac 1105 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8 1106 {
6ad9a8ab 1107 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1108 " Not at start...\n"));
c889ccc8
DM
1109 goto fail;
1110 }
1111
a5d12a4b
DM
1112 /* in the presence of an anchor, the anchored (relative to the
1113 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
1114 * to strpos. So quickly reject if substr isn't found there.
1115 * This works for \G too, because the caller will already have
1116 * subtracted gofs from pos, and gofs is the offset from the
1117 * \G to the start of the regex. For example, in /.abc\Gdef/,
1118 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1119 * caller will have set strpos=pos()-4; we look for the substr
1120 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 1121
33c28ab2 1122 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 1123 /* Substring at constant offset from beg-of-str... */
b2ad2123 1124 SSize_t slen = SvCUR(check);
343c8a29 1125 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 1126
6ad9a8ab 1127 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1128 " Looking for check substr at fixed offset %" IVdf "...\n",
1dc475d0
DM
1129 (IV)prog->check_offset_min));
1130
7742aa66
DM
1131 if (SvTAIL(check)) {
1132 /* In this case, the regex is anchored at the end too.
1133 * Unless it's a multiline match, the lengths must match
b2ad2123 1134 * exactly, give or take a \n. NB: slen >= 1 since
7742aa66
DM
1135 * the last char of check is \n */
1136 if (!multiline
b2ad2123
DM
1137 && ( strend - s > slen
1138 || strend - s < slen - 1
1139 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8 1140 {
6ad9a8ab 1141 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1142 " String too long...\n"));
c889ccc8
DM
1143 goto fail_finish;
1144 }
b2ad2123
DM
1145 /* Now should match s[0..slen-2] */
1146 slen--;
c889ccc8 1147 }
b2ad2123 1148 if (slen && (strend - s < slen
26fb2318 1149 || *SvPVX_const(check) != *s
b2ad2123 1150 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
d307bf57 1151 {
6ad9a8ab 1152 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1153 " String not equal...\n"));
d307bf57
DM
1154 goto fail_finish;
1155 }
c889ccc8
DM
1156
1157 check_at = s;
1158 goto success_at_start;
cad2e5aa 1159 }
cad2e5aa 1160 }
cad2e5aa 1161 }
0fc004dd 1162
b2ad2123 1163 end_shift = prog->check_end_shift;
cad2e5aa 1164
19188028 1165#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
b2ad2123 1166 if (end_shift < 0)
147e3846 1167 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
b2ad2123 1168 (IV)end_shift, RX_PRECOMP(rx));
2c2d71f5
JH
1169#endif
1170
2c2d71f5 1171 restart:
1de06328 1172
66b7ec5c
DM
1173 /* This is the (re)entry point of the main loop in this function.
1174 * The goal of this loop is to:
1175 * 1) find the "check" substring in the region rx_origin..strend
b2ad2123 1176 * (adjusted by start_shift / end_shift). If not found, reject
66b7ec5c
DM
1177 * immediately.
1178 * 2) If it exists, look for the "other" substr too if defined; for
1179 * example, if the check substr maps to the anchored substr, then
1180 * check the floating substr, and vice-versa. If not found, go
1181 * back to (1) with rx_origin suitably incremented.
1182 * 3) If we find an rx_origin position that doesn't contradict
1183 * either of the substrings, then check the possible additional
1184 * constraints on rx_origin of /^.../m or a known start class.
1185 * If these fail, then depending on which constraints fail, jump
1186 * back to here, or to various other re-entry points further along
1187 * that skip some of the first steps.
1188 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1189 * substring. If the start position was determined to be at the
1190 * beginning of the string - so, not rejected, but not optimised,
1191 * since we have to run regmatch from position 0 - decrement the
1192 * BmUSEFUL() count. Otherwise increment it.
1193 */
1194
1a4edc3c
DM
1195
1196 /* first, look for the 'check' substring */
1197
1de06328 1198 {
c33e64f0
FC
1199 U8* start_point;
1200 U8* end_point;
c889ccc8 1201
c889ccc8 1202 DEBUG_OPTIMISE_MORE_r({
6ad9a8ab 1203 Perl_re_printf( aTHX_
147e3846
KW
1204 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1205 " Start shift: %" IVdf " End shift %" IVdf
1206 " Real end Shift: %" IVdf "\n",
675e93ee 1207 (IV)(rx_origin - strbeg),
c889ccc8 1208 (IV)prog->check_offset_min,
b2ad2123
DM
1209 (IV)start_shift,
1210 (IV)end_shift,
c889ccc8
DM
1211 (IV)prog->check_end_shift);
1212 });
1de06328 1213
b2ad2123 1214 end_point = HOPBACK3(strend, end_shift, rx_origin);
bb152a4b
DM
1215 if (!end_point)
1216 goto fail_finish;
b2ad2123 1217 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
33c28ab2
DM
1218 if (!start_point)
1219 goto fail_finish;
c889ccc8 1220
557f47af 1221
e0362b86 1222 /* If the regex is absolutely anchored to either the start of the
d3d47aac 1223 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
1224 * check_offset_max represents an upper bound on the string where
1225 * the substr could start. For the ANCH_GPOS case, we assume that
1226 * the caller of intuit will have already set strpos to
1227 * pos()-gofs, so in this case strpos + offset_max will still be
1228 * an upper bound on the substr.
1229 */
c19c836a
DM
1230 if (!ml_anch
1231 && prog->intflags & PREGf_ANCH
e0362b86 1232 && prog->check_offset_max != SSize_t_MAX)
c19c836a 1233 {
b2ad2123 1234 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
1235 const char * const anchor =
1236 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
b2ad2123 1237 SSize_t targ_len = (char*)end_point - anchor;
2ce94a86 1238
b2ad2123 1239 if (check_len > targ_len) {
2ce94a86
DM
1240 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1241 "Anchored string too short...\n"));
1242 goto fail_finish;
1243 }
e0362b86
DM
1244
1245 /* do a bytes rather than chars comparison. It's conservative;
1246 * so it skips doing the HOP if the result can't possibly end
1247 * up earlier than the old value of end_point.
1248 */
b2ad2123
DM
1249 assert(anchor + check_len <= (char *)end_point);
1250 if (prog->check_offset_max + check_len < targ_len) {
e0362b86
DM
1251 end_point = HOP3lim((U8*)anchor,
1252 prog->check_offset_max,
b2ad2123 1253 end_point - check_len
2ce94a86 1254 )
b2ad2123 1255 + check_len;
e0362b86 1256 }
d6ef1678
DM
1257 }
1258
ae5d4331 1259 check_at = fbm_instr( start_point, end_point,
7fba1cd6 1260 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 1261
6ad9a8ab 1262 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1263 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
675e93ee
DM
1264 (IV)((char*)start_point - strbeg),
1265 (IV)((char*)end_point - strbeg),
1266 (IV)(check_at ? check_at - strbeg : -1)
1267 ));
1268
8fd34720
DM
1269 /* Update the count-of-usability, remove useless subpatterns,
1270 unshift s. */
1271
1272 DEBUG_EXECUTE_r({
1273 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1274 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
6ad9a8ab 1275 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
8fd34720
DM
1276 (check_at ? "Found" : "Did not find"),
1277 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1278 ? "anchored" : "floating"),
1279 quoted,
1280 RE_SV_TAIL(check),
1281 (check_at ? " at offset " : "...\n") );
1282 });
2c2d71f5 1283
8fd34720
DM
1284 if (!check_at)
1285 goto fail_finish;
8fd34720
DM
1286 /* set rx_origin to the minimum position where the regex could start
1287 * matching, given the constraint of the just-matched check substring.
1288 * But don't set it lower than previously.
1289 */
fdc003fd 1290
8fd34720
DM
1291 if (check_at - rx_origin > prog->check_offset_max)
1292 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee 1293 /* Finish the diagnostic message */
6ad9a8ab 1294 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1295 "%ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1296 (long)(check_at - strbeg),
1297 (IV)(rx_origin - strbeg)
1298 ));
8fd34720 1299 }
fdc003fd
DM
1300
1301
1a4edc3c 1302 /* now look for the 'other' substring if defined */
2c2d71f5 1303
6480a6c4
DM
1304 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1305 : prog->substrs->data[other_ix].substr)
1de06328 1306 {
30944b6d 1307 /* Take into account the "other" substring. */
6c3343a6
DM
1308 char *last, *last1;
1309 char *s;
1310 SV* must;
1311 struct reg_substr_datum *other;
1312
1313 do_other_substr:
1314 other = &prog->substrs->data[other_ix];
1315
1316 /* if "other" is anchored:
1317 * we've previously found a floating substr starting at check_at.
1318 * This means that the regex origin must lie somewhere
1319 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1320 * and max: HOP3(check_at, -check_offset_min)
1321 * (except that min will be >= strpos)
1322 * So the fixed substr must lie somewhere between
1323 * HOP3(min, anchored_offset)
1324 * HOP3(max, anchored_offset) + SvCUR(substr)
1325 */
1326
1327 /* if "other" is floating
1328 * Calculate last1, the absolute latest point where the
1329 * floating substr could start in the string, ignoring any
1330 * constraints from the earlier fixed match. It is calculated
1331 * as follows:
1332 *
1333 * strend - prog->minlen (in chars) is the absolute latest
1334 * position within the string where the origin of the regex
1335 * could appear. The latest start point for the floating
1336 * substr is float_min_offset(*) on from the start of the
1337 * regex. last1 simply combines thee two offsets.
1338 *
1339 * (*) You might think the latest start point should be
1340 * float_max_offset from the regex origin, and technically
1341 * you'd be correct. However, consider
1342 * /a\d{2,4}bcd\w/
1343 * Here, float min, max are 3,5 and minlen is 7.
1344 * This can match either
1345 * /a\d\dbcd\w/
1346 * /a\d\d\dbcd\w/
1347 * /a\d\d\d\dbcd\w/
1348 * In the first case, the regex matches minlen chars; in the
1349 * second, minlen+1, in the third, minlen+2.
1350 * In the first case, the floating offset is 3 (which equals
1351 * float_min), in the second, 4, and in the third, 5 (which
1352 * equals float_max). In all cases, the floating string bcd
1353 * can never start more than 4 chars from the end of the
1354 * string, which equals minlen - float_min. As the substring
1355 * starts to match more than float_min from the start of the
1356 * regex, it makes the regex match more than minlen chars,
1357 * and the two cancel each other out. So we can always use
1358 * float_min - minlen, rather than float_max - minlen for the
1359 * latest position in the string.
1360 *
1361 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1362 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1363 */
1364
e7a14a9c 1365 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1366 last1 = HOP3c(strend,
1367 other->min_offset - prog->minlen, strbeg);
1368
4d006249 1369 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1370 /* last is the latest point where the floating substr could
1371 * start, *given* any constraints from the earlier fixed
1372 * match. This constraint is that the floating string starts
1373 * <= float_max_offset chars from the regex origin (rx_origin).
1374 * If this value is less than last1, use it instead.
eb3831ce 1375 */
6c3343a6
DM
1376 assert(rx_origin <= last1);
1377 last =
1378 /* this condition handles the offset==infinity case, and
1379 * is a short-cut otherwise. Although it's comparing a
1380 * byte offset to a char length, it does so in a safe way,
1381 * since 1 char always occupies 1 or more bytes,
1382 * so if a string range is (last1 - rx_origin) bytes,
1383 * it will be less than or equal to (last1 - rx_origin)
1384 * chars; meaning it errs towards doing the accurate HOP3
1385 * rather than just using last1 as a short-cut */
1386 (last1 - rx_origin) < other->max_offset
1387 ? last1
1388 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1389 }
1390 else {
b2ad2123
DM
1391 assert(strpos + start_shift <= check_at);
1392 last = HOP4c(check_at, other->min_offset - start_shift,
6c3343a6
DM
1393 strbeg, strend);
1394 }
ead917d0 1395
6c3343a6
DM
1396 s = HOP3c(rx_origin, other->min_offset, strend);
1397 if (s < other_last) /* These positions already checked */
1398 s = other_last;
1399
1400 must = utf8_target ? other->utf8_substr : other->substr;
1401 assert(SvPOK(must));
675e93ee
DM
1402 {
1403 char *from = s;
1404 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1405
71a9d105
DM
1406 if (to > strend)
1407 to = strend;
88203927
DM
1408 if (from > to) {
1409 s = NULL;
6ad9a8ab 1410 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1411 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
88203927
DM
1412 (IV)(from - strbeg),
1413 (IV)(to - strbeg)
1414 ));
1415 }
1416 else {
1417 s = fbm_instr(
1418 (unsigned char*)from,
1419 (unsigned char*)to,
1420 must,
1421 multiline ? FBMrf_MULTILINE : 0
1422 );
6ad9a8ab 1423 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1424 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
88203927
DM
1425 (IV)(from - strbeg),
1426 (IV)(to - strbeg),
1427 (IV)(s ? s - strbeg : -1)
1428 ));
1429 }
675e93ee
DM
1430 }
1431
6c3343a6
DM
1432 DEBUG_EXECUTE_r({
1433 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1434 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 1435 Perl_re_printf( aTHX_ " %s %s substr %s%s",
6c3343a6
DM
1436 s ? "Found" : "Contradicts",
1437 other_ix ? "floating" : "anchored",
1438 quoted, RE_SV_TAIL(must));
1439 });
ead917d0 1440
ead917d0 1441
6c3343a6
DM
1442 if (!s) {
1443 /* last1 is latest possible substr location. If we didn't
1444 * find it before there, we never will */
1445 if (last >= last1) {
6ad9a8ab 1446 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee 1447 "; giving up...\n"));
6c3343a6 1448 goto fail_finish;
ead917d0
DM
1449 }
1450
6c3343a6
DM
1451 /* try to find the check substr again at a later
1452 * position. Maybe next time we'll find the "other" substr
1453 * in range too */
6c3343a6
DM
1454 other_last = HOP3c(last, 1, strend) /* highest failure */;
1455 rx_origin =
4d006249 1456 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1457 ? HOP3c(rx_origin, 1, strend)
1458 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
6ad9a8ab 1459 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1460 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1461 (other_ix ? "floating" : "anchored"),
1462 (long)(HOP3c(check_at, 1, strend) - strbeg),
1463 (IV)(rx_origin - strbeg)
1464 ));
6c3343a6
DM
1465 goto restart;
1466 }
1467 else {
4d006249 1468 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1469 /* other_last is set to s, not s+1, since its possible for
1470 * a floating substr to fail first time, then succeed
1471 * second time at the same floating position; e.g.:
1472 * "-AB--AABZ" =~ /\wAB\d*Z/
1473 * The first time round, anchored and float match at
1474 * "-(AB)--AAB(Z)" then fail on the initial \w character
1475 * class. Second time round, they match at "-AB--A(AB)(Z)".
1476 */
1477 other_last = s;
ead917d0
DM
1478 }
1479 else {
6c3343a6
DM
1480 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1481 other_last = HOP3c(s, 1, strend);
ead917d0 1482 }
6ad9a8ab 1483 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1484 " at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1485 (long)(s - strbeg),
1486 (IV)(rx_origin - strbeg)
1487 ));
1488
6c3343a6 1489 }
cad2e5aa 1490 }
acba93e8
DM
1491 else {
1492 DEBUG_OPTIMISE_MORE_r(
6ad9a8ab 1493 Perl_re_printf( aTHX_
147e3846
KW
1494 " Check-only match: offset min:%" IVdf " max:%" IVdf
1495 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1496 " strend:%" IVdf "\n",
acba93e8
DM
1497 (IV)prog->check_offset_min,
1498 (IV)prog->check_offset_max,
675e93ee
DM
1499 (IV)(check_at-strbeg),
1500 (IV)(rx_origin-strbeg),
1c1c599d 1501 (IV)(rx_origin-check_at),
675e93ee 1502 (IV)(strend-strbeg)
acba93e8
DM
1503 )
1504 );
1505 }
2c2d71f5 1506
acba93e8 1507 postprocess_substr_matches:
0991020e 1508
1a4edc3c 1509 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1510
7d2d37f5 1511 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1512 char *s;
1513
6ad9a8ab 1514 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a62659bd 1515 " looking for /^/m anchor"));
d0880ea7
DM
1516
1517 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1518 * Find the next \n, if any, even if it's beyond the current
1519 * anchored and/or floating substrings. Whether we should be
1520 * scanning ahead for the next \n or the next substr is debatable.
1521 * On the one hand you'd expect rare substrings to appear less
1522 * often than \n's. On the other hand, searching for \n means
675e93ee 1523 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1524 * iteration as the current "rarest" string candidate, which
1525 * means for example that we'll quickly reject the whole string if
1526 * hasn't got a \n, rather than trying every substr position
1527 * first
1528 */
d0880ea7 1529
4620cb61
DM
1530 s = HOP3c(strend, - prog->minlen, strpos);
1531 if (s <= rx_origin ||
1532 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1533 {
6ad9a8ab 1534 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d0880ea7
DM
1535 " Did not find /%s^%s/m...\n",
1536 PL_colors[0], PL_colors[1]));
a62659bd
DM
1537 goto fail_finish;
1538 }
d0880ea7 1539
4ada1233
DM
1540 /* earliest possible origin is 1 char after the \n.
1541 * (since *rx_origin == '\n', it's safe to ++ here rather than
1542 * HOP(rx_origin, 1)) */
1543 rx_origin++;
d0880ea7 1544
f4f115de 1545 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1546 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1547 {
d0880ea7
DM
1548 /* Position contradicts check-string; either because
1549 * check was anchored (and thus has no wiggle room),
4ada1233 1550 * or check was float and rx_origin is above the float range */
6ad9a8ab 1551 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1552 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1553 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1554 goto restart;
1555 }
1556
1557 /* if we get here, the check substr must have been float,
2e759faa 1558 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1559 * "other" substr which still contradicts */
1560 assert(prog->substrs->check_ix); /* check is float */
1561
1562 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1563 /* whoops, the anchored "other" substr exists, so we still
1564 * contradict. On the other hand, the float "check" substr
1565 * didn't contradict, so just retry the anchored "other"
1566 * substr */
6ad9a8ab 1567 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1568 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
d0880ea7 1569 PL_colors[0], PL_colors[1],
73e8ff00
DM
1570 (IV)(rx_origin - strbeg + prog->anchored_offset),
1571 (IV)(rx_origin - strbeg)
675e93ee 1572 ));
d0880ea7
DM
1573 goto do_other_substr;
1574 }
1575
1576 /* success: we don't contradict the found floating substring
1577 * (and there's no anchored substr). */
6ad9a8ab 1578 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1579 " Found /%s^%s/m with rx_origin %ld...\n",
1580 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1581 }
1582 else {
6ad9a8ab 1583 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
fe4f3442 1584 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1585 }
1586
ffad1e6a 1587 success_at_start:
e3c6feb0 1588
cad2e5aa 1589
dd170ff5
DM
1590 /* if we have a starting character class, then test that extra constraint.
1591 * (trie stclasses are too expensive to use here, we are better off to
1592 * leave it to regmatch itself) */
1593
f8fc2ecf 1594 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1595 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1596
b2ad2123
DM
1597 /* XXX this value could be pre-computed */
1598 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1599 ? (reginfo->is_utf8_pat
1600 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1601 : STR_LEN(progi->regstclass))
66e933ab 1602 : 1);
1de06328 1603 char * endpos;
fa3bb21d 1604 char *s;
000dfd2d
DM
1605 /* latest pos that a matching float substr constrains rx start to */
1606 char *rx_max_float = NULL;
1607
c75a3985
DM
1608 /* if the current rx_origin is anchored, either by satisfying an
1609 * anchored substring constraint, or a /^.../m constraint, then we
1610 * can reject the current origin if the start class isn't found
1611 * at the current position. If we have a float-only match, then
1612 * rx_origin is constrained to a range; so look for the start class
1613 * in that range. if neither, then look for the start class in the
1614 * whole rest of the string */
1615
dd170ff5
DM
1616 /* XXX DAPM it's not clear what the minlen test is for, and why
1617 * it's not used in the floating case. Nothing in the test suite
1618 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1619 * Here are some old comments, which may or may not be correct:
1620 *
1621 * minlen == 0 is possible if regstclass is \b or \B,
1622 * and the fixed substr is ''$.
1623 * Since minlen is already taken into account, rx_origin+1 is
1624 * before strend; accidentally, minlen >= 1 guaranties no false
1625 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1626 * 0) below assumes that regstclass does not come from lookahead...
1627 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1628 * This leaves EXACTF-ish only, which are dealt with in
1629 * find_byclass().
1630 */
1631
7d2d37f5 1632 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
b2ad2123 1633 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d 1634 else if (prog->float_substr || prog->float_utf8) {
b2ad2123
DM
1635 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1636 endpos = HOP3clim(rx_max_float, cl_l, strend);
000dfd2d 1637 }
1de06328
YO
1638 else
1639 endpos= strend;
1640
6ad9a8ab 1641 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
b2ad2123 1642 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
147e3846 1643 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
b2ad2123 1644 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1645 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1646
c43b5520 1647 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1648 reginfo);
be778b1a 1649 if (!s) {
6eb5f6b9 1650 if (endpos == strend) {
6ad9a8ab 1651 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1652 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1653 goto fail;
1654 }
6ad9a8ab 1655 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1656 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1657 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1658 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1659 goto fail;
9fed8d02 1660
6eb5f6b9 1661 /* Contradict one of substrings */
97136c8a
DM
1662 if (prog->anchored_substr || prog->anchored_utf8) {
1663 if (prog->substrs->check_ix == 1) { /* check is float */
1664 /* Have both, check_string is floating */
b2ad2123
DM
1665 assert(rx_origin + start_shift <= check_at);
1666 if (rx_origin + start_shift != check_at) {
97136c8a 1667 /* not at latest position float substr could match:
c75a3985
DM
1668 * Recheck anchored substring, but not floating.
1669 * The condition above is in bytes rather than
1670 * chars for efficiency. It's conservative, in
1671 * that it errs on the side of doing 'goto
88203927
DM
1672 * do_other_substr'. In this case, at worst,
1673 * an extra anchored search may get done, but in
1674 * practice the extra fbm_instr() is likely to
1675 * get skipped anyway. */
6ad9a8ab 1676 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1677 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1678 (long)(other_last - strbeg),
1679 (IV)(rx_origin - strbeg)
1680 ));
97136c8a 1681 goto do_other_substr;
3369914b 1682 }
3369914b
DM
1683 }
1684 }
97136c8a 1685 else {
9fed8d02
DM
1686 /* float-only */
1687
7d2d37f5 1688 if (ml_anch) {
c75a3985
DM
1689 /* In the presence of ml_anch, we might be able to
1690 * find another \n without breaking the current float
1691 * constraint. */
1692
1693 /* strictly speaking this should be HOP3c(..., 1, ...),
1694 * but since we goto a block of code that's going to
1695 * search for the next \n if any, its safe here */
9fed8d02 1696 rx_origin++;
6ad9a8ab 1697 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
675e93ee 1698 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1699 PL_colors[0], PL_colors[1],
675e93ee 1700 (long)(rx_origin - strbeg)) );
9fed8d02 1701 goto postprocess_substr_matches;
ab60c45a 1702 }
c75a3985
DM
1703
1704 /* strictly speaking this can never be true; but might
1705 * be if we ever allow intuit without substrings */
1706 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1707 goto fail;
c75a3985 1708
000dfd2d 1709 rx_origin = rx_max_float;
9fed8d02
DM
1710 }
1711
c75a3985
DM
1712 /* at this point, any matching substrings have been
1713 * contradicted. Start again... */
1714
9fed8d02 1715 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1716
1717 /* uses bytes rather than char calculations for efficiency.
1718 * It's conservative: it errs on the side of doing 'goto restart',
1719 * where there is code that does a proper char-based test */
b2ad2123 1720 if (rx_origin + start_shift + end_shift > strend) {
6ad9a8ab 1721 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
9fed8d02
DM
1722 " Could not match STCLASS...\n") );
1723 goto fail;
1724 }
6ad9a8ab 1725 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1726 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
9fed8d02 1727 (prog->substrs->check_ix ? "floating" : "anchored"),
b2ad2123 1728 (long)(rx_origin + start_shift - strbeg),
675e93ee
DM
1729 (IV)(rx_origin - strbeg)
1730 ));
9fed8d02 1731 goto restart;
6eb5f6b9 1732 }
9fed8d02 1733
c75a3985
DM
1734 /* Success !!! */
1735
5f9c6575 1736 if (rx_origin != s) {
6ad9a8ab 1737 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1738 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1739 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1740 );
1741 }
1742 else {
6ad9a8ab 1743 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1744 " Does not contradict STCLASS...\n");
b7953727
JH
1745 );
1746 }
6eb5f6b9 1747 }
ffad1e6a
DM
1748
1749 /* Decide whether using the substrings helped */
1750
1751 if (rx_origin != strpos) {
1752 /* Fixed substring is found far enough so that the match
1753 cannot start at strpos. */
1754
6ad9a8ab 1755 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
ffad1e6a
DM
1756 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1757 }
1758 else {
70563e16
DM
1759 /* The found rx_origin position does not prohibit matching at
1760 * strpos, so calling intuit didn't gain us anything. Decrement
1761 * the BmUSEFUL() count on the check substring, and if we reach
1762 * zero, free it. */
1763 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1764 && (utf8_target ? (
1765 prog->check_utf8 /* Could be deleted already */
1766 && --BmUSEFUL(prog->check_utf8) < 0
1767 && (prog->check_utf8 == prog->float_utf8)
1768 ) : (
1769 prog->check_substr /* Could be deleted already */
1770 && --BmUSEFUL(prog->check_substr) < 0
1771 && (prog->check_substr == prog->float_substr)
1772 )))
1773 {
1774 /* If flags & SOMETHING - do not do it many times on the same match */
6ad9a8ab 1775 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
ffad1e6a
DM
1776 /* XXX Does the destruction order has to change with utf8_target? */
1777 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1778 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1779 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1780 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1781 check = NULL; /* abort */
ffad1e6a
DM
1782 /* XXXX This is a remnant of the old implementation. It
1783 looks wasteful, since now INTUIT can use many
1784 other heuristics. */
1785 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1786 }
1787 }
1788
6ad9a8ab 1789 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
ffad1e6a 1790 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1791 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1792
c765d6e0 1793 return rx_origin;
2c2d71f5
JH
1794
1795 fail_finish: /* Substring not found */
33b8afdf 1796 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1797 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1798 fail:
6ad9a8ab 1799 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
e4584336 1800 PL_colors[4], PL_colors[5]));
bd61b366 1801 return NULL;
cad2e5aa 1802}
9661b544 1803
70563e16 1804
a0a388a1 1805#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1806 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78
KW
1807 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1808 trie_utf8l, trie_flu8 } \
e7fd4aa1
KW
1809 trie_type = ((scan->flags == EXACT) \
1810 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1811 : (scan->flags == EXACTL) \
1812 ? (utf8_target ? trie_utf8l : trie_plain) \
1813 : (scan->flags == EXACTFA) \
1814 ? (utf8_target \
1815 ? trie_utf8_exactfa_fold \
1816 : trie_latin_utf8_exactfa_fold) \
1817 : (scan->flags == EXACTFLU8 \
1818 ? trie_flu8 \
1819 : (utf8_target \
1820 ? trie_utf8_fold \
1821 : trie_latin_utf8_fold)))
fab2782b 1822
fd3249ee 1823#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1824STMT_START { \
fab2782b 1825 STRLEN skiplen; \
baa60164 1826 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1827 switch (trie_type) { \
a4525e78 1828 case trie_flu8: \
780fcc9f 1829 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1830 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1831 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1832 } \
a4525e78 1833 goto do_trie_utf8_fold; \
31f05a37 1834 case trie_utf8_exactfa_fold: \
baa60164 1835 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1836 /* FALLTHROUGH */ \
fab2782b 1837 case trie_utf8_fold: \
a4525e78 1838 do_trie_utf8_fold: \
fab2782b 1839 if ( foldlen>0 ) { \
c80e42f3 1840 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1841 foldlen -= len; \
1842 uscan += len; \
1843 len=0; \
1844 } else { \
fab2782b 1845 len = UTF8SKIP(uc); \
a1a5ec35
KW
1846 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen, \
1847 flags); \
5f560d8a 1848 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1849 foldlen -= skiplen; \
1850 uscan = foldbuf + skiplen; \
1851 } \
1852 break; \
baa60164
KW
1853 case trie_latin_utf8_exactfa_fold: \
1854 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1855 /* FALLTHROUGH */ \
fab2782b
YO
1856 case trie_latin_utf8_fold: \
1857 if ( foldlen>0 ) { \
c80e42f3 1858 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1859 foldlen -= len; \
1860 uscan += len; \
1861 len=0; \
1862 } else { \
1863 len = 1; \
31f05a37 1864 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1865 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1866 foldlen -= skiplen; \
1867 uscan = foldbuf + skiplen; \
1868 } \
1869 break; \
a4525e78 1870 case trie_utf8l: \
780fcc9f 1871 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1872 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1873 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1874 } \
780fcc9f 1875 /* FALLTHROUGH */ \
fab2782b 1876 case trie_utf8: \
c80e42f3 1877 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1878 break; \
1879 case trie_plain: \
1880 uvc = (UV)*uc; \
1881 len = 1; \
1882 } \
1883 if (uvc < 256) { \
1884 charid = trie->charmap[ uvc ]; \
1885 } \
1886 else { \
1887 charid = 0; \
1888 if (widecharmap) { \
1889 SV** const svpp = hv_fetch(widecharmap, \
1890 (char*)&uvc, sizeof(UV), 0); \
1891 if (svpp) \
1892 charid = (U16)SvIV(*svpp); \
1893 } \
1894 } \
4cadc6a9
YO
1895} STMT_END
1896
cb41e5d6 1897#define DUMP_EXEC_POS(li,s,doutf8,depth) \
ae7c5b9b 1898 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
cb41e5d6 1899 startpos, doutf8, depth)
ae7c5b9b 1900
c84a03c5 1901#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1902STMT_START { \
1903 while (s <= e) { \
c84a03c5 1904 if ( (COND) \
8b37b57b 1905 && (ln == 1 || folder(s+1, pat_string+1, ln-1))\
02d5137b 1906 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1907 goto got_it; \
1908 s++; \
1909 } \
1910} STMT_END
1911
da10aa09
KW
1912#define REXEC_FBC_SCAN(UTF8, CODE) \
1913 STMT_START { \
1914 while (s < strend) { \
1915 CODE \
1916 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1917 } \
1918 } STMT_END
4cadc6a9 1919
d990bd30
KW
1920#define REXEC_FBC_CLASS_SCAN(UTF8, COND) \
1921 STMT_START { \
1922 while (s < strend) { \
1923 REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
1924 } \
1925 } STMT_END
4cadc6a9 1926
d990bd30 1927#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
05bd126c 1928 if (COND) { \
19719145 1929 FBC_CHECK_AND_TRY \
21d1ed54
KW
1930 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1931 previous_occurrence_end = s; \
05bd126c 1932 } \
21d1ed54
KW
1933 else { \
1934 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1935 }
4cadc6a9 1936
c84a03c5 1937#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1938 if (utf8_target) { \
da10aa09 1939 REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \
e1d1eefb
YO
1940 } \
1941 else { \
da10aa09 1942 REXEC_FBC_CLASS_SCAN(0, COND); \
d981ef24 1943 }
05bd126c 1944
a9448551
KW
1945/* We keep track of where the next character should start after an occurrence
1946 * of the one we're looking for. Knowing that, we can see right away if the
1947 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1948 * don't accept the 2nd and succeeding adjacent occurrences */
19719145
KW
1949#define FBC_CHECK_AND_TRY \
1950 if ( ( doevery \
1951 || s != previous_occurrence_end) \
1952 && (reginfo->intuit || regtry(reginfo, &s))) \
1953 { \
1954 goto got_it; \
1955 }
1956
a9448551
KW
1957
1958/* This differs from the above macros in that it calls a function which returns
1959 * the next occurrence of the thing being looked for in 's'; and 'strend' if
1960 * there is no such occurrence. */
1961#define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \
1962 while (s < strend) { \
1963 s = f; \
1964 if (s >= strend) { \
1965 break; \
1966 } \
1967 \
1968 FBC_CHECK_AND_TRY \
1969 s += (UTF8) ? UTF8SKIP(s) : 1; \
1970 previous_occurrence_end = s; \
1971 }
1972
05bd126c
KW
1973/* The three macros below are slightly different versions of the same logic.
1974 *
1975 * The first is for /a and /aa when the target string is UTF-8. This can only
1976 * match ascii, but it must advance based on UTF-8. The other two handle the
1977 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1978 * for the boundary (or non-boundary) between a word and non-word character.
1979 * The utf8 and non-utf8 cases have the same logic, but the details must be
1980 * different. Find the "wordness" of the character just prior to this one, and
1981 * compare it with the wordness of this one. If they differ, we have a
1982 * boundary. At the beginning of the string, pretend that the previous
1983 * character was a new-line.
1984 *
1985 * All these macros uncleanly have side-effects with each other and outside
1986 * variables. So far it's been too much trouble to clean-up
1987 *
1988 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1989 * a word character or not.
1990 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1991 * word/non-word
1992 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1993 *
1994 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1995 * are looking for a boundary or for a non-boundary. If we are looking for a
1996 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1997 * see if this tentative match actually works, and if so, to quit the loop
1998 * here. And vice-versa if we are looking for a non-boundary.
1999 *
2000 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
da10aa09 2001 * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
05bd126c
KW
2002 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
2003 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2004 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2005 * complement. But in that branch we complement tmp, meaning that at the
2006 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2007 * which means at the top of the loop in the next iteration, it is
2008 * TEST_NON_UTF8(s-1) */
b2f4e957 2009#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
2010 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2011 tmp = TEST_NON_UTF8(tmp); \
da10aa09 2012 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
05bd126c
KW
2013 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
2014 tmp = !tmp; \
2015 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
2016 } \
2017 else { \
2018 IF_FAIL; \
2019 } \
2020 ); \
2021
2022/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2023 * TEST_UTF8 is a macro that for the same input code points returns identically
2024 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 2025#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
2026 if (s == reginfo->strbeg) { \
2027 tmp = '\n'; \
2028 } \
2029 else { /* Back-up to the start of the previous character */ \
2030 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
2031 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 2032 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
2033 } \
2034 tmp = TEST_UV(tmp); \
2035 LOAD_UTF8_CHARCLASS_ALNUM(); \
da10aa09 2036 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
7a207065 2037 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
05bd126c
KW
2038 tmp = !tmp; \
2039 IF_SUCCESS; \
2040 } \
2041 else { \
2042 IF_FAIL; \
2043 } \
2044 );
cfaf538b 2045
05bd126c
KW
2046/* Like the above two macros. UTF8_CODE is the complete code for handling
2047 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2048 * macros below */
baa60164 2049#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 2050 if (utf8_target) { \
05bd126c 2051 UTF8_CODE \
63ac0dad
KW
2052 } \
2053 else { /* Not utf8 */ \
9d9163fb 2054 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 2055 tmp = TEST_NON_UTF8(tmp); \
da10aa09 2056 REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \
63ac0dad 2057 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 2058 IF_SUCCESS; \
760cfa8e 2059 tmp = !tmp; \
63ac0dad
KW
2060 } \
2061 else { \
2062 IF_FAIL; \
2063 } \
2064 ); \
2065 } \
c8519dc7
KW
2066 /* Here, things have been set up by the previous code so that tmp is the \
2067 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
2068 * utf8ness of the target). We also have to check if this matches against \
2069 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
2070 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
2071 * string */ \
2072 if (tmp == ! TEST_NON_UTF8('\n')) { \
2073 IF_SUCCESS; \
2074 } \
2075 else { \
2076 IF_FAIL; \
2077 }
63ac0dad 2078
ae7c5b9b
KW
2079/* This is the macro to use when we want to see if something that looks like it
2080 * could match, actually does, and if so exits the loop */
2081#define REXEC_FBC_TRYIT \
2082 if ((reginfo->intuit || regtry(reginfo, &s))) \
2083 goto got_it
2084
2085/* The only difference between the BOUND and NBOUND cases is that
2086 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2087 * NBOUND. This is accomplished by passing it as either the if or else clause,
2088 * with the other one being empty (PLACEHOLDER is defined as empty).
2089 *
2090 * The TEST_FOO parameters are for operating on different forms of input, but
2091 * all should be ones that return identically for the same underlying code
2092 * points */
2093#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2094 FBC_BOUND_COMMON( \
2095 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2096 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2097
44129e46 2098#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
2099 FBC_BOUND_COMMON( \
2100 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2101 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2102
2103#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2104 FBC_BOUND_COMMON( \
2105 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2106 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2107
44129e46 2108#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
2109 FBC_BOUND_COMMON( \
2110 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2111 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2112
8bde5eaf
JH
2113#ifdef DEBUGGING
2114static IV
2115S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2116 IV cp_out = Perl__invlist_search(invlist, cp_in);
2117 assert(cp_out >= 0);
2118 return cp_out;
2119}
2120# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2121 invmap[S_get_break_val_cp_checked(invlist, cp)]
2122#else
2123# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2124 invmap[_invlist_search(invlist, cp)]
2125#endif
2126
64935bc6
KW
2127/* Takes a pointer to an inversion list, a pointer to its corresponding
2128 * inversion map, and a code point, and returns the code point's value
2129 * according to the two arrays. It assumes that all code points have a value.
2130 * This is used as the base macro for macros for particular properties */
2131#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
8bde5eaf 2132 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
64935bc6
KW
2133
2134/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2135 * of a code point, returning the value for the first code point in the string.
2136 * And it takes the particular macro name that finds the desired value given a
2137 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2138#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2139 (__ASSERT_(pos < strend) \
2140 /* Note assumes is valid UTF-8 */ \
2141 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2142
2143/* Returns the GCB value for the input code point */
2144#define getGCB_VAL_CP(cp) \
2145 _generic_GET_BREAK_VAL_CP( \
2146 PL_GCB_invlist, \
02f811dd 2147 _Perl_GCB_invmap, \
64935bc6
KW
2148 (cp))
2149
2150/* Returns the GCB value for the first code point in the UTF-8 encoded string
2151 * bounded by pos and strend */
2152#define getGCB_VAL_UTF8(pos, strend) \
2153 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 2154
6b659339
KW
2155/* Returns the LB value for the input code point */
2156#define getLB_VAL_CP(cp) \
2157 _generic_GET_BREAK_VAL_CP( \
2158 PL_LB_invlist, \
2159 _Perl_LB_invmap, \
2160 (cp))
2161
2162/* Returns the LB value for the first code point in the UTF-8 encoded string
2163 * bounded by pos and strend */
2164#define getLB_VAL_UTF8(pos, strend) \
2165 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2166
06ae2722
KW
2167
2168/* Returns the SB value for the input code point */
2169#define getSB_VAL_CP(cp) \
2170 _generic_GET_BREAK_VAL_CP( \
2171 PL_SB_invlist, \
bf4268fa 2172 _Perl_SB_invmap, \
06ae2722
KW
2173 (cp))
2174
2175/* Returns the SB value for the first code point in the UTF-8 encoded string
2176 * bounded by pos and strend */
2177#define getSB_VAL_UTF8(pos, strend) \
2178 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2179
ae3bb8ea
KW
2180/* Returns the WB value for the input code point */
2181#define getWB_VAL_CP(cp) \
2182 _generic_GET_BREAK_VAL_CP( \
2183 PL_WB_invlist, \
bf4268fa 2184 _Perl_WB_invmap, \
ae3bb8ea
KW
2185 (cp))
2186
2187/* Returns the WB value for the first code point in the UTF-8 encoded string
2188 * bounded by pos and strend */
2189#define getWB_VAL_UTF8(pos, strend) \
2190 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2191
786e8c11 2192/* We know what class REx starts with. Try to find this position... */
02d5137b 2193/* if reginfo->intuit, its a dryrun */
786e8c11
YO
2194/* annoyingly all the vars in this routine have different names from their counterparts
2195 in regmatch. /grrr */
3c3eec57 2196STATIC char *
07be1b83 2197S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 2198 const char *strend, regmatch_info *reginfo)
a687059c 2199{
73104a1b 2200 dVAR;
a9448551
KW
2201
2202 /* TRUE if x+ need not match at just the 1st pos of run of x's */
73104a1b 2203 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
a9448551 2204
73104a1b
KW
2205 char *pat_string; /* The pattern's exactish string */
2206 char *pat_end; /* ptr to end char of pat_string */
2207 re_fold_t folder; /* Function for computing non-utf8 folds */
2208 const U8 *fold_array; /* array for folding ords < 256 */
2209 STRLEN ln;
2210 STRLEN lnc;
73104a1b
KW
2211 U8 c1;
2212 U8 c2;
2213 char *e;
21d1ed54
KW
2214
2215 /* In some cases we accept only the first occurence of 'x' in a sequence of
2216 * them. This variable points to just beyond the end of the previous
2217 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2218 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2219 * hop back.) */
2220 char * previous_occurrence_end = 0;
2221
3b6c52ce 2222 I32 tmp; /* Scratch variable */
ba44c216 2223 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 2224 UV utf8_fold_flags = 0;
f9176b44 2225 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
2226 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2227 with a result inverts that result, as 0^1 =
2228 1 and 1^1 = 0 */
2229 _char_class_number classnum;
2230
73104a1b 2231 RXi_GET_DECL(prog,progi);
2f7f8cb1 2232
73104a1b 2233 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 2234
73104a1b
KW
2235 /* We know what class it must start with. */
2236 switch (OP(c)) {
a4525e78 2237 case ANYOFL:
780fcc9f 2238 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 2239
d1c40ef5 2240 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
a0bd1a30
KW
2241 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2242 }
2243
780fcc9f 2244 /* FALLTHROUGH */
ac44c12e 2245 case ANYOFD:
73104a1b
KW
2246 case ANYOF:
2247 if (utf8_target) {
da10aa09 2248 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
3db24e1e 2249 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b 2250 }
1451f692 2251 else if (ANYOF_FLAGS(c)) {
da10aa09 2252 REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
1451f692 2253 }
73104a1b 2254 else {
da10aa09 2255 REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
73104a1b
KW
2256 }
2257 break;
73104a1b 2258
2813d4ad
KW
2259 case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */
2260 /* UTF-8ness doesn't matter, so use 0 */
2261 REXEC_FBC_FIND_NEXT_SCAN(0,
2262 find_next_masked(s, strend, ARG(c), FLAGS(c)));
2263 break;
2264
098b07d5
KW
2265 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2266 assert(! is_utf8_pat);
924ba076 2267 /* FALLTHROUGH */
73104a1b 2268 case EXACTFA:
984e6dd1 2269 if (is_utf8_pat || utf8_target) {
73104a1b
KW
2270 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2271 goto do_exactf_utf8;
2272 }
2273 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
2274 folder = foldEQ_latin1; /* /a, except the sharp s one which */
2275 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 2276
2fdb7295
KW
2277 case EXACTF: /* This node only generated for non-utf8 patterns */
2278 assert(! is_utf8_pat);
73104a1b 2279 if (utf8_target) {
73104a1b
KW
2280 utf8_fold_flags = 0;
2281 goto do_exactf_utf8;
2282 }
2283 fold_array = PL_fold;
2284 folder = foldEQ;
2285 goto do_exactf_non_utf8;
2286
2287 case EXACTFL:
780fcc9f 2288 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 2289 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 2290 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
2291 goto do_exactf_utf8;
2292 }
2293 fold_array = PL_fold_locale;
2294 folder = foldEQ_locale;
2295 goto do_exactf_non_utf8;
3c760661 2296
73104a1b 2297 case EXACTFU_SS:
984e6dd1 2298 if (is_utf8_pat) {
73104a1b
KW
2299 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2300 }
2301 goto do_exactf_utf8;
16d951b7 2302
a4525e78
KW
2303 case EXACTFLU8:
2304 if (! utf8_target) { /* All code points in this node require
2305 UTF-8 to express. */
2306 break;
2307 }
613abc6d
KW
2308 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2309 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
2310 goto do_exactf_utf8;
2311
73104a1b 2312 case EXACTFU:
984e6dd1
DM
2313 if (is_utf8_pat || utf8_target) {
2314 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
2315 goto do_exactf_utf8;
2316 }
fac1af77 2317
73104a1b
KW
2318 /* Any 'ss' in the pattern should have been replaced by regcomp,
2319 * so we don't have to worry here about this single special case
2320 * in the Latin1 range */
2321 fold_array = PL_fold_latin1;
2322 folder = foldEQ_latin1;
2323
924ba076 2324 /* FALLTHROUGH */
73104a1b 2325
c52b8b12 2326 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
2327 are no glitches with fold-length differences
2328 between the target string and pattern */
2329
2330 /* The idea in the non-utf8 EXACTF* cases is to first find the
2331 * first character of the EXACTF* node and then, if necessary,
2332 * case-insensitively compare the full text of the node. c1 is the
2333 * first character. c2 is its fold. This logic will not work for
2334 * Unicode semantics and the german sharp ss, which hence should
2335 * not be compiled into a node that gets here. */
2336 pat_string = STRING(c);
2337 ln = STR_LEN(c); /* length to match in octets/bytes */
2338
2339 /* We know that we have to match at least 'ln' bytes (which is the
2340 * same as characters, since not utf8). If we have to match 3
2341 * characters, and there are only 2 availabe, we know without
2342 * trying that it will fail; so don't start a match past the
2343 * required minimum number from the far end */
ea3daa5d 2344 e = HOP3c(strend, -((SSize_t)ln), s);
dda01918
HS
2345 if (e < s)
2346 break;
fac1af77 2347
73104a1b
KW
2348 c1 = *pat_string;
2349 c2 = fold_array[c1];
2350 if (c1 == c2) { /* If char and fold are the same */
2351 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
2352 }
2353 else {
2354 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
2355 }
2356 break;
fac1af77 2357
c52b8b12
KW
2358 do_exactf_utf8:
2359 {
73104a1b
KW
2360 unsigned expansion;
2361
2362 /* If one of the operands is in utf8, we can't use the simpler folding
2363 * above, due to the fact that many different characters can have the
2364 * same fold, or portion of a fold, or different- length fold */
2365 pat_string = STRING(c);
2366 ln = STR_LEN(c); /* length to match in octets/bytes */
2367 pat_end = pat_string + ln;
984e6dd1 2368 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
2369 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2370 : ln;
2371
2372 /* We have 'lnc' characters to match in the pattern, but because of
2373 * multi-character folding, each character in the target can match
2374 * up to 3 characters (Unicode guarantees it will never exceed
2375 * this) if it is utf8-encoded; and up to 2 if not (based on the
2376 * fact that the Latin 1 folds are already determined, and the
2377 * only multi-char fold in that range is the sharp-s folding to
2378 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2379 * string character. Adjust lnc accordingly, rounding up, so that
2380 * if we need to match at least 4+1/3 chars, that really is 5. */
2381 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2382 lnc = (lnc + expansion - 1) / expansion;
2383
2384 /* As in the non-UTF8 case, if we have to match 3 characters, and
2385 * only 2 are left, it's guaranteed to fail, so don't start a
2386 * match that would require us to go beyond the end of the string
2387 */
ea3daa5d 2388 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 2389
73104a1b
KW
2390 /* XXX Note that we could recalculate e to stop the loop earlier,
2391 * as the worst case expansion above will rarely be met, and as we
2392 * go along we would usually find that e moves further to the left.
2393 * This would happen only after we reached the point in the loop
2394 * where if there were no expansion we should fail. Unclear if
2395 * worth the expense */
2396
2397 while (s <= e) {
2398 char *my_strend= (char *)strend;
2399 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 2400 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 2401 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
2402 {
2403 goto got_it;
2404 }
2405 s += (utf8_target) ? UTF8SKIP(s) : 1;
2406 }
2407 break;
2408 }
236d82fd 2409
73104a1b 2410 case BOUNDL:
780fcc9f 2411 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2412 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2413 if (! IN_UTF8_CTYPE_LOCALE) {
2414 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2415 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2416 }
64935bc6
KW
2417 goto do_boundu;
2418 }
2419
7a207065 2420 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2421 break;
64935bc6 2422
73104a1b 2423 case NBOUNDL:
780fcc9f 2424 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2425 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2426 if (! IN_UTF8_CTYPE_LOCALE) {
2427 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2428 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2429 }
64935bc6
KW
2430 goto do_nboundu;
2431 }
2432
7a207065 2433 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2434 break;
64935bc6
KW
2435
2436 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2437 meaning */
2438 assert(FLAGS(c) == TRADITIONAL_BOUND);
2439
7a207065 2440 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2441 break;
64935bc6
KW
2442
2443 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2444 meaning */
2445 assert(FLAGS(c) == TRADITIONAL_BOUND);
2446
44129e46 2447 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2448 break;
64935bc6
KW
2449
2450 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2451 meaning */
2452 assert(FLAGS(c) == TRADITIONAL_BOUND);
2453
7a207065 2454 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2455 break;
64935bc6
KW
2456
2457 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2458 meaning */
2459 assert(FLAGS(c) == TRADITIONAL_BOUND);
2460
44129e46 2461 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2462 break;
64935bc6 2463
73104a1b 2464 case NBOUNDU:
64935bc6 2465 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
7a207065 2466 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2467 break;
2468 }
2469
2470 do_nboundu:
2471
2472 to_complement = 1;
2473 /* FALLTHROUGH */
2474
2475 case BOUNDU:
2476 do_boundu:
2477 switch((bound_type) FLAGS(c)) {
2478 case TRADITIONAL_BOUND:
7a207065 2479 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2480 break;
2481 case GCB_BOUND:
a7a8bd1e 2482 if (s == reginfo->strbeg) {
67481c39 2483 if (reginfo->intuit || regtry(reginfo, &s))
64935bc6
KW
2484 {
2485 goto got_it;
2486 }
a7a8bd1e
KW
2487
2488 /* Didn't match. Try at the next position (if there is one) */
64935bc6 2489 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2490 if (UNLIKELY(s >= reginfo->strend)) {
2491 break;
2492 }
64935bc6
KW
2493 }
2494
2495 if (utf8_target) {
85e5f08b 2496 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2497 reghop3((U8*)s, -1,
2498 (U8*)(reginfo->strbeg)),
2499 (U8*) reginfo->strend);
2500 while (s < strend) {
85e5f08b 2501 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6 2502 (U8*) reginfo->strend);
b0e24409
KW
2503 if ( (to_complement ^ isGCB(before,
2504 after,
2505 (U8*) reginfo->strbeg,
2506 (U8*) s,
2507 utf8_target))
00e3344b
KW
2508 && (reginfo->intuit || regtry(reginfo, &s)))
2509 {
2510 goto got_it;
64935bc6 2511 }
43a7bd62 2512 before = after;
64935bc6
KW
2513 s += UTF8SKIP(s);
2514 }
2515 }
2516 else { /* Not utf8. Everything is a GCB except between CR and
2517 LF */
2518 while (s < strend) {
00e3344b
KW
2519 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2520 || UCHARAT(s) != '\n'))
2521 && (reginfo->intuit || regtry(reginfo, &s)))
64935bc6 2522 {
00e3344b 2523 goto got_it;
64935bc6 2524 }
43a7bd62 2525 s++;
64935bc6
KW
2526 }
2527 }
2528
6de80efc
KW
2529 /* And, since this is a bound, it can match after the final
2530 * character in the string */
67481c39 2531 if ((reginfo->intuit || regtry(reginfo, &s))) {
64935bc6
KW
2532 goto got_it;
2533 }
2534 break;
ae3bb8ea 2535
6b659339
KW
2536 case LB_BOUND:
2537 if (s == reginfo->strbeg) {
2538 if (reginfo->intuit || regtry(reginfo, &s)) {
2539 goto got_it;
2540 }
2541 s += (utf8_target) ? UTF8SKIP(s) : 1;
2542 if (UNLIKELY(s >= reginfo->strend)) {
2543 break;
2544 }
2545 }
2546
2547 if (utf8_target) {
2548 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2549 -1,
2550 (U8*)(reginfo->strbeg)),
2551 (U8*) reginfo->strend);
2552 while (s < strend) {
2553 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2554 if (to_complement ^ isLB(before,
2555 after,
2556 (U8*) reginfo->strbeg,
2557 (U8*) s,
2558 (U8*) reginfo->strend,
2559 utf8_target)
2560 && (reginfo->intuit || regtry(reginfo, &s)))
2561 {
2562 goto got_it;
2563 }
2564 before = after;
2565 s += UTF8SKIP(s);
2566 }
2567 }
2568 else { /* Not utf8. */
2569 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2570 while (s < strend) {
2571 LB_enum after = getLB_VAL_CP((U8) *s);
2572 if (to_complement ^ isLB(before,
2573 after,
2574 (U8*) reginfo->strbeg,
2575 (U8*) s,
2576 (U8*) reginfo->strend,
2577 utf8_target)
2578 && (reginfo->intuit || regtry(reginfo, &s)))
2579 {
2580 goto got_it;
2581 }
2582 before = after;
2583 s++;
2584 }
2585 }
2586
2587 if (reginfo->intuit || regtry(reginfo, &s)) {
2588 goto got_it;
2589 }
2590
2591 break;
2592
06ae2722 2593 case SB_BOUND:
a7a8bd1e 2594 if (s == reginfo->strbeg) {
67481c39 2595 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2596 goto got_it;
2597 }
06ae2722 2598 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2599 if (UNLIKELY(s >= reginfo->strend)) {
2600 break;
2601 }
06ae2722
KW
2602 }
2603
2604 if (utf8_target) {
85e5f08b 2605 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2606 -1,
2607 (U8*)(reginfo->strbeg)),
2608 (U8*) reginfo->strend);
2609 while (s < strend) {
85e5f08b 2610 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722 2611 (U8*) reginfo->strend);
00e3344b
KW
2612 if ((to_complement ^ isSB(before,
2613 after,
2614 (U8*) reginfo->strbeg,
2615 (U8*) s,
2616 (U8*) reginfo->strend,
2617 utf8_target))
2618 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2619 {
00e3344b 2620 goto got_it;
06ae2722 2621 }
43a7bd62 2622 before = after;
06ae2722
KW
2623 s += UTF8SKIP(s);
2624 }
2625 }
2626 else { /* Not utf8. */
85e5f08b 2627 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2628 while (s < strend) {
85e5f08b 2629 SB_enum after = getSB_VAL_CP((U8) *s);
00e3344b
KW
2630 if ((to_complement ^ isSB(before,
2631 after,
2632 (U8*) reginfo->strbeg,
2633 (U8*) s,
2634 (U8*) reginfo->strend,
2635 utf8_target))
2636 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2637 {
00e3344b 2638 goto got_it;
06ae2722 2639 }
43a7bd62 2640 before = after;
06ae2722
KW
2641 s++;
2642 }
2643 }
2644
2645 /* Here are at the final position in the target string. The SB
2646 * value is always true here, so matches, depending on other
2647 * constraints */
67481c39 2648 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2649 goto got_it;
2650 }
2651
2652 break;
2653
ae3bb8ea
KW
2654 case WB_BOUND:
2655 if (s == reginfo->strbeg) {
67481c39 2656 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2657 goto got_it;
2658 }
2659 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2660 if (UNLIKELY(s >= reginfo->strend)) {
2661 break;
2662 }
ae3bb8ea
KW
2663 }
2664
2665 if (utf8_target) {
2666 /* We are at a boundary between char_sub_0 and char_sub_1.
2667 * We also keep track of the value for char_sub_-1 as we
2668 * loop through the line. Context may be needed to make a
2669 * determination, and if so, this can save having to
2670 * recalculate it */
85e5f08b
KW
2671 WB_enum previous = WB_UNKNOWN;
2672 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2673 reghop3((U8*)s,
2674 -1,
2675 (U8*)(reginfo->strbeg)),
2676 (U8*) reginfo->strend);
2677 while (s < strend) {
85e5f08b 2678 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea 2679 (U8*) reginfo->strend);
00e3344b
KW
2680 if ((to_complement ^ isWB(previous,
2681 before,
2682 after,
2683 (U8*) reginfo->strbeg,
2684 (U8*) s,
2685 (U8*) reginfo->strend,
2686 utf8_target))
2687 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2688 {
00e3344b 2689 goto got_it;
ae3bb8ea 2690 }
43a7bd62
KW
2691 previous = before;
2692 before = after;
ae3bb8ea
KW
2693 s += UTF8SKIP(s);
2694 }
2695 }
2696 else { /* Not utf8. */
85e5f08b
KW
2697 WB_enum previous = WB_UNKNOWN;
2698 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2699 while (s < strend) {
85e5f08b 2700 WB_enum after = getWB_VAL_CP((U8) *s);
00e3344b
KW
2701 if ((to_complement ^ isWB(previous,
2702 before,
2703 after,
2704 (U8*) reginfo->strbeg,
2705 (U8*) s,
2706 (U8*) reginfo->strend,
2707 utf8_target))
2708 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2709 {
00e3344b 2710 goto got_it;
ae3bb8ea 2711 }
43a7bd62
KW
2712 previous = before;
2713 before = after;
ae3bb8ea
KW
2714 s++;
2715 }
2716 }
2717
67481c39 2718 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2719 goto got_it;
2720 }
64935bc6 2721 }
73104a1b 2722 break;
64935bc6 2723
73104a1b
KW
2724 case LNBREAK:
2725 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2726 is_LNBREAK_latin1_safe(s, strend)
2727 );
2728 break;
3018b823 2729
aff4cafe 2730 case ASCII:
a9448551 2731 REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
aff4cafe
KW
2732 break;
2733
2734 case NASCII:
a9448551
KW
2735 if (utf8_target) {
2736 REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2737 utf8_target));
2738 }
2739 else {
2740 REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2741 utf8_target));
aff4cafe
KW
2742 }
2743
2744 break;
2745
3018b823
KW
2746 /* The argument to all the POSIX node types is the class number to pass to
2747 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2748
2749 case NPOSIXL:
2750 to_complement = 1;
2751 /* FALLTHROUGH */
2752
2753 case POSIXL:
780fcc9f 2754 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
2755 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2756 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2757 break;
3018b823
KW
2758
2759 case NPOSIXD:
2760 to_complement = 1;
2761 /* FALLTHROUGH */
2762
2763 case POSIXD:
2764 if (utf8_target) {
2765 goto posix_utf8;
2766 }
2767 goto posixa;
2768
2769 case NPOSIXA:
2770 if (utf8_target) {
2771 /* The complement of something that matches only ASCII matches all
837226c8 2772 * non-ASCII, plus everything in ASCII that isn't in the class. */
da10aa09
KW
2773 REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend)
2774 || ! _generic_isCC_A(*s, FLAGS(c)));
3018b823
KW
2775 break;
2776 }
2777
2778 to_complement = 1;
4a6c6db5 2779 goto posixa;
3018b823 2780
73104a1b
KW
2781 case POSIXA:
2782 /* Don't need to worry about utf8, as it can match only a single
4a6c6db5
KW
2783 * byte invariant character. But we do anyway for performance reasons,
2784 * as otherwise we would have to examine all the continuation
2785 * characters */
2786 if (utf8_target) {
da10aa09 2787 REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
4a6c6db5
KW
2788 break;
2789 }
2790
2791 posixa:
da10aa09 2792 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
3018b823 2793 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2794 break;
3018b823
KW
2795
2796 case NPOSIXU:
2797 to_complement = 1;
2798 /* FALLTHROUGH */
2799
2800 case POSIXU:
2801 if (! utf8_target) {
da10aa09
KW
2802 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2803 to_complement ^ cBOOL(_generic_isCC(*s,
3018b823
KW
2804 FLAGS(c))));
2805 }
2806 else {
2807
c52b8b12 2808 posix_utf8:
3018b823
KW
2809 classnum = (_char_class_number) FLAGS(c);
2810 if (classnum < _FIRST_NON_SWASH_CC) {
2811 while (s < strend) {
2812
2813 /* We avoid loading in the swash as long as possible, but
2814 * should we have to, we jump to a separate loop. This
2815 * extra 'if' statement is what keeps this code from being
da10aa09 2816 * just a call to REXEC_FBC_CLASS_SCAN() */
3018b823
KW
2817 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2818 goto found_above_latin1;
2819 }
d990bd30
KW
2820
2821 REXEC_FBC_CLASS_SCAN_GUTS(1, (UTF8_IS_INVARIANT(*s)
3018b823
KW
2822 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2823 classnum)))
042d9e50 2824 || ( UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
3018b823 2825 && to_complement ^ cBOOL(
a62b247b 2826 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
94bb8c36 2827 *(s + 1)),
d990bd30 2828 classnum))));
3018b823
KW
2829 }
2830 }
2831 else switch (classnum) { /* These classes are implemented as
2832 macros */
779cf272 2833 case _CC_ENUM_SPACE:
da10aa09 2834 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
7a207065 2835 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3018b823
KW
2836 break;
2837
2838 case _CC_ENUM_BLANK:
da10aa09 2839 REXEC_FBC_CLASS_SCAN(1,
7a207065 2840 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3018b823
KW
2841 break;
2842
2843 case _CC_ENUM_XDIGIT:
da10aa09 2844 REXEC_FBC_CLASS_SCAN(1,
7a207065 2845 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3018b823
KW
2846 break;
2847
2848 case _CC_ENUM_VERTSPACE:
da10aa09 2849 REXEC_FBC_CLASS_SCAN(1,
7a207065 2850 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3018b823
KW
2851 break;
2852
2853 case _CC_ENUM_CNTRL:
da10aa09 2854 REXEC_FBC_CLASS_SCAN(1,
7a207065 2855 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3018b823
KW
2856 break;
2857
2858 default:
2859 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
e5964223 2860 NOT_REACHED; /* NOTREACHED */
3018b823
KW
2861 }
2862 }
2863 break;
2864
2865 found_above_latin1: /* Here we have to load a swash to get the result
2866 for the current code point */
2867 if (! PL_utf8_swash_ptrs[classnum]) {
2868 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2869 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2870 _core_swash_init("utf8",
2871 "",
2872 &PL_sv_undef, 1, 0,
2873 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2874 }
2875
2876 /* This is a copy of the loop above for swash classes, though using the
2877 * FBC macro instead of being expanded out. Since we've loaded the
2878 * swash, we don't have to check for that each time through the loop */
da10aa09 2879 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
7a207065 2880 to_complement ^ cBOOL(_generic_utf8_safe(
3018b823
KW
2881 classnum,
2882 s,
7a207065 2883 strend,
3018b823
KW
2884 swash_fetch(PL_utf8_swash_ptrs[classnum],
2885 (U8 *) s, TRUE))));
73104a1b
KW
2886 break;
2887
2888 case AHOCORASICKC:
2889 case AHOCORASICK:
2890 {
2891 DECL_TRIE_TYPE(c);
2892 /* what trie are we using right now */
2893 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2894 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2895 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2896
2897 const char *last_start = strend - trie->minlen;
6148ee25 2898#ifdef DEBUGGING
73104a1b 2899 const char *real_start = s;
6148ee25 2900#endif
73104a1b
KW
2901 STRLEN maxlen = trie->maxlen;
2902 SV *sv_points;
2903 U8 **points; /* map of where we were in the input string
2904 when reading a given char. For ASCII this
2905 is unnecessary overhead as the relationship
2906 is always 1:1, but for Unicode, especially
2907 case folded Unicode this is not true. */
2908 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2909 U8 *bitmap=NULL;
2910
2911
2912 GET_RE_DEBUG_FLAGS_DECL;
2913
2914 /* We can't just allocate points here. We need to wrap it in
2915 * an SV so it gets freed properly if there is a croak while
2916 * running the match */
2917 ENTER;
2918 SAVETMPS;
2919 sv_points=newSV(maxlen * sizeof(U8 *));
2920 SvCUR_set(sv_points,
2921 maxlen * sizeof(U8 *));
2922 SvPOK_on(sv_points);
2923 sv_2mortal(sv_points);
2924 points=(U8**)SvPV_nolen(sv_points );
2925 if ( trie_type != trie_utf8_fold
2926 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2927 {
2928 if (trie->bitmap)
2929 bitmap=(U8*)trie->bitmap;
2930 else
2931 bitmap=(U8*)ANYOF_BITMAP(c);
2932 }
2933 /* this is the Aho-Corasick algorithm modified a touch
2934 to include special handling for long "unknown char" sequences.
2935 The basic idea being that we use AC as long as we are dealing
2936 with a possible matching char, when we encounter an unknown char
2937 (and we have not encountered an accepting state) we scan forward
2938 until we find a legal starting char.
2939 AC matching is basically that of trie matching, except that when
2940 we encounter a failing transition, we fall back to the current
2941 states "fail state", and try the current char again, a process
2942 we repeat until we reach the root state, state 1, or a legal
2943 transition. If we fail on the root state then we can either
2944 terminate if we have reached an accepting state previously, or
2945 restart the entire process from the beginning if we have not.
2946
2947 */
2948 while (s <= last_start) {
2949 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2950 U8 *uc = (U8*)s;
2951 U16 charid = 0;
2952 U32 base = 1;
2953 U32 state = 1;
2954 UV uvc = 0;
2955 STRLEN len = 0;
2956 STRLEN foldlen = 0;
2957 U8 *uscan = (U8*)NULL;
2958 U8 *leftmost = NULL;
2959#ifdef DEBUGGING
2960 U32 accepted_word= 0;
786e8c11 2961#endif
73104a1b
KW
2962 U32 pointpos = 0;
2963
2964 while ( state && uc <= (U8*)strend ) {
2965 int failed=0;
2966 U32 word = aho->states[ state ].wordnum;
2967
2968 if( state==1 ) {
2969 if ( bitmap ) {
2970 DEBUG_TRIE_EXECUTE_r(
2971 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2972 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2973 (char *)uc, utf8_target, 0 );
6ad9a8ab 2974 Perl_re_printf( aTHX_
73104a1b
KW
2975 " Scanning for legal start char...\n");
2976 }
2977 );
2978 if (utf8_target) {
2979 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2980 uc += UTF8SKIP(uc);
2981 }
2982 } else {
2983 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2984 uc++;
2985 }
786e8c11 2986 }
73104a1b 2987 s= (char *)uc;
07be1b83 2988 }
73104a1b
KW
2989 if (uc >(U8*)last_start) break;
2990 }
2991
2992 if ( word ) {
2993 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2994 if (!leftmost || lpos < leftmost) {
2995 DEBUG_r(accepted_word=word);
2996 leftmost= lpos;
7016d6eb 2997 }
73104a1b 2998 if (base==0) break;
7016d6eb 2999
73104a1b
KW
3000 }
3001 points[pointpos++ % maxlen]= uc;
3002 if (foldlen || uc < (U8*)strend) {
3003 REXEC_TRIE_READ_CHAR(trie_type, trie,
3004 widecharmap, uc,
3005 uscan, len, uvc, charid, foldlen,
3006 foldbuf, uniflags);
3007 DEBUG_TRIE_EXECUTE_r({
3008 dump_exec_pos( (char *)uc, c, strend,
cb41e5d6 3009 real_start, s, utf8_target, 0);
6ad9a8ab 3010 Perl_re_printf( aTHX_
147e3846 3011 " Charid:%3u CP:%4" UVxf " ",
73104a1b
KW
3012 charid, uvc);
3013 });
3014 }
3015 else {
3016 len = 0;
3017 charid = 0;
3018 }
07be1b83 3019
73104a1b
KW
3020
3021 do {
6148ee25 3022#ifdef DEBUGGING
73104a1b 3023 word = aho->states[ state ].wordnum;
6148ee25 3024#endif
73104a1b
KW
3025 base = aho->states[ state ].trans.base;
3026
3027 DEBUG_TRIE_EXECUTE_r({
3028 if (failed)
3029 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 3030 s, utf8_target, 0 );
6ad9a8ab 3031 Perl_re_printf( aTHX_
147e3846 3032 "%sState: %4" UVxf ", word=%" UVxf,
73104a1b
KW
3033 failed ? " Fail transition to " : "",
3034 (UV)state, (UV)word);
3035 });
3036 if ( base ) {
3037 U32 tmp;
3038 I32 offset;
3039 if (charid &&
3040 ( ((offset = base + charid
3041 - 1 - trie->uniquecharcount)) >= 0)
3042 && ((U32)offset < trie->lasttrans)
3043 && trie->trans[offset].check == state
3044 && (tmp=trie->trans[offset].next))
3045 {
3046 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3047 Perl_re_printf( aTHX_ " - legal\n"));
73104a1b
KW
3048 state = tmp;
3049 break;
07be1b83
YO
3050 }
3051 else {
786e8c11 3052 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3053 Perl_re_printf( aTHX_ " - fail\n"));
786e8c11 3054 failed = 1;
73104a1b 3055 state = aho->fail[state];
07be1b83 3056 }
07be1b83 3057 }
73104a1b
KW
3058 else {
3059 /* we must be accepting here */
3060 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3061 Perl_re_printf( aTHX_ " - accepting\n"));
73104a1b
KW
3062 failed = 1;
3063 break;
786e8c11 3064 }
73104a1b
KW
3065 } while(state);
3066 uc += len;
3067 if (failed) {
3068 if (leftmost)
3069 break;
3070 if (!state) state = 1;
07be1b83 3071 }
73104a1b
KW
3072 }
3073 if ( aho->states[ state ].wordnum ) {
3074 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3075 if (!leftmost || lpos < leftmost) {
3076 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3077 leftmost = lpos;
07be1b83
YO
3078 }
3079 }
73104a1b
KW
3080 if (leftmost) {
3081 s = (char*)leftmost;
3082 DEBUG_TRIE_EXECUTE_r({
147e3846 3083 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
73104a1b
KW
3084 (UV)accepted_word, (IV)(s - real_start)
3085 );
3086 });
02d5137b 3087 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
3088 FREETMPS;
3089 LEAVE;
3090 goto got_it;
3091 }
3092 s = HOPc(s,1);
3093 DEBUG_TRIE_EXECUTE_r({
6ad9a8ab 3094 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
73104a1b
KW
3095 });
3096 } else {
3097 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3098 Perl_re_printf( aTHX_ "No match.\n"));
73104a1b
KW
3099 break;
3100 }
3101 }
3102 FREETMPS;
3103 LEAVE;
3104 }
3105 break;
3106 default:
3107 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
3108 }
3109 return 0;
3110 got_it:
3111 return s;
6eb5f6b9
JH
3112}
3113
60165aa4
DM
3114/* set RX_SAVED_COPY, RX_SUBBEG etc.
3115 * flags have same meanings as with regexec_flags() */
3116
749f4950
DM
3117static void
3118S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
3119 char *strbeg,
3120 char *strend,
3121 SV *sv,
3122 U32 flags,
3123 bool utf8_target)
3124{
3125 struct regexp *const prog = ReANY(rx);
3126
60165aa4
DM
3127 if (flags & REXEC_COPY_STR) {
3128#ifdef PERL_ANY_COW
3129 if (SvCANCOW(sv)) {
eb8fc9fe 3130 DEBUG_C(Perl_re_printf( aTHX_
60165aa4 3131 "Copy on write: regexp capture, type %d\n",
eb8fc9fe 3132 (int) SvTYPE(sv)));
5411a0e5
DM
3133 /* Create a new COW SV to share the match string and store
3134 * in saved_copy, unless the current COW SV in saved_copy
3135 * is valid and suitable for our purpose */
3136 if (( prog->saved_copy
3137 && SvIsCOW(prog->saved_copy)
3138 && SvPOKp(prog->saved_copy)
3139 && SvIsCOW(sv)
3140 && SvPOKp(sv)
3141 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 3142 {
5411a0e5
DM
3143 /* just reuse saved_copy SV */
3144 if (RXp_MATCH_COPIED(prog)) {
3145 Safefree(prog->subbeg);
3146 RXp_MATCH_COPIED_off(prog);
3147 }
3148 }
3149 else {
3150 /* create new COW SV to share string */
196a02af 3151 RXp_MATCH_COPY_FREE(prog);
a76b0e90 3152 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 3153 }
5411a0e5
DM
3154 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3155 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
3156 prog->sublen = strend - strbeg;
3157 prog->suboffset = 0;
3158 prog->subcoffset = 0;
3159 } else
3160#endif
3161 {
99a90e59
FC
3162 SSize_t min = 0;
3163 SSize_t max = strend - strbeg;
ea3daa5d 3164 SSize_t sublen;
60165aa4
DM
3165
3166 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 3167 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3168 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3169 ) { /* don't copy $' part of string */
3170 U32 n = 0;
3171 max = -1;
3172 /* calculate the right-most part of the string covered
f67a5002 3173 * by a capture. Due to lookahead, this may be to
60165aa4
DM
3174 * the right of $&, so we have to scan all captures */
3175 while (n <= prog->lastparen) {
3176 if (prog->offs[n].end > max)
3177 max = prog->offs[n].end;
3178 n++;
3179 }
3180 if (max == -1)
3181 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3182 ? prog->offs[0].start
3183 : 0;
3184 assert(max >= 0 && max <= strend - strbeg);
3185 }
3186
3187 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 3188 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3189 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3190 ) { /* don't copy $` part of string */
3191 U32 n = 0;
3192 min = max;
3193 /* calculate the left-most part of the string covered
f67a5002 3194 * by a capture. Due to lookbehind, this may be to
60165aa4
DM
3195 * the left of $&, so we have to scan all captures */
3196 while (min && n <= prog->lastparen) {
3197 if ( prog->offs[n].start != -1
3198 && prog->offs[n].start < min)
3199 {
3200 min = prog->offs[n].start;
3201 }
3202 n++;
3203 }
3204 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3205 && min > prog->offs[0].end
3206 )
3207 min = prog->offs[0].end;
3208
3209 }
3210
3211 assert(min >= 0 && min <= max && min <= strend - strbeg);
3212 sublen = max - min;
3213
196a02af 3214 if (RXp_MATCH_COPIED(prog)) {
60165aa4
DM
3215 if (sublen > prog->sublen)
3216 prog->subbeg =
3217 (char*)saferealloc(prog->subbeg, sublen+1);
3218 }
3219 else
3220 prog->subbeg = (char*)safemalloc(sublen+1);
3221 Copy(strbeg + min, prog->subbeg, sublen, char);
3222 prog->subbeg[sublen] = '\0';
3223 prog->suboffset = min;
3224 prog->sublen = sublen;
196a02af 3225 RXp_MATCH_COPIED_on(prog);
60165aa4
DM
3226 }
3227 prog->subcoffset = prog->suboffset;
3228 if (prog->suboffset && utf8_target) {
3229 /* Convert byte offset to chars.
3230 * XXX ideally should only compute this if @-/@+
3231 * has been seen, a la PL_sawampersand ??? */
3232
3233 /* If there's a direct correspondence between the
3234 * string which we're matching and the original SV,
3235 * then we can use the utf8 len cache associated with
3236 * the SV. In particular, it means that under //g,
3237 * sv_pos_b2u() will use the previously cached
3238 * position to speed up working out the new length of
3239 * subcoffset, rather than counting from the start of
3240 * the string each time. This stops
3241 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3242 * from going quadratic */
3243 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
3244 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3245 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
3246 else
3247 prog->subcoffset = utf8_length((U8*)strbeg,
3248 (U8*)(strbeg+prog->suboffset));
3249 }
3250 }
3251 else {
196a02af 3252 RXp_MATCH_COPY_FREE(prog);
60165aa4
DM
3253 prog->subbeg = strbeg;
3254 prog->suboffset = 0;
3255 prog->subcoffset = 0;
3256 prog->sublen = strend - strbeg;
3257 }
3258}
3259
3260
3261
fae667d5 3262
6eb5f6b9
JH
3263/*
3264 - regexec_flags - match a regexp against a string
3265 */
3266I32
5aaab254 3267Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 3268 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
3269/* stringarg: the point in the string at which to begin matching */
3270/* strend: pointer to null at end of string */
3271/* strbeg: real beginning of string */
3272/* minend: end of match must be >= minend bytes after stringarg. */
3273/* sv: SV being matched: only used for utf8 flag, pos() etc; string
3274 * itself is accessed via the pointers above */
3275/* data: May be used for some additional optimizations.
d058ec57 3276 Currently unused. */
a340edde 3277/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 3278
6eb5f6b9 3279{
8d919b0a 3280 struct regexp *const prog = ReANY(rx);
5aaab254 3281 char *s;
eb578fdb 3282 regnode *c;
03c83e26 3283 char *startpos;
ea3daa5d
FC
3284 SSize_t minlen; /* must match at least this many chars */
3285 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 3286 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 3287 I32 multiline;
f8fc2ecf 3288 RXi_GET_DECL(prog,progi);
02d5137b
DM
3289 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3290 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 3291 regexp_paren_pair *swap = NULL;
006f26b2 3292 I32 oldsave;
a3621e74
YO
3293 GET_RE_DEBUG_FLAGS_DECL;
3294
7918f24d 3295 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 3296 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
3297
3298 /* Be paranoid... */
3dc78631 3299 if (prog == NULL) {
6eb5f6b9 3300 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
3301 }
3302
6c3fea77 3303 DEBUG_EXECUTE_r(
03c83e26 3304 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
3305 "Matching");
3306 );
8adc0f72 3307
b342a604
DM
3308 startpos = stringarg;
3309
4cf1a867
DM
3310 /* set these early as they may be used by the HOP macros below */
3311 reginfo->strbeg = strbeg;
3312 reginfo->strend = strend;
3313 reginfo->is_utf8_target = cBOOL(utf8_target);
3314
58430ea8 3315 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
3316 MAGIC *mg;
3317
fef7148b
DM
3318 /* set reginfo->ganch, the position where \G can match */
3319
3320 reginfo->ganch =
3321 (flags & REXEC_IGNOREPOS)
3322 ? stringarg /* use start pos rather than pos() */
3dc78631 3323 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
3324 /* Defined pos(): */
3325 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
3326 : strbeg; /* pos() not defined; use start of string */
3327
6ad9a8ab 3328 DEBUG_GPOS_r(Perl_re_printf( aTHX_
147e3846 3329 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 3330
03c83e26
DM
3331 /* in the presence of \G, we may need to start looking earlier in
3332 * the string than the suggested start point of stringarg:
0b2c2a84 3333 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
3334 * offset, such as
3335 * /..\G/: gofs = 2
3336 * /ab|c\G/: gofs = 1
3337 * or if the minimum offset isn't known, then we have to go back
3338 * to the start of the string, e.g. /w+\G/
3339 */
2bfbe302 3340
8e1490ee 3341 if (prog->intflags & PREGf_ANCH_GPOS) {
4cf1a867
DM
3342 if (prog->gofs) {
3343 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3344 if (!startpos ||
3345 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3346 {
6ad9a8ab 3347 DEBUG_r(Perl_re_printf( aTHX_
4cf1a867
DM
3348 "fail: ganch-gofs before earliest possible start\n"));
3349 return 0;
3350 }
2bfbe302 3351 }
4cf1a867
DM
3352 else
3353 startpos = reginfo->ganch;
2bfbe302
DM
3354 }
3355 else if (prog->gofs) {
4cf1a867
DM
3356 startpos = HOPBACKc(startpos, prog->gofs);
3357 if (!startpos)
b342a604 3358 startpos = strbeg;
03c83e26 3359 }
58430ea8 3360 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 3361 startpos = strbeg;
03c83e26
DM
3362 }
3363
3364 minlen = prog->minlen;
b342a604 3365 if ((startpos + minlen) > strend || startpos < strbeg) {
6ad9a8ab 3366 DEBUG_r(Perl_re_printf( aTHX_
03c83e26
DM
3367 "Regex match can't succeed, so not even tried\n"));
3368 return 0;
3369 }
3370
63a3746a
DM
3371 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3372 * which will call destuctors to reset PL_regmatch_state, free higher
3373 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3374 * regmatch_info_aux_eval */
3375
3376 oldsave = PL_savestack_ix;
3377
dfa77d06
DM
3378 s = startpos;
3379
e322109a 3380 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
3381 && !(flags & REXEC_CHECKED))
3382 {
dfa77d06 3383 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 3384 flags, NULL);
dfa77d06 3385 if (!s)
7fadf4a7
DM
3386 return 0;
3387
e322109a 3388 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
3389 /* we can match based purely on the result of INTUIT.
3390 * Set up captures etc just for $& and $-[0]
3391 * (an intuit-only match wont have $1,$2,..) */
3392 assert(!prog->nparens);
d5e7783a
DM
3393
3394 /* s/// doesn't like it if $& is earlier than where we asked it to
3395 * start searching (which can happen on something like /.\G/) */
3396 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3397 && (s < stringarg))
3398 {
3399 /* this should only be possible under \G */
58430ea8 3400 assert(prog->intflags & PREGf_GPOS_SEEN);
6ad9a8ab 3401 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d5e7783a
DM
3402 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3403 goto phooey;
3404 }
3405
7fadf4a7
DM
3406 /* match via INTUIT shouldn't have any captures.
3407 * Let @-, @+, $^N know */
3408 prog->lastparen = prog->lastcloseparen = 0;
196a02af 3409 RXp_MATCH_UTF8_set(prog, utf8_target);
3ff69bd6
DM
3410 prog->offs[0].start = s - strbeg;
3411 prog->offs[0].end = utf8_target
3412 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3413 : s - strbeg + prog->minlenret;
7fadf4a7 3414 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3415 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
3416 strbeg, strend,
3417 sv, flags, utf8_target);
3418
7fadf4a7
DM
3419 return 1;
3420 }
3421 }
3422
6c3fea77 3423 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 3424
dfa77d06 3425 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
6ad9a8ab 3426 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a72c7584
JH
3427 "String too short [regexec_flags]...\n"));
3428 goto phooey;
1aa99e6b 3429 }
1de06328 3430
6eb5f6b9 3431 /* Check validity of program. */
f8fc2ecf 3432 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
3433 Perl_croak(aTHX_ "corrupted regexp program");
3434 }
3435
196a02af
DM
3436 RXp_MATCH_TAINTED_off(prog);
3437 RXp_MATCH_UTF8_set(prog, utf8_target);
1738e041 3438
6c3fea77
DM
3439 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3440 reginfo->intuit = 0;
02d5137b
DM
3441 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3442 reginfo->warned = FALSE;
02d5137b 3443 reginfo->sv = sv;
1cb48e53 3444 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
6eb5f6b9 3445 /* see how far we have to get to not match where we matched before */
fe3974be 3446 reginfo->till = stringarg + minend;
6eb5f6b9 3447
60779a30 3448 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
3449 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3450 S_cleanup_regmatch_info_aux has executed (registered by
3451 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3452 magic belonging to this SV.
3453 Not newSVsv, either, as it does not COW.
3454 */
3455 reginfo->sv = newSV(0);
4cba5ac0 3456 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
3457 SAVEFREESV(reginfo->sv);
3458 }
3459
331b2dcc
DM
3460 /* reserve next 2 or 3 slots in PL_regmatch_state:
3461 * slot N+0: may currently be in use: skip it
3462 * slot N+1: use for regmatch_info_aux struct
3463 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3464 * slot N+3: ready for use by regmatch()
3465 */
bf2039a9 3466
331b2dcc
DM
3467 {
3468 regmatch_state *old_regmatch_state;
3469 regmatch_slab *old_regmatch_slab;
3470 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3471
3472 /* on first ever match, allocate first slab */
3473 if (!PL_regmatch_slab) {
3474 Newx(PL_regmatch_slab, 1, regmatch_slab);
3475 PL_regmatch_slab->prev = NULL;
3476 PL_regmatch_slab->next = NULL;
3477 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3478 }
bf2039a9 3479
331b2dcc
DM
3480 old_regmatch_state = PL_regmatch_state;
3481 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 3482
331b2dcc
DM
3483 for (i=0; i <= max; i++) {
3484 if (i == 1)
3485 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3486 else if (i ==2)
3487 reginfo->info_aux_eval =
3488 reginfo->info_aux->info_aux_eval =
3489 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3490
331b2dcc
DM
3491 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3492 PL_regmatch_state = S_push_slab(aTHX);
3493 }
bf2039a9 3494
331b2dcc
DM
3495 /* note initial PL_regmatch_state position; at end of match we'll
3496 * pop back to there and free any higher slabs */
bf2039a9 3497
331b2dcc
DM
3498 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3499 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3500 reginfo->info_aux->poscache = NULL;
bf2039a9 3501
331b2dcc 3502 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 3503
331b2dcc
DM
3504 if ((prog->extflags & RXf_EVAL_SEEN))
3505 S_setup_eval_state(aTHX_ reginfo);
3506 else
3507 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 3508 }
d3aa529c 3509
6eb5f6b9 3510 /* If there is a "must appear" string, look for it. */
6eb5f6b9 3511
288b8c02 3512 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
3513 /* We have to be careful. If the previous successful match
3514 was from this regex we don't want a subsequent partially
3515 successful match to clobber the old results.
3516 So when we detect this possibility we add a swap buffer
d8da0584
KW
3517 to the re, and switch the buffer each match. If we fail,
3518 we switch it back; otherwise we leave it swapped.
e9105d30
GG
3519 */
3520 swap = prog->offs;
3521 /* do we need a save destructor here for eval dies? */