This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.sym: Add node type ANYOF_POSIXL
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
95static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
7631e439
KW
99/* Returns a boolean as to whether the input unsigned number is a power of 2
100 * (2**0, 2**1, etc). In other words if it has just a single bit set.
101 * If not, subtracting 1 would leave the uppermost bit set, so the & would
102 * yield non-zero */
103#define isPOWER_OF_2(n) ((n & (n-1)) == 0)
104
7b031478 105#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
6ad9a8ab 106 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
7b031478 107 goto target; \
e1cf74e3
CB
108} STMT_END
109
c74f6de9
KW
110#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
111
a687059c
LW
112#ifndef STATIC
113#define STATIC static
114#endif
115
451c6e0b
KW
116/* Valid only if 'c', the character being looke-up, is an invariant under
117 * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
118 * everything matchable is straight forward in the bitmap */
119#define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
120 ? reginclass(prog,p,c,c+1,u) \
121 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 122
c277df42
IZ
123/*
124 * Forwards.
125 */
126
f2ed9b32 127#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 128
3dab1dad 129#define HOPc(pos,off) \
ba44c216 130 (char *)(reginfo->is_utf8_target \
220db18a 131 ? reghop3((U8*)pos, off, \
9d9163fb 132 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 133 : (U8*)(pos + off))
557f47af 134
bb152a4b
DM
135/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
136#define HOPBACK3(pos, off, lim) \
137 (reginfo->is_utf8_target \
138 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
139 : (pos - off >= lim) \
140 ? (U8*)pos - off \
3dab1dad 141 : NULL)
efb30f32 142
bb152a4b
DM
143#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
144
ba44c216 145#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 146#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 147
557f47af
DM
148/* lim must be +ve. Returns NULL on overshoot */
149#define HOPMAYBE3(pos,off,lim) \
150 (reginfo->is_utf8_target \
151 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
152 : ((U8*)pos + off <= lim) \
153 ? (U8*)pos + off \
154 : NULL)
155
8e9f2289
DM
156/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
157 * off must be >=0; args should be vars rather than expressions */
158#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
159 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
160 : (U8*)((pos + off) > lim ? lim : (pos + off)))
67853908 161#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
8e9f2289 162
2974eaec
DM
163#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
164 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
165 : (U8*)(pos + off))
166#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
167
168#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
169#define NEXTCHR_IS_EOS (nextchr < 0)
170
171#define SET_nextchr \
220db18a 172 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
173
174#define SET_locinput(p) \
175 locinput = (p); \
176 SET_nextchr
177
c7304fe2 178#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
179/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
180
5f80c4cf 181/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
182/* it would be nice to rework regcomp.sym to generate this stuff. sigh
183 *
184 * NOTE that *nothing* that affects backtracking should be in here, specifically
185 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
186 * node that is in between two EXACT like nodes when ascertaining what the required
187 * "follow" character is. This should probably be moved to regex compile time
188 * although it may be done at run time beause of the REF possibility - more
189 * investigation required. -- demerphq
190*/
baa60164
KW
191#define JUMPABLE(rn) ( \
192 OP(rn) == OPEN || \
24be3102
YO
193 (OP(rn) == CLOSE && \
194 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
baa60164
KW
195 OP(rn) == EVAL || \
196 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
197 OP(rn) == PLUS || OP(rn) == MINMOD || \
198 OP(rn) == KEEPS || \
199 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 200)
ee9b8eae 201#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 202
ee9b8eae
YO
203#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
204
205#if 0
206/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
a4525e78 207 we don't need this definition. XXX These are now out-of-sync*/
ee9b8eae 208#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
89829bb5 209#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFAA || OP(rn)==EXACTFAA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
210#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
211
212#else
213/* ... so we use this as its faster. */
a4525e78 214#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
89829bb5 215#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFAA || OP(rn) == EXACTFAA_NO_TRIE)
ee9b8eae
YO
216#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
217#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
218
219#endif
e2d8ce26 220
a84d97b6
HS
221/*
222 Search for mandatory following text node; for lookahead, the text must
223 follow but for lookbehind (rn->flags != 0) we skip to the next step.
224*/
baa60164 225#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
226 while (JUMPABLE(rn)) { \
227 const OPCODE type = OP(rn); \
228 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 229 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 230 else if (type == PLUS) \
cca55fe3 231 rn = NEXTOPER(rn); \
3dab1dad 232 else if (type == IFMATCH) \
a84d97b6 233 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 234 else rn += NEXT_OFF(rn); \
3dab1dad 235 } \
5f80c4cf 236} STMT_END
74750237 237
006f26b2
DM
238#define SLAB_FIRST(s) (&(s)->states[0])
239#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
240
a75351a1 241static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 242static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 243static regmatch_state * S_push_slab(pTHX);
51371543 244
87c0511b 245#define REGCP_PAREN_ELEMS 3
f067efbf 246#define REGCP_OTHER_ELEMS 3
e0fa7e2b 247#define REGCP_FRAME_ELEMS 1
620d5b66
NC
248/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
249 * are needed for the regexp context stack bookkeeping. */
250
76e3520e 251STATIC CHECKPOINT
21553840 252S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
a0d0e21e 253{
a3b680e6 254 const int retval = PL_savestack_ix;
92da3157
DM
255 const int paren_elems_to_push =
256 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
257 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
258 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 259 I32 p;
40a82448 260 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 261
b93070ed
DM
262 PERL_ARGS_ASSERT_REGCPPUSH;
263
e49a9654 264 if (paren_elems_to_push < 0)
e8a85d26
JH
265 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
266 (int)paren_elems_to_push, (int)maxopenparen,
267 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 268
e0fa7e2b 269 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
147e3846 270 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
5df417d0 271 " out of range (%lu-%ld)",
92da3157
DM
272 total_elems,
273 (unsigned long)maxopenparen,
274 (long)parenfloor);
e0fa7e2b 275
620d5b66 276 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 277
495f47a5 278 DEBUG_BUFFERS_r(
92da3157 279 if ((int)maxopenparen > (int)parenfloor)
2b1a3689 280 Perl_re_exec_indentf( aTHX_
147e3846 281 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
2b1a3689
YO
282 depth,
283 PTR2UV(rex),
495f47a5
DM
284 PTR2UV(rex->offs)
285 );
286 );
92da3157 287 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 288/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
289 SSPUSHIV(rex->offs[p].end);
290 SSPUSHIV(rex->offs[p].start);
1ca2007e 291 SSPUSHINT(rex->offs[p].start_tmp);
2b1a3689 292 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 293 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
2b1a3689
YO
294 depth,
295 (UV)p,
495f47a5
DM
296 (IV)rex->offs[p].start,
297 (IV)rex->offs[p].start_tmp,
298 (IV)rex->offs[p].end
40a82448 299 ));
a0d0e21e 300 }
b1ce53c5 301/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 302 SSPUSHINT(maxopenparen);
b93070ed
DM
303 SSPUSHINT(rex->lastparen);
304 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 305 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 306
a0d0e21e
LW
307 return retval;
308}
309
c277df42 310/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
311#define REGCP_SET(cp) \
312 DEBUG_STATE_r( \
147e3846
KW
313 Perl_re_exec_indentf( aTHX_ \
314 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
cb41e5d6
YO
315 depth, (IV)PL_savestack_ix \
316 ) \
317 ); \
ab3bbdeb 318 cp = PL_savestack_ix
c3464db5 319
ab3bbdeb 320#define REGCP_UNWIND(cp) \
e4f74956 321 DEBUG_STATE_r( \
cb41e5d6 322 if (cp != PL_savestack_ix) \
147e3846
KW
323 Perl_re_exec_indentf( aTHX_ \
324 "Clearing an EVAL scope, savestack=%" \
325 IVdf "..%" IVdf "\n", \
cb41e5d6
YO
326 depth, (IV)(cp), (IV)PL_savestack_ix \
327 ) \
328 ); \
ab3bbdeb 329 regcpblow(cp)
c277df42 330
8e9ed7cc
DM
331/* set the start and end positions of capture ix */
332#define CLOSE_CAPTURE(ix, s, e) \
333 rex->offs[ix].start = s; \
334 rex->offs[ix].end = e; \
3be2a9fd
DM
335 if (ix > rex->lastparen) \
336 rex->lastparen = ix; \
337 rex->lastcloseparen = ix; \
69cd2617 338 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
befca383 339 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
69cd2617
DM
340 depth, \
341 PTR2UV(rex), \
342 PTR2UV(rex->offs), \
8e9ed7cc
DM
343 (UV)ix, \
344 (IV)rex->offs[ix].start, \
3be2a9fd
DM
345 (IV)rex->offs[ix].end, \
346 (UV)rex->lastparen \
69cd2617
DM
347 ))
348
a8d1f4b4 349#define UNWIND_PAREN(lp, lcp) \
befca383
DM
350 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
351 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
352 depth, \
353 PTR2UV(rex), \
354 PTR2UV(rex->offs), \
355 (UV)(lp), \
356 (UV)(rex->lastparen), \
357 (UV)(lcp) \
358 )); \
a8d1f4b4
DM
359 for (n = rex->lastparen; n > lp; n--) \
360 rex->offs[n].end = -1; \
361 rex->lastparen = n; \
362 rex->lastcloseparen = lcp;
363
364
f067efbf 365STATIC void
21553840 366S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
a0d0e21e 367{
e0fa7e2b 368 UV i;
87c0511b 369 U32 paren;
a3621e74
YO
370 GET_RE_DEBUG_FLAGS_DECL;
371
7918f24d
NC
372 PERL_ARGS_ASSERT_REGCPPOP;
373
b1ce53c5 374 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 375 i = SSPOPUV;
e0fa7e2b
NC
376 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
377 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
378 rex->lastcloseparen = SSPOPINT;
379 rex->lastparen = SSPOPINT;
92da3157 380 *maxopenparen_p = SSPOPINT;
b1ce53c5 381
620d5b66 382 i -= REGCP_OTHER_ELEMS;
b1ce53c5 383 /* Now restore the parentheses context. */
495f47a5
DM
384 DEBUG_BUFFERS_r(
385 if (i || rex->lastparen + 1 <= rex->nparens)
2b1a3689 386 Perl_re_exec_indentf( aTHX_
147e3846 387 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
2b1a3689
YO
388 depth,
389 PTR2UV(rex),
495f47a5
DM
390 PTR2UV(rex->offs)
391 );
392 );
92da3157 393 paren = *maxopenparen_p;
620d5b66 394 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 395 SSize_t tmps;
1ca2007e 396 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
397 rex->offs[paren].start = SSPOPIV;
398 tmps = SSPOPIV;
b93070ed
DM
399 if (paren <= rex->lastparen)
400 rex->offs[paren].end = tmps;
2b1a3689 401 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 402 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
2b1a3689
YO
403 depth,
404 (UV)paren,
495f47a5
DM
405 (IV)rex->offs[paren].start,
406 (IV)rex->offs[paren].start_tmp,
407 (IV)rex->offs[paren].end,
408 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 409 );
87c0511b 410 paren--;
a0d0e21e 411 }
daf18116 412#if 1
dafc8851
JH
413 /* It would seem that the similar code in regtry()
414 * already takes care of this, and in fact it is in
415 * a better location to since this code can #if 0-ed out
416 * but the code in regtry() is needed or otherwise tests
417 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
418 * (as of patchlevel 7877) will fail. Then again,
419 * this code seems to be necessary or otherwise
225593e1
DM
420 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
421 * --jhi updated by dapm */
b93070ed 422 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 423 if (i > *maxopenparen_p)
b93070ed
DM
424 rex->offs[i].start = -1;
425 rex->offs[i].end = -1;
2b1a3689 426 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 427 " \\%" UVuf ": %s ..-1 undeffing\n",
2b1a3689
YO
428 depth,
429 (UV)i,
92da3157 430 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 431 ));
a0d0e21e 432 }
dafc8851 433#endif
a0d0e21e
LW
434}
435
74088413
DM
436/* restore the parens and associated vars at savestack position ix,
437 * but without popping the stack */
438
439STATIC void
21553840 440S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
74088413
DM
441{
442 I32 tmpix = PL_savestack_ix;
85882954
YO
443 PERL_ARGS_ASSERT_REGCP_RESTORE;
444
74088413 445 PL_savestack_ix = ix;
21553840 446 regcppop(rex, maxopenparen_p);
74088413
DM
447 PL_savestack_ix = tmpix;
448}
449
02db2b7b 450#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 451
9637d2a5
CB
452#ifndef PERL_IN_XSUB_RE
453
24e16d7b
KW
454bool
455Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
31c7f561
KW
456{
457 /* Returns a boolean as to whether or not 'character' is a member of the
458 * Posix character class given by 'classnum' that should be equivalent to a
459 * value in the typedef '_char_class_number'.
460 *
461 * Ideally this could be replaced by a just an array of function pointers
462 * to the C library functions that implement the macros this calls.
463 * However, to compile, the precise function signatures are required, and
464 * these may vary from platform to to platform. To avoid having to figure
465 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
466 * which adds an extra layer of function call overhead (unless the C
467 * optimizer strips it away). But we don't particularly care about
468 * performance with locales anyway. */
31c7f561
KW
469
470 switch ((_char_class_number) classnum) {
15861f94 471 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 472 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
473 case _CC_ENUM_ASCII: return isASCII_LC(character);
474 case _CC_ENUM_BLANK: return isBLANK_LC(character);
cee69f79 475 case _CC_ENUM_CASED: return isLOWER_LC(character)
b0d691b2 476 || isUPPER_LC(character);
e8d596e0 477 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
478 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
479 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
480 case _CC_ENUM_LOWER: return isLOWER_LC(character);
481 case _CC_ENUM_PRINT: return isPRINT_LC(character);
482 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 483 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
484 case _CC_ENUM_UPPER: return isUPPER_LC(character);
485 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 486 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
487 default: /* VERTSPACE should never occur in locales */
488 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
489 }
490
e5964223 491 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
492 return FALSE;
493}
494
9637d2a5
CB
495#endif
496
3018b823 497STATIC bool
a78c2fa6 498S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
3018b823
KW
499{
500 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501 * 'character' is a member of the Posix character class given by 'classnum'
502 * that should be equivalent to a value in the typedef
503 * '_char_class_number'.
504 *
505 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 506 * the range 0-255. Outside that range, all characters use Unicode
3018b823
KW
507 * rules, ignoring any locale. So use the Unicode function if this class
508 * requires a swash, and use the Unicode macro otherwise. */
509
510 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
511
512 if (UTF8_IS_INVARIANT(*character)) {
513 return isFOO_lc(classnum, *character);
514 }
515 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
516 return isFOO_lc(classnum,
a62b247b 517 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
518 }
519
a78c2fa6 520 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
613abc6d 521
3018b823 522 switch ((_char_class_number) classnum) {
779cf272 523 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
524 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
525 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
526 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
4d3d8522
KW
527 default:
528 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
a78c2fa6 529 utf8_to_uvchr_buf(character, e, NULL));
3018b823
KW
530 }
531
e1ee3960 532 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
533}
534
aff4cafe
KW
535STATIC char *
536S_find_next_ascii(char * s, const char * send, const bool utf8_target)
537{
538 /* Returns the position of the first ASCII byte in the sequence between 's'
539 * and 'send-1' inclusive; returns 'send' if none found */
540
541 PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
542
597ee3f4 543#ifndef EBCDIC
aff4cafe
KW
544
545 if ((STRLEN) (send - s) >= PERL_WORDSIZE
546
547 /* This term is wordsize if subword; 0 if not */
548 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
549
550 /* 'offset' */
551 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
552 {
553
554 /* Process per-byte until reach word boundary. XXX This loop could be
555 * eliminated if we knew that this platform had fast unaligned reads */
556 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
557 if (isASCII(*s)) {
558 return s;
559 }
560 s++; /* khw didn't bother creating a separate loop for
561 utf8_target */
562 }
563
564 /* Here, we know we have at least one full word to process. Process
565 * per-word as long as we have at least a full word left */
566 do {
1d2af574
KW
567 PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
568 if (complemented & PERL_VARIANTS_WORD_MASK) {
569
290746ac
KW
570# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
571 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574
KW
572
573 s += _variant_byte_number(complemented);
574 return s;
575
290746ac 576# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
577 checks. */
578
aff4cafe 579 break;
290746ac 580# endif
aff4cafe 581 }
1d2af574 582
aff4cafe 583 s += PERL_WORDSIZE;
1d2af574 584
aff4cafe
KW
585 } while (s + PERL_WORDSIZE <= send);
586 }
587
588#endif
589
590 /* Process per-character */
591 if (utf8_target) {
592 while (s < send) {
593 if (isASCII(*s)) {
594 return s;
595 }
596 s += UTF8SKIP(s);
597 }
598 }
599 else {
600 while (s < send) {
601 if (isASCII(*s)) {
602 return s;
603 }
604 s++;
605 }
606 }
607
608 return s;
609}
610
611STATIC char *
612S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
613{
614 /* Returns the position of the first non-ASCII byte in the sequence between
615 * 's' and 'send-1' inclusive; returns 'send' if none found */
616
617#ifdef EBCDIC
618
619 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
620
621 if (utf8_target) {
622 while (s < send) {
623 if ( ! isASCII(*s)) {
624 return s;
625 }
626 s += UTF8SKIP(s);
627 }
628 }
629 else {
630 while (s < send) {
631 if ( ! isASCII(*s)) {
632 return s;
633 }
634 s++;
635 }
636 }
637
638 return s;
639
640#else
641
642 const U8 * next_non_ascii = NULL;
643
644 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
645 PERL_UNUSED_ARG(utf8_target);
646
647 /* On ASCII platforms invariants and ASCII are identical, so if the string
648 * is entirely invariants, there is no non-ASCII character */
649 return (is_utf8_invariant_string_loc((U8 *) s,
650 (STRLEN) (send - s),
651 &next_non_ascii))
652 ? (char *) send
653 : (char *) next_non_ascii;
654
655#endif
656
657}
658
2b1f9c71
KW
659STATIC U8 *
660S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
ab1efbdc
KW
661{
662 /* Returns the position of the first byte in the sequence between 's' and
663 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
664 * */
665
666 PERL_ARGS_ASSERT_FIND_SPAN_END;
667
668 assert(send >= s);
669
670 if ((STRLEN) (send - s) >= PERL_WORDSIZE
671 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
672 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
673 {
674 PERL_UINTMAX_T span_word;
675
676 /* Process per-byte until reach word boundary. XXX This loop could be
677 * eliminated if we knew that this platform had fast unaligned reads */
678 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
679 if (*s != span_byte) {
680 return s;
681 }
682 s++;
683 }
684
685 /* Create a word filled with the bytes we are spanning */
686 span_word = PERL_COUNT_MULTIPLIER * span_byte;
687
688 /* Process per-word as long as we have at least a full word left */
689 do {
690
691 /* Keep going if the whole word is composed of 'span_byte's */
692 if ((* (PERL_UINTMAX_T *) s) == span_word) {
693 s += PERL_WORDSIZE;
694 continue;
695 }
696
23a7ee81
KW
697 /* Here, at least one byte in the word isn't 'span_byte'. */
698
699#ifdef EBCDIC
700
701 break;
702
703#else
704
705 /* This xor leaves 1 bits only in those non-matching bytes */
ab1efbdc
KW
706 span_word ^= * (PERL_UINTMAX_T *) s;
707
708 /* Make sure the upper bit of each non-matching byte is set. This
709 * makes each such byte look like an ASCII platform variant byte */
710 span_word |= span_word << 1;
711 span_word |= span_word << 2;
712 span_word |= span_word << 4;
713
714 /* That reduces the problem to what this function solves */
715 return s + _variant_byte_number(span_word);
716
23a7ee81
KW
717#endif
718
ab1efbdc
KW
719 } while (s + PERL_WORDSIZE <= send);
720 }
721
722 /* Process the straggler bytes beyond the final word boundary */
723 while (s < send) {
724 if (*s != span_byte) {
725 return s;
726 }
727 s++;
728 }
729
730 return s;
731}
732
2b1f9c71
KW
733STATIC U8 *
734S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
2813d4ad
KW
735{
736 /* Returns the position of the first byte in the sequence between 's'
737 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
738 * returns 'send' if none found. It uses word-level operations instead of
739 * byte to speed up the process */
740
741 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
742
743 assert(send >= s);
744 assert((byte & mask) == byte);
745
23a7ee81
KW
746#ifndef EBCDIC
747
2813d4ad
KW
748 if ((STRLEN) (send - s) >= PERL_WORDSIZE
749 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
750 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
751 {
752 PERL_UINTMAX_T word_complemented, mask_word;
753
754 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
2b1f9c71 755 if (((*s) & mask) == byte) {
2813d4ad
KW
756 return s;
757 }
758 s++;
759 }
760
761 word_complemented = ~ (PERL_COUNT_MULTIPLIER * byte);
762 mask_word = PERL_COUNT_MULTIPLIER * mask;
763
764 do {
765 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
766
767 /* If 'masked' contains 'byte' within it, anding with the
768 * complement will leave those 8 bits 0 */
769 masked &= word_complemented;
770
771 /* This causes the most significant bit to be set to 1 for any
772 * bytes in the word that aren't completely 0 */
773 masked |= masked << 1;
774 masked |= masked << 2;
775 masked |= masked << 4;
776
777 /* The msbits are the same as what marks a byte as variant, so we
778 * can use this mask. If all msbits are 1, the word doesn't
779 * contain 'byte' */
780 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
781 s += PERL_WORDSIZE;
782 continue;
783 }
784
785 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
786 * and any that are, are 0. Complement and re-AND to swap that */
787 masked = ~ masked;
788 masked &= PERL_VARIANTS_WORD_MASK;
789
790 /* This reduces the problem to that solved by this function */
791 s += _variant_byte_number(masked);
792 return s;
793
794 } while (s + PERL_WORDSIZE <= send);
795 }
796
23a7ee81
KW
797#endif
798
2813d4ad 799 while (s < send) {
2b1f9c71 800 if (((*s) & mask) == byte) {
2813d4ad
KW
801 return s;
802 }
803 s++;
804 }
805
806 return s;
807}
808
070e8b2e
KW
809STATIC U8 *
810S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
811{
812 /* Returns the position of the first byte in the sequence between 's' and
813 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
814 * 'span_byte' should have been ANDed with 'mask' in the call of this
815 * function. Returns 'send' if none found. Works like find_span_end(),
816 * except for the AND */
817
818 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
819
820 assert(send >= s);
821 assert((span_byte & mask) == span_byte);
822
823 if ((STRLEN) (send - s) >= PERL_WORDSIZE
824 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
825 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
826 {
827 PERL_UINTMAX_T span_word, mask_word;
828
829 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
2b1f9c71 830 if (((*s) & mask) != span_byte) {
070e8b2e
KW
831 return s;
832 }
833 s++;
834 }
835
836 span_word = PERL_COUNT_MULTIPLIER * span_byte;
837 mask_word = PERL_COUNT_MULTIPLIER * mask;
838
839 do {
840 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
841
842 if (masked == span_word) {
843 s += PERL_WORDSIZE;
844 continue;
845 }
846
23a7ee81
KW
847#ifdef EBCDIC
848
849 break;
850
851#else
852
070e8b2e
KW
853 masked ^= span_word;
854 masked |= masked << 1;
855 masked |= masked << 2;
856 masked |= masked << 4;
857 return s + _variant_byte_number(masked);
858
23a7ee81
KW
859#endif
860
070e8b2e
KW
861 } while (s + PERL_WORDSIZE <= send);
862 }
863
864 while (s < send) {
2b1f9c71 865 if (((*s) & mask) != span_byte) {
070e8b2e
KW
866 return s;
867 }
868 s++;
869 }
870
871 return s;
872}
873
a687059c 874/*
e50aee73 875 * pregexec and friends
a687059c
LW
876 */
877
76234dfb 878#ifndef PERL_IN_XSUB_RE
a687059c 879/*
c277df42 880 - pregexec - match a regexp against a string
a687059c 881 */
c277df42 882I32
5aaab254 883Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 884 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
885/* stringarg: the point in the string at which to begin matching */
886/* strend: pointer to null at end of string */
887/* strbeg: real beginning of string */
888/* minend: end of match must be >= minend bytes after stringarg. */
889/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
890 * itself is accessed via the pointers above */
891/* nosave: For optimizations. */
c277df42 892{
7918f24d
NC
893 PERL_ARGS_ASSERT_PREGEXEC;
894
c277df42 895 return
9041c2e3 896 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
897 nosave ? 0 : REXEC_COPY_STR);
898}
76234dfb 899#endif
22e551b9 900
cad2e5aa 901
6eb5f6b9 902
1a4edc3c
DM
903/* re_intuit_start():
904 *
905 * Based on some optimiser hints, try to find the earliest position in the
906 * string where the regex could match.
907 *
908 * rx: the regex to match against
909 * sv: the SV being matched: only used for utf8 flag; the string
910 * itself is accessed via the pointers below. Note that on
911 * something like an overloaded SV, SvPOK(sv) may be false
912 * and the string pointers may point to something unrelated to
913 * the SV itself.
914 * strbeg: real beginning of string
915 * strpos: the point in the string at which to begin matching
916 * strend: pointer to the byte following the last char of the string
917 * flags currently unused; set to 0
918 * data: currently unused; set to NULL
919 *
920 * The basic idea of re_intuit_start() is to use some known information
921 * about the pattern, namely:
922 *
923 * a) the longest known anchored substring (i.e. one that's at a
924 * constant offset from the beginning of the pattern; but not
925 * necessarily at a fixed offset from the beginning of the
926 * string);
927 * b) the longest floating substring (i.e. one that's not at a constant
928 * offset from the beginning of the pattern);
929 * c) Whether the pattern is anchored to the string; either
930 * an absolute anchor: /^../, or anchored to \n: /^.../m,
931 * or anchored to pos(): /\G/;
932 * d) A start class: a real or synthetic character class which
933 * represents which characters are legal at the start of the pattern;
934 *
935 * to either quickly reject the match, or to find the earliest position
936 * within the string at which the pattern might match, thus avoiding
937 * running the full NFA engine at those earlier locations, only to
938 * eventually fail and retry further along.
939 *
940 * Returns NULL if the pattern can't match, or returns the address within
941 * the string which is the earliest place the match could occur.
942 *
943 * The longest of the anchored and floating substrings is called 'check'
944 * and is checked first. The other is called 'other' and is checked
945 * second. The 'other' substring may not be present. For example,
946 *
947 * /(abc|xyz)ABC\d{0,3}DEFG/
948 *
949 * will have
950 *
951 * check substr (float) = "DEFG", offset 6..9 chars
952 * other substr (anchored) = "ABC", offset 3..3 chars
953 * stclass = [ax]
954 *
955 * Be aware that during the course of this function, sometimes 'anchored'
956 * refers to a substring being anchored relative to the start of the
957 * pattern, and sometimes to the pattern itself being anchored relative to
958 * the string. For example:
959 *
960 * /\dabc/: "abc" is anchored to the pattern;
961 * /^\dabc/: "abc" is anchored to the pattern and the string;
962 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
963 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
964 * but the pattern is anchored to the string.
52a21eb3
DM
965 */
966
cad2e5aa 967char *
52a21eb3
DM
968Perl_re_intuit_start(pTHX_
969 REGEXP * const rx,
970 SV *sv,
971 const char * const strbeg,
972 char *strpos,
973 char *strend,
974 const U32 flags,
975 re_scream_pos_data *data)
cad2e5aa 976{
8d919b0a 977 struct regexp *const prog = ReANY(rx);
b2ad2123
DM
978 SSize_t start_shift = prog->check_offset_min;
979 /* Should be nonnegative! */
980 SSize_t end_shift = 0;
0fc004dd
DM
981 /* current lowest pos in string where the regex can start matching */
982 char *rx_origin = strpos;
eb578fdb 983 SV *check;
f2ed9b32 984 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 985 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 986 bool ml_anch = 0;
8f4bf5fc 987 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 988 char *check_at = NULL; /* check substr found at this pos */
bbe252da 989 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 990 RXi_GET_DECL(prog,progi);
02d5137b
DM
991 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
992 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
993 GET_RE_DEBUG_FLAGS_DECL;
994
7918f24d 995 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
996 PERL_UNUSED_ARG(flags);
997 PERL_UNUSED_ARG(data);
7918f24d 998
6ad9a8ab 999 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0
DM
1000 "Intuit: trying to determine minimum start position...\n"));
1001
fb9bbddb 1002 /* for now, assume that all substr offsets are positive. If at some point
f67a5002 1003 * in the future someone wants to do clever things with lookbehind and
fb9bbddb
DM
1004 * -ve offsets, they'll need to fix up any code in this function
1005 * which uses these offsets. See the thread beginning
1006 * <20140113145929.GF27210@iabyn.com>
1007 */
1008 assert(prog->substrs->data[0].min_offset >= 0);
1009 assert(prog->substrs->data[0].max_offset >= 0);
1010 assert(prog->substrs->data[1].min_offset >= 0);
1011 assert(prog->substrs->data[1].max_offset >= 0);
1012 assert(prog->substrs->data[2].min_offset >= 0);
1013 assert(prog->substrs->data[2].max_offset >= 0);
1014
f7022b5a 1015 /* for now, assume that if both present, that the floating substring
83f2232d 1016 * doesn't start before the anchored substring.
f7022b5a
DM
1017 * If you break this assumption (e.g. doing better optimisations
1018 * with lookahead/behind), then you'll need to audit the code in this
1019 * function carefully first
1020 */
1021 assert(
1022 ! ( (prog->anchored_utf8 || prog->anchored_substr)
1023 && (prog->float_utf8 || prog->float_substr))
1024 || (prog->float_min_offset >= prog->anchored_offset));
1025
1a4edc3c
DM
1026 /* byte rather than char calculation for efficiency. It fails
1027 * to quickly reject some cases that can't match, but will reject
1028 * them later after doing full char arithmetic */
c344f387 1029 if (prog->minlen > strend - strpos) {
6ad9a8ab 1030 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1031 " String too short...\n"));
cad2e5aa 1032 goto fail;
2c2d71f5 1033 }
d8da0584 1034
196a02af 1035 RXp_MATCH_UTF8_set(prog, utf8_target);
6c3fea77 1036 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 1037 reginfo->info_aux = NULL;
9d9163fb 1038 reginfo->strbeg = strbeg;
220db18a 1039 reginfo->strend = strend;
aed7b151 1040 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 1041 reginfo->intuit = 1;
1cb48e53
DM
1042 /* not actually used within intuit, but zero for safety anyway */
1043 reginfo->poscache_maxiter = 0;
02d5137b 1044
f2ed9b32 1045 if (utf8_target) {
2814f4b3
HS
1046 if ((!prog->anchored_utf8 && prog->anchored_substr)
1047 || (!prog->float_utf8 && prog->float_substr))
33b8afdf
JH
1048 to_utf8_substr(prog);
1049 check = prog->check_utf8;
1050 } else {
7e0d5ad7
KW
1051 if (!prog->check_substr && prog->check_utf8) {
1052 if (! to_byte_substr(prog)) {
6b54ddc5 1053 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
1054 }
1055 }
33b8afdf
JH
1056 check = prog->check_substr;
1057 }
274cd312 1058
1dc475d0
DM
1059 /* dump the various substring data */
1060 DEBUG_OPTIMISE_MORE_r({
1061 int i;
1062 for (i=0; i<=2; i++) {
1063 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1064 : prog->substrs->data[i].substr);
1065 if (!sv)
1066 continue;
1067
6ad9a8ab 1068 Perl_re_printf( aTHX_
147e3846
KW
1069 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1070 " useful=%" IVdf " utf8=%d [%s]\n",
1dc475d0
DM
1071 i,
1072 (IV)prog->substrs->data[i].min_offset,
1073 (IV)prog->substrs->data[i].max_offset,
1074 (IV)prog->substrs->data[i].end_shift,
1075 BmUSEFUL(sv),
1076 utf8_target ? 1 : 0,
1077 SvPEEK(sv));
1078 }
1079 });
1080
8e1490ee 1081 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
1082
1083 /* ml_anch: check after \n?
1084 *
0fa70a06 1085 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
1086 * with /.*.../, these flags will have been added by the
1087 * compiler:
1088 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
1089 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
1090 */
7d2d37f5
DM
1091 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
1092 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 1093
343c8a29 1094 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
1095 /* we are only allowed to match at BOS or \G */
1096
57fcbfa7 1097 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
1098 *
1099 * Note that we don't try to do a similar quick reject for
1100 * \G, since generally the caller will have calculated strpos
1101 * based on pos() and gofs, so the string is already correctly
1102 * anchored by definition; and handling the exceptions would
1103 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 1104 */
7bb3b9eb 1105 if ( strpos != strbeg
d3d47aac 1106 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8 1107 {
6ad9a8ab 1108 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1109 " Not at start...\n"));
c889ccc8
DM
1110 goto fail;
1111 }
1112
a5d12a4b
DM
1113 /* in the presence of an anchor, the anchored (relative to the
1114 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
1115 * to strpos. So quickly reject if substr isn't found there.
1116 * This works for \G too, because the caller will already have
1117 * subtracted gofs from pos, and gofs is the offset from the
1118 * \G to the start of the regex. For example, in /.abc\Gdef/,
1119 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1120 * caller will have set strpos=pos()-4; we look for the substr
1121 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 1122
33c28ab2 1123 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 1124 /* Substring at constant offset from beg-of-str... */
b2ad2123 1125 SSize_t slen = SvCUR(check);
343c8a29 1126 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 1127
6ad9a8ab 1128 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1129 " Looking for check substr at fixed offset %" IVdf "...\n",
1dc475d0
DM
1130 (IV)prog->check_offset_min));
1131
7742aa66
DM
1132 if (SvTAIL(check)) {
1133 /* In this case, the regex is anchored at the end too.
1134 * Unless it's a multiline match, the lengths must match
b2ad2123 1135 * exactly, give or take a \n. NB: slen >= 1 since
7742aa66
DM
1136 * the last char of check is \n */
1137 if (!multiline
b2ad2123
DM
1138 && ( strend - s > slen
1139 || strend - s < slen - 1
1140 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8 1141 {
6ad9a8ab 1142 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1143 " String too long...\n"));
c889ccc8
DM
1144 goto fail_finish;
1145 }
b2ad2123
DM
1146 /* Now should match s[0..slen-2] */
1147 slen--;
c889ccc8 1148 }
b2ad2123 1149 if (slen && (strend - s < slen
26fb2318 1150 || *SvPVX_const(check) != *s
b2ad2123 1151 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
d307bf57 1152 {
6ad9a8ab 1153 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1154 " String not equal...\n"));
d307bf57
DM
1155 goto fail_finish;
1156 }
c889ccc8
DM
1157
1158 check_at = s;
1159 goto success_at_start;
cad2e5aa 1160 }
cad2e5aa 1161 }
cad2e5aa 1162 }
0fc004dd 1163
b2ad2123 1164 end_shift = prog->check_end_shift;
cad2e5aa 1165
19188028 1166#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
b2ad2123 1167 if (end_shift < 0)
147e3846 1168 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
b2ad2123 1169 (IV)end_shift, RX_PRECOMP(rx));
2c2d71f5
JH
1170#endif
1171
2c2d71f5 1172 restart:
1de06328 1173
66b7ec5c
DM
1174 /* This is the (re)entry point of the main loop in this function.
1175 * The goal of this loop is to:
1176 * 1) find the "check" substring in the region rx_origin..strend
b2ad2123 1177 * (adjusted by start_shift / end_shift). If not found, reject
66b7ec5c
DM
1178 * immediately.
1179 * 2) If it exists, look for the "other" substr too if defined; for
1180 * example, if the check substr maps to the anchored substr, then
1181 * check the floating substr, and vice-versa. If not found, go
1182 * back to (1) with rx_origin suitably incremented.
1183 * 3) If we find an rx_origin position that doesn't contradict
1184 * either of the substrings, then check the possible additional
1185 * constraints on rx_origin of /^.../m or a known start class.
1186 * If these fail, then depending on which constraints fail, jump
1187 * back to here, or to various other re-entry points further along
1188 * that skip some of the first steps.
1189 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1190 * substring. If the start position was determined to be at the
1191 * beginning of the string - so, not rejected, but not optimised,
1192 * since we have to run regmatch from position 0 - decrement the
1193 * BmUSEFUL() count. Otherwise increment it.
1194 */
1195
1a4edc3c
DM
1196
1197 /* first, look for the 'check' substring */
1198
1de06328 1199 {
c33e64f0
FC
1200 U8* start_point;
1201 U8* end_point;
c889ccc8 1202
c889ccc8 1203 DEBUG_OPTIMISE_MORE_r({
6ad9a8ab 1204 Perl_re_printf( aTHX_
147e3846
KW
1205 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1206 " Start shift: %" IVdf " End shift %" IVdf
1207 " Real end Shift: %" IVdf "\n",
675e93ee 1208 (IV)(rx_origin - strbeg),
c889ccc8 1209 (IV)prog->check_offset_min,
b2ad2123
DM
1210 (IV)start_shift,
1211 (IV)end_shift,
c889ccc8
DM
1212 (IV)prog->check_end_shift);
1213 });
1de06328 1214
b2ad2123 1215 end_point = HOPBACK3(strend, end_shift, rx_origin);
bb152a4b
DM
1216 if (!end_point)
1217 goto fail_finish;
b2ad2123 1218 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
33c28ab2
DM
1219 if (!start_point)
1220 goto fail_finish;
c889ccc8 1221
557f47af 1222
e0362b86 1223 /* If the regex is absolutely anchored to either the start of the
d3d47aac 1224 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
1225 * check_offset_max represents an upper bound on the string where
1226 * the substr could start. For the ANCH_GPOS case, we assume that
1227 * the caller of intuit will have already set strpos to
1228 * pos()-gofs, so in this case strpos + offset_max will still be
1229 * an upper bound on the substr.
1230 */
c19c836a
DM
1231 if (!ml_anch
1232 && prog->intflags & PREGf_ANCH
e0362b86 1233 && prog->check_offset_max != SSize_t_MAX)
c19c836a 1234 {
b2ad2123 1235 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
1236 const char * const anchor =
1237 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
b2ad2123 1238 SSize_t targ_len = (char*)end_point - anchor;
2ce94a86 1239
b2ad2123 1240 if (check_len > targ_len) {
2ce94a86 1241 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
12453e29 1242 "Target string too short to match required substring...\n"));
2ce94a86
DM
1243 goto fail_finish;
1244 }
e0362b86
DM
1245
1246 /* do a bytes rather than chars comparison. It's conservative;
1247 * so it skips doing the HOP if the result can't possibly end
1248 * up earlier than the old value of end_point.
1249 */
b2ad2123
DM
1250 assert(anchor + check_len <= (char *)end_point);
1251 if (prog->check_offset_max + check_len < targ_len) {
e0362b86
DM
1252 end_point = HOP3lim((U8*)anchor,
1253 prog->check_offset_max,
b2ad2123 1254 end_point - check_len
2ce94a86 1255 )
b2ad2123 1256 + check_len;
12453e29
YO
1257 if (end_point < start_point)
1258 goto fail_finish;
e0362b86 1259 }
d6ef1678
DM
1260 }
1261
ae5d4331 1262 check_at = fbm_instr( start_point, end_point,
7fba1cd6 1263 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 1264
6ad9a8ab 1265 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1266 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
675e93ee
DM
1267 (IV)((char*)start_point - strbeg),
1268 (IV)((char*)end_point - strbeg),
1269 (IV)(check_at ? check_at - strbeg : -1)
1270 ));
1271
8fd34720
DM
1272 /* Update the count-of-usability, remove useless subpatterns,
1273 unshift s. */
1274
1275 DEBUG_EXECUTE_r({
1276 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1277 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
6ad9a8ab 1278 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
8fd34720
DM
1279 (check_at ? "Found" : "Did not find"),
1280 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1281 ? "anchored" : "floating"),
1282 quoted,
1283 RE_SV_TAIL(check),
1284 (check_at ? " at offset " : "...\n") );
1285 });
2c2d71f5 1286
8fd34720
DM
1287 if (!check_at)
1288 goto fail_finish;
8fd34720
DM
1289 /* set rx_origin to the minimum position where the regex could start
1290 * matching, given the constraint of the just-matched check substring.
1291 * But don't set it lower than previously.
1292 */
fdc003fd 1293
8fd34720
DM
1294 if (check_at - rx_origin > prog->check_offset_max)
1295 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee 1296 /* Finish the diagnostic message */
6ad9a8ab 1297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1298 "%ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1299 (long)(check_at - strbeg),
1300 (IV)(rx_origin - strbeg)
1301 ));
8fd34720 1302 }
fdc003fd
DM
1303
1304
1a4edc3c 1305 /* now look for the 'other' substring if defined */
2c2d71f5 1306
6480a6c4
DM
1307 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1308 : prog->substrs->data[other_ix].substr)
1de06328 1309 {
30944b6d 1310 /* Take into account the "other" substring. */
6c3343a6
DM
1311 char *last, *last1;
1312 char *s;
1313 SV* must;
1314 struct reg_substr_datum *other;
1315
1316 do_other_substr:
1317 other = &prog->substrs->data[other_ix];
1318
1319 /* if "other" is anchored:
1320 * we've previously found a floating substr starting at check_at.
1321 * This means that the regex origin must lie somewhere
1322 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1323 * and max: HOP3(check_at, -check_offset_min)
1324 * (except that min will be >= strpos)
1325 * So the fixed substr must lie somewhere between
1326 * HOP3(min, anchored_offset)
1327 * HOP3(max, anchored_offset) + SvCUR(substr)
1328 */
1329
1330 /* if "other" is floating
1331 * Calculate last1, the absolute latest point where the
1332 * floating substr could start in the string, ignoring any
1333 * constraints from the earlier fixed match. It is calculated
1334 * as follows:
1335 *
1336 * strend - prog->minlen (in chars) is the absolute latest
1337 * position within the string where the origin of the regex
1338 * could appear. The latest start point for the floating
1339 * substr is float_min_offset(*) on from the start of the
1340 * regex. last1 simply combines thee two offsets.
1341 *
1342 * (*) You might think the latest start point should be
1343 * float_max_offset from the regex origin, and technically
1344 * you'd be correct. However, consider
1345 * /a\d{2,4}bcd\w/
1346 * Here, float min, max are 3,5 and minlen is 7.
1347 * This can match either
1348 * /a\d\dbcd\w/
1349 * /a\d\d\dbcd\w/
1350 * /a\d\d\d\dbcd\w/
1351 * In the first case, the regex matches minlen chars; in the
1352 * second, minlen+1, in the third, minlen+2.
1353 * In the first case, the floating offset is 3 (which equals
1354 * float_min), in the second, 4, and in the third, 5 (which
1355 * equals float_max). In all cases, the floating string bcd
1356 * can never start more than 4 chars from the end of the
1357 * string, which equals minlen - float_min. As the substring
1358 * starts to match more than float_min from the start of the
1359 * regex, it makes the regex match more than minlen chars,
1360 * and the two cancel each other out. So we can always use
1361 * float_min - minlen, rather than float_max - minlen for the
1362 * latest position in the string.
1363 *
1364 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1365 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1366 */
1367
e7a14a9c 1368 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1369 last1 = HOP3c(strend,
1370 other->min_offset - prog->minlen, strbeg);
1371
4d006249 1372 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1373 /* last is the latest point where the floating substr could
1374 * start, *given* any constraints from the earlier fixed
1375 * match. This constraint is that the floating string starts
1376 * <= float_max_offset chars from the regex origin (rx_origin).
1377 * If this value is less than last1, use it instead.
eb3831ce 1378 */
6c3343a6
DM
1379 assert(rx_origin <= last1);
1380 last =
1381 /* this condition handles the offset==infinity case, and
1382 * is a short-cut otherwise. Although it's comparing a
1383 * byte offset to a char length, it does so in a safe way,
1384 * since 1 char always occupies 1 or more bytes,
1385 * so if a string range is (last1 - rx_origin) bytes,
1386 * it will be less than or equal to (last1 - rx_origin)
1387 * chars; meaning it errs towards doing the accurate HOP3
1388 * rather than just using last1 as a short-cut */
1389 (last1 - rx_origin) < other->max_offset
1390 ? last1
1391 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1392 }
1393 else {
b2ad2123
DM
1394 assert(strpos + start_shift <= check_at);
1395 last = HOP4c(check_at, other->min_offset - start_shift,
6c3343a6
DM
1396 strbeg, strend);
1397 }
ead917d0 1398
6c3343a6
DM
1399 s = HOP3c(rx_origin, other->min_offset, strend);
1400 if (s < other_last) /* These positions already checked */
1401 s = other_last;
1402
1403 must = utf8_target ? other->utf8_substr : other->substr;
1404 assert(SvPOK(must));
675e93ee
DM
1405 {
1406 char *from = s;
1407 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1408
71a9d105
DM
1409 if (to > strend)
1410 to = strend;
88203927
DM
1411 if (from > to) {
1412 s = NULL;
6ad9a8ab 1413 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1414 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
88203927
DM
1415 (IV)(from - strbeg),
1416 (IV)(to - strbeg)
1417 ));
1418 }
1419 else {
1420 s = fbm_instr(
1421 (unsigned char*)from,
1422 (unsigned char*)to,
1423 must,
1424 multiline ? FBMrf_MULTILINE : 0
1425 );
6ad9a8ab 1426 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1427 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
88203927
DM
1428 (IV)(from - strbeg),
1429 (IV)(to - strbeg),
1430 (IV)(s ? s - strbeg : -1)
1431 ));
1432 }
675e93ee
DM
1433 }
1434
6c3343a6
DM
1435 DEBUG_EXECUTE_r({
1436 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1437 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 1438 Perl_re_printf( aTHX_ " %s %s substr %s%s",
6c3343a6
DM
1439 s ? "Found" : "Contradicts",
1440 other_ix ? "floating" : "anchored",
1441 quoted, RE_SV_TAIL(must));
1442 });
ead917d0 1443
ead917d0 1444
6c3343a6
DM
1445 if (!s) {
1446 /* last1 is latest possible substr location. If we didn't
1447 * find it before there, we never will */
1448 if (last >= last1) {
6ad9a8ab 1449 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee 1450 "; giving up...\n"));
6c3343a6 1451 goto fail_finish;
ead917d0
DM
1452 }
1453
6c3343a6
DM
1454 /* try to find the check substr again at a later
1455 * position. Maybe next time we'll find the "other" substr
1456 * in range too */
6c3343a6
DM
1457 other_last = HOP3c(last, 1, strend) /* highest failure */;
1458 rx_origin =
4d006249 1459 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1460 ? HOP3c(rx_origin, 1, strend)
1461 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
6ad9a8ab 1462 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1463 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1464 (other_ix ? "floating" : "anchored"),
1465 (long)(HOP3c(check_at, 1, strend) - strbeg),
1466 (IV)(rx_origin - strbeg)
1467 ));
6c3343a6
DM
1468 goto restart;
1469 }
1470 else {
4d006249 1471 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1472 /* other_last is set to s, not s+1, since its possible for
1473 * a floating substr to fail first time, then succeed
1474 * second time at the same floating position; e.g.:
1475 * "-AB--AABZ" =~ /\wAB\d*Z/
1476 * The first time round, anchored and float match at
1477 * "-(AB)--AAB(Z)" then fail on the initial \w character
1478 * class. Second time round, they match at "-AB--A(AB)(Z)".
1479 */
1480 other_last = s;
ead917d0
DM
1481 }
1482 else {
6c3343a6
DM
1483 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1484 other_last = HOP3c(s, 1, strend);
ead917d0 1485 }
6ad9a8ab 1486 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1487 " at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1488 (long)(s - strbeg),
1489 (IV)(rx_origin - strbeg)
1490 ));
1491
6c3343a6 1492 }
cad2e5aa 1493 }
acba93e8
DM
1494 else {
1495 DEBUG_OPTIMISE_MORE_r(
6ad9a8ab 1496 Perl_re_printf( aTHX_
147e3846
KW
1497 " Check-only match: offset min:%" IVdf " max:%" IVdf
1498 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1499 " strend:%" IVdf "\n",
acba93e8
DM
1500 (IV)prog->check_offset_min,
1501 (IV)prog->check_offset_max,
675e93ee
DM
1502 (IV)(check_at-strbeg),
1503 (IV)(rx_origin-strbeg),
1c1c599d 1504 (IV)(rx_origin-check_at),
675e93ee 1505 (IV)(strend-strbeg)
acba93e8
DM
1506 )
1507 );
1508 }
2c2d71f5 1509
acba93e8 1510 postprocess_substr_matches:
0991020e 1511
1a4edc3c 1512 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1513
7d2d37f5 1514 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1515 char *s;
1516
6ad9a8ab 1517 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a62659bd 1518 " looking for /^/m anchor"));
d0880ea7
DM
1519
1520 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1521 * Find the next \n, if any, even if it's beyond the current
1522 * anchored and/or floating substrings. Whether we should be
1523 * scanning ahead for the next \n or the next substr is debatable.
1524 * On the one hand you'd expect rare substrings to appear less
1525 * often than \n's. On the other hand, searching for \n means
675e93ee 1526 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1527 * iteration as the current "rarest" string candidate, which
1528 * means for example that we'll quickly reject the whole string if
1529 * hasn't got a \n, rather than trying every substr position
1530 * first
1531 */
d0880ea7 1532
4620cb61
DM
1533 s = HOP3c(strend, - prog->minlen, strpos);
1534 if (s <= rx_origin ||
1535 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1536 {
6ad9a8ab 1537 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d0880ea7
DM
1538 " Did not find /%s^%s/m...\n",
1539 PL_colors[0], PL_colors[1]));
a62659bd
DM
1540 goto fail_finish;
1541 }
d0880ea7 1542
4ada1233
DM
1543 /* earliest possible origin is 1 char after the \n.
1544 * (since *rx_origin == '\n', it's safe to ++ here rather than
1545 * HOP(rx_origin, 1)) */
1546 rx_origin++;
d0880ea7 1547
f4f115de 1548 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1549 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1550 {
d0880ea7
DM
1551 /* Position contradicts check-string; either because
1552 * check was anchored (and thus has no wiggle room),
4ada1233 1553 * or check was float and rx_origin is above the float range */
6ad9a8ab 1554 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1555 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1556 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1557 goto restart;
1558 }
1559
1560 /* if we get here, the check substr must have been float,
2e759faa 1561 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1562 * "other" substr which still contradicts */
1563 assert(prog->substrs->check_ix); /* check is float */
1564
1565 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1566 /* whoops, the anchored "other" substr exists, so we still
1567 * contradict. On the other hand, the float "check" substr
1568 * didn't contradict, so just retry the anchored "other"
1569 * substr */
6ad9a8ab 1570 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1571 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
d0880ea7 1572 PL_colors[0], PL_colors[1],
73e8ff00
DM
1573 (IV)(rx_origin - strbeg + prog->anchored_offset),
1574 (IV)(rx_origin - strbeg)
675e93ee 1575 ));
d0880ea7
DM
1576 goto do_other_substr;
1577 }
1578
1579 /* success: we don't contradict the found floating substring
1580 * (and there's no anchored substr). */
6ad9a8ab 1581 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1582 " Found /%s^%s/m with rx_origin %ld...\n",
1583 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1584 }
1585 else {
6ad9a8ab 1586 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
fe4f3442 1587 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1588 }
1589
ffad1e6a 1590 success_at_start:
e3c6feb0 1591
cad2e5aa 1592
dd170ff5
DM
1593 /* if we have a starting character class, then test that extra constraint.
1594 * (trie stclasses are too expensive to use here, we are better off to
1595 * leave it to regmatch itself) */
1596
f8fc2ecf 1597 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1598 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1599
b2ad2123
DM
1600 /* XXX this value could be pre-computed */
1601 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1602 ? (reginfo->is_utf8_pat
1603 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1604 : STR_LEN(progi->regstclass))
66e933ab 1605 : 1);
1de06328 1606 char * endpos;
fa3bb21d 1607 char *s;
000dfd2d
DM
1608 /* latest pos that a matching float substr constrains rx start to */
1609 char *rx_max_float = NULL;
1610
c75a3985
DM
1611 /* if the current rx_origin is anchored, either by satisfying an
1612 * anchored substring constraint, or a /^.../m constraint, then we
1613 * can reject the current origin if the start class isn't found
1614 * at the current position. If we have a float-only match, then
1615 * rx_origin is constrained to a range; so look for the start class
1616 * in that range. if neither, then look for the start class in the
1617 * whole rest of the string */
1618
dd170ff5
DM
1619 /* XXX DAPM it's not clear what the minlen test is for, and why
1620 * it's not used in the floating case. Nothing in the test suite
1621 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1622 * Here are some old comments, which may or may not be correct:
1623 *
1624 * minlen == 0 is possible if regstclass is \b or \B,
1625 * and the fixed substr is ''$.
1626 * Since minlen is already taken into account, rx_origin+1 is
1627 * before strend; accidentally, minlen >= 1 guaranties no false
1628 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1629 * 0) below assumes that regstclass does not come from lookahead...
1630 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1631 * This leaves EXACTF-ish only, which are dealt with in
1632 * find_byclass().
1633 */
1634
7d2d37f5 1635 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
b2ad2123 1636 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d 1637 else if (prog->float_substr || prog->float_utf8) {
b2ad2123
DM
1638 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1639 endpos = HOP3clim(rx_max_float, cl_l, strend);
000dfd2d 1640 }
1de06328
YO
1641 else
1642 endpos= strend;
1643
6ad9a8ab 1644 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
b2ad2123 1645 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
147e3846 1646 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
b2ad2123 1647 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1648 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1649
c43b5520 1650 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1651 reginfo);
be778b1a 1652 if (!s) {
6eb5f6b9 1653 if (endpos == strend) {
6ad9a8ab 1654 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1655 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1656 goto fail;
1657 }
6ad9a8ab 1658 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1659 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1660 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1661 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1662 goto fail;
9fed8d02 1663
6eb5f6b9 1664 /* Contradict one of substrings */
97136c8a
DM
1665 if (prog->anchored_substr || prog->anchored_utf8) {
1666 if (prog->substrs->check_ix == 1) { /* check is float */
1667 /* Have both, check_string is floating */
b2ad2123
DM
1668 assert(rx_origin + start_shift <= check_at);
1669 if (rx_origin + start_shift != check_at) {
97136c8a 1670 /* not at latest position float substr could match:
c75a3985
DM
1671 * Recheck anchored substring, but not floating.
1672 * The condition above is in bytes rather than
1673 * chars for efficiency. It's conservative, in
1674 * that it errs on the side of doing 'goto
88203927
DM
1675 * do_other_substr'. In this case, at worst,
1676 * an extra anchored search may get done, but in
1677 * practice the extra fbm_instr() is likely to
1678 * get skipped anyway. */
6ad9a8ab 1679 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1680 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1681 (long)(other_last - strbeg),
1682 (IV)(rx_origin - strbeg)
1683 ));
97136c8a 1684 goto do_other_substr;
3369914b 1685 }
3369914b
DM
1686 }
1687 }
97136c8a 1688 else {
9fed8d02
DM
1689 /* float-only */
1690
7d2d37f5 1691 if (ml_anch) {
c75a3985
DM
1692 /* In the presence of ml_anch, we might be able to
1693 * find another \n without breaking the current float
1694 * constraint. */
1695
1696 /* strictly speaking this should be HOP3c(..., 1, ...),
1697 * but since we goto a block of code that's going to
1698 * search for the next \n if any, its safe here */
9fed8d02 1699 rx_origin++;
6ad9a8ab 1700 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
675e93ee 1701 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1702 PL_colors[0], PL_colors[1],
675e93ee 1703 (long)(rx_origin - strbeg)) );
9fed8d02 1704 goto postprocess_substr_matches;
ab60c45a 1705 }
c75a3985
DM
1706
1707 /* strictly speaking this can never be true; but might
1708 * be if we ever allow intuit without substrings */
1709 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1710 goto fail;
c75a3985 1711
000dfd2d 1712 rx_origin = rx_max_float;
9fed8d02
DM
1713 }
1714
c75a3985
DM
1715 /* at this point, any matching substrings have been
1716 * contradicted. Start again... */
1717
9fed8d02 1718 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1719
1720 /* uses bytes rather than char calculations for efficiency.
1721 * It's conservative: it errs on the side of doing 'goto restart',
1722 * where there is code that does a proper char-based test */
b2ad2123 1723 if (rx_origin + start_shift + end_shift > strend) {
6ad9a8ab 1724 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
9fed8d02
DM
1725 " Could not match STCLASS...\n") );
1726 goto fail;
1727 }
6ad9a8ab 1728 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1729 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
9fed8d02 1730 (prog->substrs->check_ix ? "floating" : "anchored"),
b2ad2123 1731 (long)(rx_origin + start_shift - strbeg),
675e93ee
DM
1732 (IV)(rx_origin - strbeg)
1733 ));
9fed8d02 1734 goto restart;
6eb5f6b9 1735 }
9fed8d02 1736
c75a3985
DM
1737 /* Success !!! */
1738
5f9c6575 1739 if (rx_origin != s) {
6ad9a8ab 1740 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1741 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1742 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1743 );
1744 }
1745 else {
6ad9a8ab 1746 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1747 " Does not contradict STCLASS...\n");
b7953727
JH
1748 );
1749 }
6eb5f6b9 1750 }
ffad1e6a
DM
1751
1752 /* Decide whether using the substrings helped */
1753
1754 if (rx_origin != strpos) {
1755 /* Fixed substring is found far enough so that the match
1756 cannot start at strpos. */
1757
6ad9a8ab 1758 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
ffad1e6a
DM
1759 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1760 }
1761 else {
70563e16
DM
1762 /* The found rx_origin position does not prohibit matching at
1763 * strpos, so calling intuit didn't gain us anything. Decrement
1764 * the BmUSEFUL() count on the check substring, and if we reach
1765 * zero, free it. */
1766 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1767 && (utf8_target ? (
1768 prog->check_utf8 /* Could be deleted already */
1769 && --BmUSEFUL(prog->check_utf8) < 0
1770 && (prog->check_utf8 == prog->float_utf8)
1771 ) : (
1772 prog->check_substr /* Could be deleted already */
1773 && --BmUSEFUL(prog->check_substr) < 0
1774 && (prog->check_substr == prog->float_substr)
1775 )))
1776 {
1777 /* If flags & SOMETHING - do not do it many times on the same match */
6ad9a8ab 1778 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
ffad1e6a
DM
1779 /* XXX Does the destruction order has to change with utf8_target? */
1780 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1781 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1782 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1783 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1784 check = NULL; /* abort */
ffad1e6a
DM
1785 /* XXXX This is a remnant of the old implementation. It
1786 looks wasteful, since now INTUIT can use many
1787 other heuristics. */
1788 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1789 }
1790 }
1791
6ad9a8ab 1792 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
ffad1e6a 1793 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1794 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1795
c765d6e0 1796 return rx_origin;
2c2d71f5
JH
1797
1798 fail_finish: /* Substring not found */
33b8afdf 1799 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1800 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1801 fail:
6ad9a8ab 1802 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
e4584336 1803 PL_colors[4], PL_colors[5]));
bd61b366 1804 return NULL;
cad2e5aa 1805}
9661b544 1806
70563e16 1807
a0a388a1 1808#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1809 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78 1810 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
3ed3004a 1811 trie_utf8l, trie_flu8, trie_flu8_latin } \
e7fd4aa1
KW
1812 trie_type = ((scan->flags == EXACT) \
1813 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1814 : (scan->flags == EXACTL) \
1815 ? (utf8_target ? trie_utf8l : trie_plain) \
89829bb5 1816 : (scan->flags == EXACTFAA) \
a4525e78
KW
1817 ? (utf8_target \
1818 ? trie_utf8_exactfa_fold \
1819 : trie_latin_utf8_exactfa_fold) \
1820 : (scan->flags == EXACTFLU8 \
3ed3004a
YO
1821 ? (utf8_target \
1822 ? trie_flu8 \
1823 : trie_flu8_latin) \
a4525e78
KW
1824 : (utf8_target \
1825 ? trie_utf8_fold \
3ed3004a 1826 : trie_latin_utf8_fold)))
fab2782b 1827
9ad8cac4
KW
1828/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1829 * 'foldbuf+sizeof(foldbuf)' */
1830#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1831STMT_START { \
fab2782b 1832 STRLEN skiplen; \
baa60164 1833 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1834 switch (trie_type) { \
a4525e78 1835 case trie_flu8: \
780fcc9f 1836 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3ed3004a 1837 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
e1a2878a 1838 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
613abc6d 1839 } \
a4525e78 1840 goto do_trie_utf8_fold; \
31f05a37 1841 case trie_utf8_exactfa_fold: \
baa60164 1842 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1843 /* FALLTHROUGH */ \
fab2782b 1844 case trie_utf8_fold: \
a4525e78 1845 do_trie_utf8_fold: \
fab2782b 1846 if ( foldlen>0 ) { \
9ad8cac4 1847 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
fab2782b
YO
1848 foldlen -= len; \
1849 uscan += len; \
1850 len=0; \
1851 } else { \
9ad8cac4 1852 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
a1a5ec35 1853 flags); \
9ad8cac4 1854 len = UTF8SKIP(uc); \
5f560d8a 1855 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1856 foldlen -= skiplen; \
1857 uscan = foldbuf + skiplen; \
1858 } \
1859 break; \
3ed3004a
YO
1860 case trie_flu8_latin: \
1861 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1862 goto do_trie_latin_utf8_fold; \
baa60164
KW
1863 case trie_latin_utf8_exactfa_fold: \
1864 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1865 /* FALLTHROUGH */ \
fab2782b 1866 case trie_latin_utf8_fold: \
3ed3004a 1867 do_trie_latin_utf8_fold: \
fab2782b 1868 if ( foldlen>0 ) { \
9ad8cac4 1869 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
fab2782b
YO
1870 foldlen -= len; \
1871 uscan += len; \
1872 len=0; \
1873 } else { \
1874 len = 1; \
31f05a37 1875 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1876 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1877 foldlen -= skiplen; \
1878 uscan = foldbuf + skiplen; \
1879 } \
1880 break; \
a4525e78 1881 case trie_utf8l: \
780fcc9f 1882 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1883 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1884 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1885 } \
780fcc9f 1886 /* FALLTHROUGH */ \
fab2782b 1887 case trie_utf8: \
9ad8cac4 1888 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
fab2782b
YO
1889 break; \
1890 case trie_plain: \
1891 uvc = (UV)*uc; \
1892 len = 1; \
1893 } \
1894 if (uvc < 256) { \
1895 charid = trie->charmap[ uvc ]; \
1896 } \
1897 else { \
1898 charid = 0; \
1899 if (widecharmap) { \
1900 SV** const svpp = hv_fetch(widecharmap, \
1901 (char*)&uvc, sizeof(UV), 0); \
1902 if (svpp) \
1903 charid = (U16)SvIV(*svpp); \
1904 } \
1905 } \
4cadc6a9
YO
1906} STMT_END
1907
cb41e5d6 1908#define DUMP_EXEC_POS(li,s,doutf8,depth) \
ae7c5b9b 1909 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
cb41e5d6 1910 startpos, doutf8, depth)
ae7c5b9b 1911
da10aa09
KW
1912#define REXEC_FBC_SCAN(UTF8, CODE) \
1913 STMT_START { \
1914 while (s < strend) { \
1915 CODE \
1916 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1917 } \
1918 } STMT_END
4cadc6a9 1919
d990bd30
KW
1920#define REXEC_FBC_CLASS_SCAN(UTF8, COND) \
1921 STMT_START { \
1922 while (s < strend) { \
1923 REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
1924 } \
1925 } STMT_END
4cadc6a9 1926
d990bd30 1927#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
05bd126c 1928 if (COND) { \
19719145 1929 FBC_CHECK_AND_TRY \
21d1ed54
KW
1930 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1931 previous_occurrence_end = s; \
05bd126c 1932 } \
21d1ed54
KW
1933 else { \
1934 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1935 }
4cadc6a9 1936
c84a03c5 1937#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1938 if (utf8_target) { \
da10aa09 1939 REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \
e1d1eefb
YO
1940 } \
1941 else { \
da10aa09 1942 REXEC_FBC_CLASS_SCAN(0, COND); \
d981ef24 1943 }
05bd126c 1944
a9448551
KW
1945/* We keep track of where the next character should start after an occurrence
1946 * of the one we're looking for. Knowing that, we can see right away if the
1947 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1948 * don't accept the 2nd and succeeding adjacent occurrences */
19719145
KW
1949#define FBC_CHECK_AND_TRY \
1950 if ( ( doevery \
1951 || s != previous_occurrence_end) \
1952 && (reginfo->intuit || regtry(reginfo, &s))) \
1953 { \
1954 goto got_it; \
1955 }
1956
a9448551
KW
1957
1958/* This differs from the above macros in that it calls a function which returns
1959 * the next occurrence of the thing being looked for in 's'; and 'strend' if
1960 * there is no such occurrence. */
1961#define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \
1962 while (s < strend) { \
2b1f9c71 1963 s = (f); \
a9448551
KW
1964 if (s >= strend) { \
1965 break; \
1966 } \
1967 \
1968 FBC_CHECK_AND_TRY \
1969 s += (UTF8) ? UTF8SKIP(s) : 1; \
1970 previous_occurrence_end = s; \
1971 }
1972
05bd126c
KW
1973/* The three macros below are slightly different versions of the same logic.
1974 *
1975 * The first is for /a and /aa when the target string is UTF-8. This can only
1976 * match ascii, but it must advance based on UTF-8. The other two handle the
1977 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1978 * for the boundary (or non-boundary) between a word and non-word character.
1979 * The utf8 and non-utf8 cases have the same logic, but the details must be
1980 * different. Find the "wordness" of the character just prior to this one, and
1981 * compare it with the wordness of this one. If they differ, we have a
1982 * boundary. At the beginning of the string, pretend that the previous
1983 * character was a new-line.
1984 *
1985 * All these macros uncleanly have side-effects with each other and outside
1986 * variables. So far it's been too much trouble to clean-up
1987 *
1988 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1989 * a word character or not.
1990 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1991 * word/non-word
1992 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1993 *
1994 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1995 * are looking for a boundary or for a non-boundary. If we are looking for a
1996 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1997 * see if this tentative match actually works, and if so, to quit the loop
1998 * here. And vice-versa if we are looking for a non-boundary.
1999 *
2000 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
da10aa09 2001 * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
05bd126c
KW
2002 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
2003 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2004 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2005 * complement. But in that branch we complement tmp, meaning that at the
2006 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2007 * which means at the top of the loop in the next iteration, it is
2008 * TEST_NON_UTF8(s-1) */
b2f4e957 2009#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
2010 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2011 tmp = TEST_NON_UTF8(tmp); \
da10aa09 2012 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
05bd126c
KW
2013 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
2014 tmp = !tmp; \
2015 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
2016 } \
2017 else { \
2018 IF_FAIL; \
2019 } \
2020 ); \
2021
2022/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2023 * TEST_UTF8 is a macro that for the same input code points returns identically
2024 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 2025#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
2026 if (s == reginfo->strbeg) { \
2027 tmp = '\n'; \
2028 } \
2029 else { /* Back-up to the start of the previous character */ \
2030 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
2031 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 2032 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
2033 } \
2034 tmp = TEST_UV(tmp); \
da10aa09 2035 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
7a207065 2036 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
05bd126c
KW
2037 tmp = !tmp; \
2038 IF_SUCCESS; \
2039 } \
2040 else { \
2041 IF_FAIL; \
2042 } \
2043 );
cfaf538b 2044
05bd126c
KW
2045/* Like the above two macros. UTF8_CODE is the complete code for handling
2046 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2047 * macros below */
baa60164 2048#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 2049 if (utf8_target) { \
05bd126c 2050 UTF8_CODE \
63ac0dad
KW
2051 } \
2052 else { /* Not utf8 */ \
9d9163fb 2053 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 2054 tmp = TEST_NON_UTF8(tmp); \
da10aa09 2055 REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \
63ac0dad 2056 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 2057 IF_SUCCESS; \
760cfa8e 2058 tmp = !tmp; \
63ac0dad
KW
2059 } \
2060 else { \
2061 IF_FAIL; \
2062 } \
2063 ); \
2064 } \
c8519dc7
KW
2065 /* Here, things have been set up by the previous code so that tmp is the \
2066 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
2067 * utf8ness of the target). We also have to check if this matches against \
2068 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
2069 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
2070 * string */ \
2071 if (tmp == ! TEST_NON_UTF8('\n')) { \
2072 IF_SUCCESS; \
2073 } \
2074 else { \
2075 IF_FAIL; \
2076 }
63ac0dad 2077
ae7c5b9b
KW
2078/* This is the macro to use when we want to see if something that looks like it
2079 * could match, actually does, and if so exits the loop */
2080#define REXEC_FBC_TRYIT \
2081 if ((reginfo->intuit || regtry(reginfo, &s))) \
2082 goto got_it
2083
2084/* The only difference between the BOUND and NBOUND cases is that
2085 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2086 * NBOUND. This is accomplished by passing it as either the if or else clause,
2087 * with the other one being empty (PLACEHOLDER is defined as empty).
2088 *
2089 * The TEST_FOO parameters are for operating on different forms of input, but
2090 * all should be ones that return identically for the same underlying code
2091 * points */
2092#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2093 FBC_BOUND_COMMON( \
2094 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2095 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2096
44129e46 2097#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
2098 FBC_BOUND_COMMON( \
2099 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2100 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2101
2102#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2103 FBC_BOUND_COMMON( \
2104 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2105 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2106
44129e46 2107#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
2108 FBC_BOUND_COMMON( \
2109 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2110 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2111
8bde5eaf
JH
2112#ifdef DEBUGGING
2113static IV
2114S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
9ffbcbcf 2115 IV cp_out = _invlist_search(invlist, cp_in);
8bde5eaf
JH
2116 assert(cp_out >= 0);
2117 return cp_out;
2118}
2119# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2120 invmap[S_get_break_val_cp_checked(invlist, cp)]
2121#else
2122# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2123 invmap[_invlist_search(invlist, cp)]
2124#endif
2125
64935bc6
KW
2126/* Takes a pointer to an inversion list, a pointer to its corresponding
2127 * inversion map, and a code point, and returns the code point's value
2128 * according to the two arrays. It assumes that all code points have a value.
2129 * This is used as the base macro for macros for particular properties */
2130#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
8bde5eaf 2131 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
64935bc6
KW
2132
2133/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2134 * of a code point, returning the value for the first code point in the string.
2135 * And it takes the particular macro name that finds the desired value given a
2136 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2137#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2138 (__ASSERT_(pos < strend) \
2139 /* Note assumes is valid UTF-8 */ \
2140 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2141
2142/* Returns the GCB value for the input code point */
2143#define getGCB_VAL_CP(cp) \
2144 _generic_GET_BREAK_VAL_CP( \
2145 PL_GCB_invlist, \
02f811dd 2146 _Perl_GCB_invmap, \
64935bc6
KW
2147 (cp))
2148
2149/* Returns the GCB value for the first code point in the UTF-8 encoded string
2150 * bounded by pos and strend */
2151#define getGCB_VAL_UTF8(pos, strend) \
2152 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 2153
6b659339
KW
2154/* Returns the LB value for the input code point */
2155#define getLB_VAL_CP(cp) \
2156 _generic_GET_BREAK_VAL_CP( \
2157 PL_LB_invlist, \
2158 _Perl_LB_invmap, \
2159 (cp))
2160
2161/* Returns the LB value for the first code point in the UTF-8 encoded string
2162 * bounded by pos and strend */
2163#define getLB_VAL_UTF8(pos, strend) \
2164 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2165
06ae2722
KW
2166
2167/* Returns the SB value for the input code point */
2168#define getSB_VAL_CP(cp) \
2169 _generic_GET_BREAK_VAL_CP( \
2170 PL_SB_invlist, \
bf4268fa 2171 _Perl_SB_invmap, \
06ae2722
KW
2172 (cp))
2173
2174/* Returns the SB value for the first code point in the UTF-8 encoded string
2175 * bounded by pos and strend */
2176#define getSB_VAL_UTF8(pos, strend) \
2177 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2178
ae3bb8ea
KW
2179/* Returns the WB value for the input code point */
2180#define getWB_VAL_CP(cp) \
2181 _generic_GET_BREAK_VAL_CP( \
2182 PL_WB_invlist, \
bf4268fa 2183 _Perl_WB_invmap, \
ae3bb8ea
KW
2184 (cp))
2185
2186/* Returns the WB value for the first code point in the UTF-8 encoded string
2187 * bounded by pos and strend */
2188#define getWB_VAL_UTF8(pos, strend) \
2189 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2190
786e8c11 2191/* We know what class REx starts with. Try to find this position... */
02d5137b 2192/* if reginfo->intuit, its a dryrun */
786e8c11
YO
2193/* annoyingly all the vars in this routine have different names from their counterparts
2194 in regmatch. /grrr */
3c3eec57 2195STATIC char *
07be1b83 2196S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 2197 const char *strend, regmatch_info *reginfo)
a687059c 2198{
73104a1b 2199 dVAR;
a9448551
KW
2200
2201 /* TRUE if x+ need not match at just the 1st pos of run of x's */
73104a1b 2202 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
a9448551 2203
73104a1b
KW
2204 char *pat_string; /* The pattern's exactish string */
2205 char *pat_end; /* ptr to end char of pat_string */
2206 re_fold_t folder; /* Function for computing non-utf8 folds */
2207 const U8 *fold_array; /* array for folding ords < 256 */
2208 STRLEN ln;
2209 STRLEN lnc;
73104a1b
KW
2210 U8 c1;
2211 U8 c2;
2212 char *e;
21d1ed54
KW
2213
2214 /* In some cases we accept only the first occurence of 'x' in a sequence of
2215 * them. This variable points to just beyond the end of the previous
2216 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2217 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2218 * hop back.) */
2219 char * previous_occurrence_end = 0;
2220
3b6c52ce 2221 I32 tmp; /* Scratch variable */
ba44c216 2222 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 2223 UV utf8_fold_flags = 0;
f9176b44 2224 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
2225 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2226 with a result inverts that result, as 0^1 =
2227 1 and 1^1 = 0 */
2228 _char_class_number classnum;
2229
73104a1b 2230 RXi_GET_DECL(prog,progi);
2f7f8cb1 2231
73104a1b 2232 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 2233
73104a1b
KW
2234 /* We know what class it must start with. */
2235 switch (OP(c)) {
3edce4f5 2236 case ANYOFPOSIXL:
a4525e78 2237 case ANYOFL:
780fcc9f 2238 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 2239
d1c40ef5 2240 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
a0bd1a30
KW
2241 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2242 }
2243
780fcc9f 2244 /* FALLTHROUGH */
ac44c12e 2245 case ANYOFD:
73104a1b
KW
2246 case ANYOF:
2247 if (utf8_target) {
da10aa09 2248 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
3db24e1e 2249 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b 2250 }
1451f692 2251 else if (ANYOF_FLAGS(c)) {
da10aa09 2252 REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
1451f692 2253 }
73104a1b 2254 else {
da10aa09 2255 REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
73104a1b
KW
2256 }
2257 break;
73104a1b 2258
2813d4ad
KW
2259 case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */
2260 /* UTF-8ness doesn't matter, so use 0 */
2261 REXEC_FBC_FIND_NEXT_SCAN(0,
12475f8b
KW
2262 (char *) find_next_masked((U8 *) s, (U8 *) strend,
2263 (U8) ARG(c), FLAGS(c)));
2813d4ad
KW
2264 break;
2265
89829bb5 2266 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
098b07d5 2267 assert(! is_utf8_pat);
924ba076 2268 /* FALLTHROUGH */
89829bb5 2269 case EXACTFAA:
984e6dd1 2270 if (is_utf8_pat || utf8_target) {
73104a1b
KW
2271 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2272 goto do_exactf_utf8;
2273 }
2274 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
2275 folder = foldEQ_latin1; /* /a, except the sharp s one which */
2276 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 2277
2fdb7295
KW
2278 case EXACTF: /* This node only generated for non-utf8 patterns */
2279 assert(! is_utf8_pat);
73104a1b 2280 if (utf8_target) {
73104a1b
KW
2281 utf8_fold_flags = 0;
2282 goto do_exactf_utf8;
2283 }
2284 fold_array = PL_fold;
2285 folder = foldEQ;
2286 goto do_exactf_non_utf8;
2287
2288 case EXACTFL:
780fcc9f 2289 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 2290 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 2291 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
2292 goto do_exactf_utf8;
2293 }
2294 fold_array = PL_fold_locale;
2295 folder = foldEQ_locale;
2296 goto do_exactf_non_utf8;
3c760661 2297
73104a1b 2298 case EXACTFU_SS:
984e6dd1 2299 if (is_utf8_pat) {
73104a1b
KW
2300 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2301 }
2302 goto do_exactf_utf8;
16d951b7 2303
a4525e78
KW
2304 case EXACTFLU8:
2305 if (! utf8_target) { /* All code points in this node require
2306 UTF-8 to express. */
2307 break;
2308 }
613abc6d
KW
2309 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2310 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
2311 goto do_exactf_utf8;
2312
73104a1b 2313 case EXACTFU:
984e6dd1
DM
2314 if (is_utf8_pat || utf8_target) {
2315 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
2316 goto do_exactf_utf8;
2317 }
fac1af77 2318
73104a1b
KW
2319 /* Any 'ss' in the pattern should have been replaced by regcomp,
2320 * so we don't have to worry here about this single special case
2321 * in the Latin1 range */
2322 fold_array = PL_fold_latin1;
2323 folder = foldEQ_latin1;
2324
924ba076 2325 /* FALLTHROUGH */
73104a1b 2326
c52b8b12 2327 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
2328 are no glitches with fold-length differences
2329 between the target string and pattern */
2330
2331 /* The idea in the non-utf8 EXACTF* cases is to first find the
2332 * first character of the EXACTF* node and then, if necessary,
2333 * case-insensitively compare the full text of the node. c1 is the
2334 * first character. c2 is its fold. This logic will not work for
2335 * Unicode semantics and the german sharp ss, which hence should
2336 * not be compiled into a node that gets here. */
2337 pat_string = STRING(c);
2338 ln = STR_LEN(c); /* length to match in octets/bytes */
2339
2340 /* We know that we have to match at least 'ln' bytes (which is the
2341 * same as characters, since not utf8). If we have to match 3
2342 * characters, and there are only 2 availabe, we know without
2343 * trying that it will fail; so don't start a match past the
2344 * required minimum number from the far end */
ea3daa5d 2345 e = HOP3c(strend, -((SSize_t)ln), s);
dda01918
HS
2346 if (e < s)
2347 break;
fac1af77 2348
73104a1b
KW
2349 c1 = *pat_string;
2350 c2 = fold_array[c1];
2351 if (c1 == c2) { /* If char and fold are the same */
c05cc3b6
KW
2352 while (s <= e) {
2353 s = (char *) memchr(s, c1, e + 1 - s);
2354 if (s == NULL) {
2355 break;
2356 }
2357
2358 /* Check that the rest of the node matches */
2359 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2360 && (reginfo->intuit || regtry(reginfo, &s)) )
2361 {
2362 goto got_it;
2363 }
2364 s++;
2365 }
73104a1b
KW
2366 }
2367 else {
c05cc3b6
KW
2368 U8 bits_differing = c1 ^ c2;
2369
2370 /* If the folds differ in one bit position only, we can mask to
2371 * match either of them, and can use this faster find method. Both
2372 * ASCII and EBCDIC tend to have their case folds differ in only
2373 * one position, so this is very likely */
2374 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2375 bits_differing = ~ bits_differing;
2376 while (s <= e) {
2b1f9c71 2377 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
c05cc3b6
KW
2378 (c1 & bits_differing), bits_differing);
2379 if (s > e) {
2380 break;
2381 }
2382
2383 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2384 && (reginfo->intuit || regtry(reginfo, &s)) )
2385 {
2386 goto got_it;
2387 }
2388 s++;
2389 }
2390 }
2391 else { /* Otherwise, stuck with looking byte-at-a-time. This
2392 should actually happen only in EXACTFL nodes */
2393 while (s <= e) {
2394 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2395 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2396 && (reginfo->intuit || regtry(reginfo, &s)) )
2397 {
2398 goto got_it;
2399 }
2400 s++;
2401 }
2402 }
73104a1b
KW
2403 }
2404 break;
fac1af77 2405
c52b8b12
KW
2406 do_exactf_utf8:
2407 {
73104a1b
KW
2408 unsigned expansion;
2409
2410 /* If one of the operands is in utf8, we can't use the simpler folding
2411 * above, due to the fact that many different characters can have the
2412 * same fold, or portion of a fold, or different- length fold */
2413 pat_string = STRING(c);
2414 ln = STR_LEN(c); /* length to match in octets/bytes */
2415 pat_end = pat_string + ln;
984e6dd1 2416 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
2417 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2418 : ln;
2419
2420 /* We have 'lnc' characters to match in the pattern, but because of
2421 * multi-character folding, each character in the target can match
2422 * up to 3 characters (Unicode guarantees it will never exceed
2423 * this) if it is utf8-encoded; and up to 2 if not (based on the
2424 * fact that the Latin 1 folds are already determined, and the
2425 * only multi-char fold in that range is the sharp-s folding to
2426 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2427 * string character. Adjust lnc accordingly, rounding up, so that
2428 * if we need to match at least 4+1/3 chars, that really is 5. */
2429 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2430 lnc = (lnc + expansion - 1) / expansion;
2431
2432 /* As in the non-UTF8 case, if we have to match 3 characters, and
2433 * only 2 are left, it's guaranteed to fail, so don't start a
2434 * match that would require us to go beyond the end of the string
2435 */
ea3daa5d 2436 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 2437
73104a1b
KW
2438 /* XXX Note that we could recalculate e to stop the loop earlier,
2439 * as the worst case expansion above will rarely be met, and as we
2440 * go along we would usually find that e moves further to the left.
2441 * This would happen only after we reached the point in the loop
2442 * where if there were no expansion we should fail. Unclear if
2443 * worth the expense */
2444
2445 while (s <= e) {
2446 char *my_strend= (char *)strend;
2447 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 2448 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 2449 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
2450 {
2451 goto got_it;
2452 }
2453 s += (utf8_target) ? UTF8SKIP(s) : 1;
2454 }
2455 break;
2456 }
236d82fd 2457
73104a1b 2458 case BOUNDL:
780fcc9f 2459 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2460 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2461 if (! IN_UTF8_CTYPE_LOCALE) {
2462 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2463 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2464 }
64935bc6
KW
2465 goto do_boundu;
2466 }
2467
7a207065 2468 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2469 break;
64935bc6 2470
73104a1b 2471 case NBOUNDL:
780fcc9f 2472 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2473 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2474 if (! IN_UTF8_CTYPE_LOCALE) {
2475 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2476 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2477 }
64935bc6
KW
2478 goto do_nboundu;
2479 }
2480
7a207065 2481 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2482 break;
64935bc6
KW
2483
2484 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2485 meaning */
2486 assert(FLAGS(c) == TRADITIONAL_BOUND);
2487
7a207065 2488 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2489 break;
64935bc6
KW
2490
2491 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2492 meaning */
2493 assert(FLAGS(c) == TRADITIONAL_BOUND);
2494
44129e46 2495 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2496 break;
64935bc6
KW
2497
2498 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2499 meaning */
2500 assert(FLAGS(c) == TRADITIONAL_BOUND);
2501
7a207065 2502 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2503 break;
64935bc6
KW
2504
2505 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2506 meaning */
2507 assert(FLAGS(c) == TRADITIONAL_BOUND);
2508
44129e46 2509 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2510 break;
64935bc6 2511
73104a1b 2512 case NBOUNDU:
64935bc6 2513 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
7a207065 2514 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2515 break;
2516 }
2517
2518 do_nboundu:
2519
2520 to_complement = 1;
2521 /* FALLTHROUGH */
2522
2523 case BOUNDU:
2524 do_boundu:
2525 switch((bound_type) FLAGS(c)) {
2526 case TRADITIONAL_BOUND:
7a207065 2527 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2528 break;
2529 case GCB_BOUND:
a7a8bd1e 2530 if (s == reginfo->strbeg) {
67481c39 2531 if (reginfo->intuit || regtry(reginfo, &s))
64935bc6
KW
2532 {
2533 goto got_it;
2534 }
a7a8bd1e
KW
2535
2536 /* Didn't match. Try at the next position (if there is one) */
64935bc6 2537 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2538 if (UNLIKELY(s >= reginfo->strend)) {
2539 break;
2540 }
64935bc6
KW
2541 }
2542
2543 if (utf8_target) {
85e5f08b 2544 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2545 reghop3((U8*)s, -1,
2546 (U8*)(reginfo->strbeg)),
2547 (U8*) reginfo->strend);
2548 while (s < strend) {
85e5f08b 2549 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6 2550 (U8*) reginfo->strend);
b0e24409
KW
2551 if ( (to_complement ^ isGCB(before,
2552 after,
2553 (U8*) reginfo->strbeg,
2554 (U8*) s,
2555 utf8_target))
00e3344b
KW
2556 && (reginfo->intuit || regtry(reginfo, &s)))
2557 {
2558 goto got_it;
64935bc6 2559 }
43a7bd62 2560 before = after;
64935bc6
KW
2561 s += UTF8SKIP(s);
2562 }
2563 }
2564 else { /* Not utf8. Everything is a GCB except between CR and
2565 LF */
2566 while (s < strend) {
00e3344b
KW
2567 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2568 || UCHARAT(s) != '\n'))
2569 && (reginfo->intuit || regtry(reginfo, &s)))
64935bc6 2570 {
00e3344b 2571 goto got_it;
64935bc6 2572 }
43a7bd62 2573 s++;
64935bc6
KW
2574 }
2575 }
2576
6de80efc
KW
2577 /* And, since this is a bound, it can match after the final
2578 * character in the string */
67481c39 2579 if ((reginfo->intuit || regtry(reginfo, &s))) {
64935bc6
KW
2580 goto got_it;
2581 }
2582 break;
ae3bb8ea 2583
6b659339
KW
2584 case LB_BOUND:
2585 if (s == reginfo->strbeg) {
2586 if (reginfo->intuit || regtry(reginfo, &s)) {
2587 goto got_it;
2588 }
2589 s += (utf8_target) ? UTF8SKIP(s) : 1;
2590 if (UNLIKELY(s >= reginfo->strend)) {
2591 break;
2592 }
2593 }
2594
2595 if (utf8_target) {
2596 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2597 -1,
2598 (U8*)(reginfo->strbeg)),
2599 (U8*) reginfo->strend);
2600 while (s < strend) {
2601 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2602 if (to_complement ^ isLB(before,
2603 after,
2604 (U8*) reginfo->strbeg,
2605 (U8*) s,
2606 (U8*) reginfo->strend,
2607 utf8_target)
2608 && (reginfo->intuit || regtry(reginfo, &s)))
2609 {
2610 goto got_it;
2611 }
2612 before = after;
2613 s += UTF8SKIP(s);
2614 }
2615 }
2616 else { /* Not utf8. */
2617 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2618 while (s < strend) {
2619 LB_enum after = getLB_VAL_CP((U8) *s);
2620 if (to_complement ^ isLB(before,
2621 after,
2622 (U8*) reginfo->strbeg,
2623 (U8*) s,
2624 (U8*) reginfo->strend,
2625 utf8_target)
2626 && (reginfo->intuit || regtry(reginfo, &s)))
2627 {
2628 goto got_it;
2629 }
2630 before = after;
2631 s++;
2632 }
2633 }
2634
2635 if (reginfo->intuit || regtry(reginfo, &s)) {
2636 goto got_it;
2637 }
2638
2639 break;
2640
06ae2722 2641 case SB_BOUND:
a7a8bd1e 2642 if (s == reginfo->strbeg) {
67481c39 2643 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2644 goto got_it;
2645 }
06ae2722 2646 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2647 if (UNLIKELY(s >= reginfo->strend)) {
2648 break;
2649 }
06ae2722
KW
2650 }
2651
2652 if (utf8_target) {
85e5f08b 2653 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2654 -1,
2655 (U8*)(reginfo->strbeg)),
2656 (U8*) reginfo->strend);
2657 while (s < strend) {
85e5f08b 2658 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722 2659 (U8*) reginfo->strend);
00e3344b
KW
2660 if ((to_complement ^ isSB(before,
2661 after,
2662 (U8*) reginfo->strbeg,
2663 (U8*) s,
2664 (U8*) reginfo->strend,
2665 utf8_target))
2666 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2667 {
00e3344b 2668 goto got_it;
06ae2722 2669 }
43a7bd62 2670 before = after;
06ae2722
KW
2671 s += UTF8SKIP(s);
2672 }
2673 }
2674 else { /* Not utf8. */
85e5f08b 2675 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2676 while (s < strend) {
85e5f08b 2677 SB_enum after = getSB_VAL_CP((U8) *s);
00e3344b
KW
2678 if ((to_complement ^ isSB(before,
2679 after,
2680 (U8*) reginfo->strbeg,
2681 (U8*) s,
2682 (U8*) reginfo->strend,
2683 utf8_target))
2684 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2685 {
00e3344b 2686 goto got_it;
06ae2722 2687 }
43a7bd62 2688 before = after;
06ae2722
KW
2689 s++;
2690 }
2691 }
2692
2693 /* Here are at the final position in the target string. The SB
2694 * value is always true here, so matches, depending on other
2695 * constraints */
67481c39 2696 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2697 goto got_it;
2698 }
2699
2700 break;
2701
ae3bb8ea
KW
2702 case WB_BOUND:
2703 if (s == reginfo->strbeg) {
67481c39 2704 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2705 goto got_it;
2706 }
2707 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2708 if (UNLIKELY(s >= reginfo->strend)) {
2709 break;
2710 }
ae3bb8ea
KW
2711 }
2712
2713 if (utf8_target) {
2714 /* We are at a boundary between char_sub_0 and char_sub_1.
2715 * We also keep track of the value for char_sub_-1 as we
2716 * loop through the line. Context may be needed to make a
2717 * determination, and if so, this can save having to
2718 * recalculate it */
85e5f08b
KW
2719 WB_enum previous = WB_UNKNOWN;
2720 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2721 reghop3((U8*)s,
2722 -1,
2723 (U8*)(reginfo->strbeg)),
2724 (U8*) reginfo->strend);
2725 while (s < strend) {
85e5f08b 2726 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea 2727 (U8*) reginfo->strend);
00e3344b
KW
2728 if ((to_complement ^ isWB(previous,
2729 before,
2730 after,
2731 (U8*) reginfo->strbeg,
2732 (U8*) s,
2733 (U8*) reginfo->strend,
2734 utf8_target))
2735 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2736 {
00e3344b 2737 goto got_it;
ae3bb8ea 2738 }
43a7bd62
KW
2739 previous = before;
2740 before = after;
ae3bb8ea
KW
2741 s += UTF8SKIP(s);
2742 }
2743 }
2744 else { /* Not utf8. */
85e5f08b
KW
2745 WB_enum previous = WB_UNKNOWN;
2746 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2747 while (s < strend) {
85e5f08b 2748 WB_enum after = getWB_VAL_CP((U8) *s);
00e3344b
KW
2749 if ((to_complement ^ isWB(previous,
2750 before,
2751 after,
2752 (U8*) reginfo->strbeg,
2753 (U8*) s,
2754 (U8*) reginfo->strend,
2755 utf8_target))
2756 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2757 {
00e3344b 2758 goto got_it;
ae3bb8ea 2759 }
43a7bd62
KW
2760 previous = before;
2761 before = after;
ae3bb8ea
KW
2762 s++;
2763 }
2764 }
2765
67481c39 2766 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2767 goto got_it;
2768 }
64935bc6 2769 }
73104a1b 2770 break;
64935bc6 2771
73104a1b
KW
2772 case LNBREAK:
2773 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2774 is_LNBREAK_latin1_safe(s, strend)
2775 );
2776 break;
3018b823 2777
aff4cafe 2778 case ASCII:
a9448551 2779 REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
aff4cafe
KW
2780 break;
2781
2782 case NASCII:
a9448551
KW
2783 if (utf8_target) {
2784 REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2785 utf8_target));
2786 }
2787 else {
2788 REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2789 utf8_target));
aff4cafe
KW
2790 }
2791
2792 break;
2793
3018b823
KW
2794 /* The argument to all the POSIX node types is the class number to pass to
2795 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2796
2797 case NPOSIXL:
2798 to_complement = 1;
2799 /* FALLTHROUGH */
2800
2801 case POSIXL:
780fcc9f 2802 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a78c2fa6 2803 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
3018b823 2804 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2805 break;
3018b823
KW
2806
2807 case NPOSIXD:
2808 to_complement = 1;
2809 /* FALLTHROUGH */
2810
2811 case POSIXD:
2812 if (utf8_target) {
2813 goto posix_utf8;
2814 }
2815 goto posixa;
2816
2817 case NPOSIXA:
2818 if (utf8_target) {
2819 /* The complement of something that matches only ASCII matches all
837226c8 2820 * non-ASCII, plus everything in ASCII that isn't in the class. */
da10aa09
KW
2821 REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend)
2822 || ! _generic_isCC_A(*s, FLAGS(c)));
3018b823
KW
2823 break;
2824 }
2825
2826 to_complement = 1;
4a6c6db5 2827 goto posixa;
3018b823 2828
73104a1b
KW
2829 case POSIXA:
2830 /* Don't need to worry about utf8, as it can match only a single
4a6c6db5
KW
2831 * byte invariant character. But we do anyway for performance reasons,
2832 * as otherwise we would have to examine all the continuation
2833 * characters */
2834 if (utf8_target) {
da10aa09 2835 REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
4a6c6db5
KW
2836 break;
2837 }
2838
2839 posixa:
da10aa09 2840 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
3018b823 2841 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2842 break;
3018b823
KW
2843
2844 case NPOSIXU:
2845 to_complement = 1;
2846 /* FALLTHROUGH */
2847
2848 case POSIXU:
2849 if (! utf8_target) {
da10aa09
KW
2850 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2851 to_complement ^ cBOOL(_generic_isCC(*s,
3018b823
KW
2852 FLAGS(c))));
2853 }
2854 else {
2855
c52b8b12 2856 posix_utf8:
3018b823 2857 classnum = (_char_class_number) FLAGS(c);
8d692afb
KW
2858 switch (classnum) {
2859 default:
2860 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2861 to_complement ^ cBOOL(_invlist_contains_cp(
2862 PL_XPosix_ptrs[classnum],
2863 utf8_to_uvchr_buf((U8 *) s,
2864 (U8 *) strend,
2865 NULL))));
2866 break;
779cf272 2867 case _CC_ENUM_SPACE:
da10aa09 2868 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
7a207065 2869 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3018b823
KW
2870 break;
2871
2872 case _CC_ENUM_BLANK:
da10aa09 2873 REXEC_FBC_CLASS_SCAN(1,
7a207065 2874 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3018b823
KW
2875 break;
2876
2877 case _CC_ENUM_XDIGIT:
da10aa09 2878 REXEC_FBC_CLASS_SCAN(1,
7a207065 2879 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3018b823
KW
2880 break;
2881
2882 case _CC_ENUM_VERTSPACE:
da10aa09 2883 REXEC_FBC_CLASS_SCAN(1,
7a207065 2884 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3018b823
KW
2885 break;
2886
2887 case _CC_ENUM_CNTRL:
da10aa09 2888 REXEC_FBC_CLASS_SCAN(1,
7a207065 2889 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3018b823 2890 break;
3018b823
KW
2891 }
2892 }
2893 break;
2894
73104a1b
KW
2895 case AHOCORASICKC:
2896 case AHOCORASICK:
2897 {
2898 DECL_TRIE_TYPE(c);
2899 /* what trie are we using right now */
2900 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2901 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2902 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2903
2904 const char *last_start = strend - trie->minlen;
6148ee25 2905#ifdef DEBUGGING
73104a1b 2906 const char *real_start = s;
6148ee25 2907#endif
73104a1b
KW
2908 STRLEN maxlen = trie->maxlen;
2909 SV *sv_points;
2910 U8 **points; /* map of where we were in the input string
2911 when reading a given char. For ASCII this
2912 is unnecessary overhead as the relationship
2913 is always 1:1, but for Unicode, especially
2914 case folded Unicode this is not true. */
2915 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2916 U8 *bitmap=NULL;
2917
2918
2919 GET_RE_DEBUG_FLAGS_DECL;
2920
2921 /* We can't just allocate points here. We need to wrap it in
2922 * an SV so it gets freed properly if there is a croak while
2923 * running the match */
2924 ENTER;
2925 SAVETMPS;
2926 sv_points=newSV(maxlen * sizeof(U8 *));
2927 SvCUR_set(sv_points,
2928 maxlen * sizeof(U8 *));
2929 SvPOK_on(sv_points);
2930 sv_2mortal(sv_points);
2931 points=(U8**)SvPV_nolen(sv_points );
2932 if ( trie_type != trie_utf8_fold
2933 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2934 {
2935 if (trie->bitmap)
2936 bitmap=(U8*)trie->bitmap;
2937 else
2938 bitmap=(U8*)ANYOF_BITMAP(c);
2939 }
2940 /* this is the Aho-Corasick algorithm modified a touch
2941 to include special handling for long "unknown char" sequences.
2942 The basic idea being that we use AC as long as we are dealing
2943 with a possible matching char, when we encounter an unknown char
2944 (and we have not encountered an accepting state) we scan forward
2945 until we find a legal starting char.
2946 AC matching is basically that of trie matching, except that when
2947 we encounter a failing transition, we fall back to the current
2948 states "fail state", and try the current char again, a process
2949 we repeat until we reach the root state, state 1, or a legal
2950 transition. If we fail on the root state then we can either
2951 terminate if we have reached an accepting state previously, or
2952 restart the entire process from the beginning if we have not.
2953
2954 */
2955 while (s <= last_start) {
2956 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2957 U8 *uc = (U8*)s;
2958 U16 charid = 0;
2959 U32 base = 1;
2960 U32 state = 1;
2961 UV uvc = 0;
2962 STRLEN len = 0;
2963 STRLEN foldlen = 0;
2964 U8 *uscan = (U8*)NULL;
2965 U8 *leftmost = NULL;
2966#ifdef DEBUGGING
2967 U32 accepted_word= 0;
786e8c11 2968#endif
73104a1b
KW
2969 U32 pointpos = 0;
2970
2971 while ( state && uc <= (U8*)strend ) {
2972 int failed=0;
2973 U32 word = aho->states[ state ].wordnum;
2974
2975 if( state==1 ) {
2976 if ( bitmap ) {
2977 DEBUG_TRIE_EXECUTE_r(
2978 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2979 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2980 (char *)uc, utf8_target, 0 );
6ad9a8ab 2981 Perl_re_printf( aTHX_
73104a1b
KW
2982 " Scanning for legal start char...\n");
2983 }
2984 );
2985 if (utf8_target) {
2986 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2987 uc += UTF8SKIP(uc);
2988 }
2989 } else {
2990 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2991 uc++;
2992 }
786e8c11 2993 }
73104a1b 2994 s= (char *)uc;
07be1b83 2995 }
73104a1b
KW
2996 if (uc >(U8*)last_start) break;
2997 }
2998
2999 if ( word ) {
3000 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
3001 if (!leftmost || lpos < leftmost) {
3002 DEBUG_r(accepted_word=word);
3003 leftmost= lpos;
7016d6eb 3004 }
73104a1b 3005 if (base==0) break;
7016d6eb 3006
73104a1b
KW
3007 }
3008 points[pointpos++ % maxlen]= uc;
3009 if (foldlen || uc < (U8*)strend) {
9ad8cac4
KW
3010 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3011 (U8 *) strend, uscan, len, uvc,
3012 charid, foldlen, foldbuf,
3013 uniflags);
73104a1b
KW
3014 DEBUG_TRIE_EXECUTE_r({
3015 dump_exec_pos( (char *)uc, c, strend,
cb41e5d6 3016 real_start, s, utf8_target, 0);
6ad9a8ab 3017 Perl_re_printf( aTHX_
147e3846 3018 " Charid:%3u CP:%4" UVxf " ",
73104a1b
KW
3019 charid, uvc);
3020 });
3021 }
3022 else {
3023 len = 0;
3024 charid = 0;
3025 }
07be1b83 3026
73104a1b
KW
3027
3028 do {
6148ee25 3029#ifdef DEBUGGING
73104a1b 3030 word = aho->states[ state ].wordnum;
6148ee25 3031#endif
73104a1b
KW
3032 base = aho->states[ state ].trans.base;
3033
3034 DEBUG_TRIE_EXECUTE_r({
3035 if (failed)
3036 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 3037 s, utf8_target, 0 );
6ad9a8ab 3038 Perl_re_printf( aTHX_
147e3846 3039 "%sState: %4" UVxf ", word=%" UVxf,
73104a1b
KW
3040 failed ? " Fail transition to " : "",
3041 (UV)state, (UV)word);
3042 });
3043 if ( base ) {
3044 U32 tmp;
3045 I32 offset;
3046 if (charid &&
3047 ( ((offset = base + charid
3048 - 1 - trie->uniquecharcount)) >= 0)
3049 && ((U32)offset < trie->lasttrans)
3050 && trie->trans[offset].check == state
3051 && (tmp=trie->trans[offset].next))
3052 {
3053 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3054 Perl_re_printf( aTHX_ " - legal\n"));
73104a1b
KW
3055 state = tmp;
3056 break;
07be1b83
YO
3057 }
3058 else {
786e8c11 3059 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3060 Perl_re_printf( aTHX_ " - fail\n"));
786e8c11 3061 failed = 1;
73104a1b 3062 state = aho->fail[state];
07be1b83 3063 }
07be1b83 3064 }
73104a1b
KW
3065 else {
3066 /* we must be accepting here */
3067 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3068 Perl_re_printf( aTHX_ " - accepting\n"));
73104a1b
KW
3069 failed = 1;
3070 break;
786e8c11 3071 }
73104a1b
KW
3072 } while(state);
3073 uc += len;
3074 if (failed) {
3075 if (leftmost)
3076 break;
3077 if (!state) state = 1;
07be1b83 3078 }
73104a1b
KW
3079 }
3080 if ( aho->states[ state ].wordnum ) {
3081 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3082 if (!leftmost || lpos < leftmost) {
3083 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3084 leftmost = lpos;
07be1b83
YO
3085 }
3086 }
73104a1b
KW
3087 if (leftmost) {
3088 s = (char*)leftmost;
3089 DEBUG_TRIE_EXECUTE_r({
147e3846 3090 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
73104a1b
KW
3091 (UV)accepted_word, (IV)(s - real_start)
3092 );
3093 });
02d5137b 3094 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
3095 FREETMPS;
3096 LEAVE;
3097 goto got_it;
3098 }
3099 s = HOPc(s,1);
3100 DEBUG_TRIE_EXECUTE_r({
6ad9a8ab 3101 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
73104a1b
KW
3102 });
3103 } else {
3104 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3105 Perl_re_printf( aTHX_ "No match.\n"));
73104a1b
KW
3106 break;
3107 }
3108 }
3109 FREETMPS;
3110 LEAVE;
3111 }
3112 break;
3113 default:
3114 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
3115 }
3116 return 0;
3117 got_it:
3118 return s;
6eb5f6b9
JH
3119}
3120
60165aa4
DM
3121/* set RX_SAVED_COPY, RX_SUBBEG etc.
3122 * flags have same meanings as with regexec_flags() */
3123
749f4950
DM
3124static void
3125S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
3126 char *strbeg,
3127 char *strend,
3128 SV *sv,
3129 U32 flags,
3130 bool utf8_target)
3131{
3132 struct regexp *const prog = ReANY(rx);
3133
60165aa4
DM
3134 if (flags & REXEC_COPY_STR) {
3135#ifdef PERL_ANY_COW
3136 if (SvCANCOW(sv)) {
eb8fc9fe 3137 DEBUG_C(Perl_re_printf( aTHX_
60165aa4 3138 "Copy on write: regexp capture, type %d\n",
eb8fc9fe 3139 (int) SvTYPE(sv)));
5411a0e5
DM
3140 /* Create a new COW SV to share the match string and store
3141 * in saved_copy, unless the current COW SV in saved_copy
3142 * is valid and suitable for our purpose */
3143 if (( prog->saved_copy
3144 && SvIsCOW(prog->saved_copy)
3145 && SvPOKp(prog->saved_copy)
3146 && SvIsCOW(sv)
3147 && SvPOKp(sv)
3148 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 3149 {
5411a0e5
DM
3150 /* just reuse saved_copy SV */
3151 if (RXp_MATCH_COPIED(prog)) {
3152 Safefree(prog->subbeg);
3153 RXp_MATCH_COPIED_off(prog);
3154 }
3155 }
3156 else {
3157 /* create new COW SV to share string */
196a02af 3158 RXp_MATCH_COPY_FREE(prog);
a76b0e90 3159 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 3160 }
5411a0e5
DM
3161 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3162 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
3163 prog->sublen = strend - strbeg;
3164 prog->suboffset = 0;
3165 prog->subcoffset = 0;
3166 } else
3167#endif
3168 {
99a90e59
FC
3169 SSize_t min = 0;
3170 SSize_t max = strend - strbeg;
ea3daa5d 3171 SSize_t sublen;
60165aa4
DM
3172
3173 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 3174 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3175 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3176 ) { /* don't copy $' part of string */
3177 U32 n = 0;
3178 max = -1;
3179 /* calculate the right-most part of the string covered
f67a5002 3180 * by a capture. Due to lookahead, this may be to
60165aa4
DM
3181 * the right of $&, so we have to scan all captures */
3182 while (n <= prog->lastparen) {
3183 if (prog->offs[n].end > max)
3184 max = prog->offs[n].end;
3185 n++;
3186 }
3187 if (max == -1)
3188 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3189 ? prog->offs[0].start
3190 : 0;
3191 assert(max >= 0 && max <= strend - strbeg);
3192 }
3193
3194 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 3195 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3196 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3197 ) { /* don't copy $` part of string */
3198 U32 n = 0;
3199 min = max;
3200 /* calculate the left-most part of the string covered
f67a5002 3201 * by a capture. Due to lookbehind, this may be to
60165aa4
DM
3202 * the left of $&, so we have to scan all captures */
3203 while (min && n <= prog->lastparen) {
3204 if ( prog->offs[n].start != -1
3205 && prog->offs[n].start < min)
3206 {
3207 min = prog->offs[n].start;
3208 }
3209 n++;
3210 }
3211 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3212 && min > prog->offs[0].end
3213 )
3214 min = prog->offs[0].end;
3215
3216 }
3217
3218 assert(min >= 0 && min <= max && min <= strend - strbeg);
3219 sublen = max - min;
3220
196a02af 3221 if (RXp_MATCH_COPIED(prog)) {
60165aa4
DM
3222 if (sublen > prog->sublen)
3223 prog->subbeg =
3224 (char*)saferealloc(prog->subbeg, sublen+1);
3225 }
3226 else
3227 prog->subbeg = (char*)safemalloc(sublen+1);
3228 Copy(strbeg + min, prog->subbeg, sublen, char);
3229 prog->subbeg[sublen] = '\0';
3230 prog->suboffset = min;
3231 prog->sublen = sublen;
196a02af 3232 RXp_MATCH_COPIED_on(prog);
60165aa4
DM
3233 }
3234 prog->subcoffset = prog->suboffset;
3235 if (prog->suboffset && utf8_target) {
3236 /* Convert byte offset to chars.
3237 * XXX ideally should only compute this if @-/@+
3238 * has been seen, a la PL_sawampersand ??? */
3239
3240 /* If there's a direct correspondence between the
3241 * string which we're matching and the original SV,
3242 * then we can use the utf8 len cache associated with
3243 * the SV. In particular, it means that under //g,
3244 * sv_pos_b2u() will use the previously cached
3245 * position to speed up working out the new length of
3246 * subcoffset, rather than counting from the start of
3247 * the string each time. This stops
3248 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3249 * from going quadratic */
3250 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
3251 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3252 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
3253 else
3254 prog->subcoffset = utf8_length((U8*)strbeg,
3255 (U8*)(strbeg+prog->suboffset));
3256 }
3257 }
3258 else {
196a02af 3259 RXp_MATCH_COPY_FREE(prog);
60165aa4
DM
3260 prog->subbeg = strbeg;
3261 prog->suboffset = 0;
3262 prog->subcoffset = 0;
3263 prog->sublen = strend - strbeg;
3264 }
3265}
3266
3267
3268
fae667d5 3269
6eb5f6b9
JH
3270/*
3271 - regexec_flags - match a regexp against a string
3272 */
3273I32
5aaab254 3274Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 3275 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
3276/* stringarg: the point in the string at which to begin matching */
3277/* strend: pointer to null at end of string */
3278/* strbeg: real beginning of string */
3279/* minend: end of match must be >= minend bytes after stringarg. */
3280/* sv: SV being matched: only used for utf8 flag, pos() etc; string
3281 * itself is accessed via the pointers above */
3282/* data: May be used for some additional optimizations.
d058ec57 3283 Currently unused. */
a340edde 3284/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 3285
6eb5f6b9 3286{
8d919b0a 3287 struct regexp *const prog = ReANY(rx);
5aaab254 3288 char *s;
eb578fdb 3289 regnode *c;
03c83e26 3290 char *startpos;
ea3daa5d
FC
3291 SSize_t minlen; /* must match at least this many chars */
3292 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 3293 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 3294 I32 multiline;
f8fc2ecf 3295 RXi_GET_DECL(prog,progi);
02d5137b
DM
3296 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3297 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 3298 regexp_paren_pair *swap = NULL;
006f26b2 3299 I32 oldsave;
a3621e74
YO
3300 GET_RE_DEBUG_FLAGS_DECL;
3301
7918f24d 3302 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 3303 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
3304
3305 /* Be paranoid... */
3dc78631 3306 if (prog == NULL) {
6eb5f6b9 3307 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
3308 }
3309
6c3fea77 3310 DEBUG_EXECUTE_r(
03c83e26 3311 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
3312 "Matching");
3313 );
8adc0f72 3314
b342a604
DM
3315 startpos = stringarg;
3316
4cf1a867
DM
3317 /* set these early as they may be used by the HOP macros below */
3318 reginfo->strbeg = strbeg;
3319 reginfo->strend = strend;
3320 reginfo->is_utf8_target = cBOOL(utf8_target);
3321
58430ea8 3322 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
3323 MAGIC *mg;
3324
fef7148b
DM
3325 /* set reginfo->ganch, the position where \G can match */
3326
3327 reginfo->ganch =
3328 (flags & REXEC_IGNOREPOS)
3329 ? stringarg /* use start pos rather than pos() */
3dc78631 3330 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
3331 /* Defined pos(): */
3332 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
3333 : strbeg; /* pos() not defined; use start of string */
3334
6ad9a8ab 3335 DEBUG_GPOS_r(Perl_re_printf( aTHX_
147e3846 3336 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 3337
03c83e26
DM
3338 /* in the presence of \G, we may need to start looking earlier in
3339 * the string than the suggested start point of stringarg:
0b2c2a84 3340 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
3341 * offset, such as
3342 * /..\G/: gofs = 2
3343 * /ab|c\G/: gofs = 1
3344 * or if the minimum offset isn't known, then we have to go back
3345 * to the start of the string, e.g. /w+\G/
3346 */
2bfbe302 3347
8e1490ee 3348 if (prog->intflags & PREGf_ANCH_GPOS) {
4cf1a867
DM
3349 if (prog->gofs) {
3350 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3351 if (!startpos ||
3352 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3353 {
6ad9a8ab 3354 DEBUG_r(Perl_re_printf( aTHX_
4cf1a867
DM
3355 "fail: ganch-gofs before earliest possible start\n"));
3356 return 0;
3357 }
2bfbe302 3358 }
4cf1a867
DM
3359 else
3360 startpos = reginfo->ganch;
2bfbe302
DM
3361 }
3362 else if (prog->gofs) {
4cf1a867
DM
3363 startpos = HOPBACKc(startpos, prog->gofs);
3364 if (!startpos)
b342a604 3365 startpos = strbeg;
03c83e26 3366 }
58430ea8 3367 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 3368 startpos = strbeg;
03c83e26
DM
3369 }
3370
3371 minlen = prog->minlen;
b342a604 3372 if ((startpos + minlen) > strend || startpos < strbeg) {
6ad9a8ab 3373 DEBUG_r(Perl_re_printf( aTHX_
03c83e26
DM
3374 "Regex match can't succeed, so not even tried\n"));
3375 return 0;
3376 }
3377
63a3746a
DM
3378 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3379 * which will call destuctors to reset PL_regmatch_state, free higher
3380 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3381 * regmatch_info_aux_eval */
3382
3383 oldsave = PL_savestack_ix;
3384
dfa77d06
DM
3385 s = startpos;
3386
e322109a 3387 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
3388 && !(flags & REXEC_CHECKED))
3389 {
dfa77d06 3390 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 3391 flags, NULL);
dfa77d06 3392 if (!s)
7fadf4a7
DM
3393 return 0;
3394
e322109a 3395 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
3396 /* we can match based purely on the result of INTUIT.
3397 * Set up captures etc just for $& and $-[0]
3398 * (an intuit-only match wont have $1,$2,..) */
3399 assert(!prog->nparens);
d5e7783a
DM
3400
3401 /* s/// doesn't like it if $& is earlier than where we asked it to
3402 * start searching (which can happen on something like /.\G/) */
3403 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3404 && (s < stringarg))
3405 {
3406 /* this should only be possible under \G */
58430ea8 3407 assert(prog->intflags & PREGf_GPOS_SEEN);
6ad9a8ab 3408 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d5e7783a
DM
3409 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3410 goto phooey;
3411 }
3412
7fadf4a7
DM
3413 /* match via INTUIT shouldn't have any captures.
3414 * Let @-, @+, $^N know */
3415 prog->lastparen = prog->lastcloseparen = 0;
196a02af 3416 RXp_MATCH_UTF8_set(prog, utf8_target);
3ff69bd6
DM
3417 prog->offs[0].start = s - strbeg;
3418 prog->offs[0].end = utf8_target
3419 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3420 : s - strbeg + prog->minlenret;
7fadf4a7 3421 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3422 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
3423 strbeg, strend,
3424 sv, flags, utf8_target);
3425
7fadf4a7
DM
3426 return 1;
3427 }
3428 }
3429
6c3fea77 3430 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 3431
dfa77d06 3432 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
6ad9a8ab 3433 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a72c7584
JH
3434 "String too short [regexec_flags]...\n"));
3435 goto phooey;
1aa99e6b 3436 }
1de06328 3437
6eb5f6b9 3438 /* Check validity of program. */
f8fc2ecf 3439 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
3440 Perl_croak(aTHX_ "corrupted regexp program");
3441 }
3442
196a02af
DM
3443 RXp_MATCH_TAINTED_off(prog);
3444 RXp_MATCH_UTF8_set(prog, utf8_target);
1738e041 3445
6c3fea77
DM
3446 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3447 reginfo->intuit = 0;
02d5137b
DM
3448 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3449 reginfo->warned = FALSE;
02d5137b 3450 reginfo->sv = sv;
1cb48e53 3451 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
6eb5f6b9 3452 /* see how far we have to get to not match where we matched before */
fe3974be 3453 reginfo->till = stringarg + minend;
6eb5f6b9 3454
60779a30 3455 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
3456 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3457 S_cleanup_regmatch_info_aux has executed (registered by
3458 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3459 magic belonging to this SV.
3460 Not newSVsv, either, as it does not COW.
3461 */
3462 reginfo->sv = newSV(0);
4cba5ac0 3463 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
3464 SAVEFREESV(reginfo->sv);
3465 }
3466
331b2dcc
DM
3467 /* reserve next 2 or 3 slots in PL_regmatch_state:
3468 * slot N+0: may currently be in use: skip it
3469 * slot N+1: use for regmatch_info_aux struct
3470 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3471 * slot N+3: ready for use by regmatch()
3472 */
bf2039a9 3473
331b2dcc
DM
3474 {
3475 regmatch_state *old_regmatch_state;
3476 regmatch_slab *old_regmatch_slab;
3477 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3478
3479 /* on first ever match, allocate first slab */
3480 if (!PL_regmatch_slab) {
3481 Newx(PL_regmatch_slab, 1, regmatch_slab);
3482 PL_regmatch_slab->prev = NULL;
3483 PL_regmatch_slab->next = NULL;
3484 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3485 }
bf2039a9 3486
331b2dcc
DM
3487 old_regmatch_state = PL_regmatch_state;
3488 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 3489
331b2dcc
DM
3490 for (i=0; i <= max; i++) {
3491 if (i == 1)
3492 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3493 else if (i ==2)
3494 reginfo->info_aux_eval =
3495 reginfo->info_aux->info_aux_eval =
3496 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3497
331b2dcc
DM
3498 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3499 PL_regmatch_state = S_push_slab(aTHX);
3500 }
bf2039a9 3501
331b2dcc
DM
3502 /* note initial PL_regmatch_state position; at end of match we'll
3503 * pop back to there and free any higher slabs */
bf2039a9 3504
331b2dcc
DM
3505 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3506 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3507 reginfo->info_aux->poscache = NULL;
bf2039a9 3508
331b2dcc 3509 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 3510