This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
S_find_byclass() Restructure bounds checking
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
e7a474c0 95static const char non_utf8_target_but_utf8_required[]
e1cf74e3
CB
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
7b031478 99#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
6ad9a8ab 100 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
7b031478 101 goto target; \
e1cf74e3
CB
102} STMT_END
103
c74f6de9
KW
104#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 115
3dab1dad 116#define HOPc(pos,off) \
ba44c216 117 (char *)(reginfo->is_utf8_target \
220db18a 118 ? reghop3((U8*)pos, off, \
9d9163fb 119 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 120 : (U8*)(pos + off))
557f47af 121
bb152a4b
DM
122/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
123#define HOPBACK3(pos, off, lim) \
124 (reginfo->is_utf8_target \
125 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
126 : (pos - off >= lim) \
127 ? (U8*)pos - off \
3dab1dad 128 : NULL)
efb30f32 129
bb152a4b
DM
130#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
131
ba44c216 132#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 133#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 134
557f47af
DM
135/* lim must be +ve. Returns NULL on overshoot */
136#define HOPMAYBE3(pos,off,lim) \
137 (reginfo->is_utf8_target \
138 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
139 : ((U8*)pos + off <= lim) \
140 ? (U8*)pos + off \
141 : NULL)
142
8e9f2289
DM
143/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
144 * off must be >=0; args should be vars rather than expressions */
145#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
146 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
147 : (U8*)((pos + off) > lim ? lim : (pos + off)))
67853908 148#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
8e9f2289 149
2974eaec
DM
150#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
151 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
152 : (U8*)(pos + off))
153#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb 154
c7304fe2 155#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
156/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
157
5f80c4cf 158/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
159/* it would be nice to rework regcomp.sym to generate this stuff. sigh
160 *
161 * NOTE that *nothing* that affects backtracking should be in here, specifically
162 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
163 * node that is in between two EXACT like nodes when ascertaining what the required
164 * "follow" character is. This should probably be moved to regex compile time
165 * although it may be done at run time beause of the REF possibility - more
166 * investigation required. -- demerphq
167*/
baa60164
KW
168#define JUMPABLE(rn) ( \
169 OP(rn) == OPEN || \
24be3102
YO
170 (OP(rn) == CLOSE && \
171 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
baa60164
KW
172 OP(rn) == EVAL || \
173 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
174 OP(rn) == PLUS || OP(rn) == MINMOD || \
175 OP(rn) == KEEPS || \
176 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 177)
ee9b8eae 178#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 179
ee9b8eae
YO
180#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
181
a84d97b6
HS
182/*
183 Search for mandatory following text node; for lookahead, the text must
184 follow but for lookbehind (rn->flags != 0) we skip to the next step.
185*/
baa60164 186#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
187 while (JUMPABLE(rn)) { \
188 const OPCODE type = OP(rn); \
189 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 190 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 191 else if (type == PLUS) \
cca55fe3 192 rn = NEXTOPER(rn); \
3dab1dad 193 else if (type == IFMATCH) \
a84d97b6 194 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 195 else rn += NEXT_OFF(rn); \
3dab1dad 196 } \
5f80c4cf 197} STMT_END
74750237 198
006f26b2
DM
199#define SLAB_FIRST(s) (&(s)->states[0])
200#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
201
a75351a1 202static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 203static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 204static regmatch_state * S_push_slab(pTHX);
51371543 205
87c0511b 206#define REGCP_PAREN_ELEMS 3
f067efbf 207#define REGCP_OTHER_ELEMS 3
e0fa7e2b 208#define REGCP_FRAME_ELEMS 1
620d5b66
NC
209/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
210 * are needed for the regexp context stack bookkeeping. */
211
76e3520e 212STATIC CHECKPOINT
21553840 213S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
a0d0e21e 214{
a3b680e6 215 const int retval = PL_savestack_ix;
92da3157
DM
216 const int paren_elems_to_push =
217 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
218 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
219 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 220 I32 p;
271b36b1 221 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a0d0e21e 222
b93070ed
DM
223 PERL_ARGS_ASSERT_REGCPPUSH;
224
e49a9654 225 if (paren_elems_to_push < 0)
e8a85d26
JH
226 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
227 (int)paren_elems_to_push, (int)maxopenparen,
228 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 229
e0fa7e2b 230 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
147e3846 231 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
5df417d0 232 " out of range (%lu-%ld)",
92da3157
DM
233 total_elems,
234 (unsigned long)maxopenparen,
235 (long)parenfloor);
e0fa7e2b 236
620d5b66 237 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 238
495f47a5 239 DEBUG_BUFFERS_r(
92da3157 240 if ((int)maxopenparen > (int)parenfloor)
2b1a3689 241 Perl_re_exec_indentf( aTHX_
147e3846 242 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
2b1a3689
YO
243 depth,
244 PTR2UV(rex),
495f47a5
DM
245 PTR2UV(rex->offs)
246 );
247 );
92da3157 248 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 249/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
250 SSPUSHIV(rex->offs[p].end);
251 SSPUSHIV(rex->offs[p].start);
1ca2007e 252 SSPUSHINT(rex->offs[p].start_tmp);
2b1a3689 253 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 254 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
2b1a3689
YO
255 depth,
256 (UV)p,
495f47a5
DM
257 (IV)rex->offs[p].start,
258 (IV)rex->offs[p].start_tmp,
259 (IV)rex->offs[p].end
40a82448 260 ));
a0d0e21e 261 }
b1ce53c5 262/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 263 SSPUSHINT(maxopenparen);
b93070ed
DM
264 SSPUSHINT(rex->lastparen);
265 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 266 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 267
a0d0e21e
LW
268 return retval;
269}
270
c277df42 271/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
272#define REGCP_SET(cp) \
273 DEBUG_STATE_r( \
147e3846
KW
274 Perl_re_exec_indentf( aTHX_ \
275 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
cb41e5d6
YO
276 depth, (IV)PL_savestack_ix \
277 ) \
278 ); \
ab3bbdeb 279 cp = PL_savestack_ix
c3464db5 280
ab3bbdeb 281#define REGCP_UNWIND(cp) \
e4f74956 282 DEBUG_STATE_r( \
cb41e5d6 283 if (cp != PL_savestack_ix) \
147e3846
KW
284 Perl_re_exec_indentf( aTHX_ \
285 "Clearing an EVAL scope, savestack=%" \
286 IVdf "..%" IVdf "\n", \
cb41e5d6
YO
287 depth, (IV)(cp), (IV)PL_savestack_ix \
288 ) \
289 ); \
ab3bbdeb 290 regcpblow(cp)
c277df42 291
8e9ed7cc
DM
292/* set the start and end positions of capture ix */
293#define CLOSE_CAPTURE(ix, s, e) \
294 rex->offs[ix].start = s; \
295 rex->offs[ix].end = e; \
3be2a9fd
DM
296 if (ix > rex->lastparen) \
297 rex->lastparen = ix; \
298 rex->lastcloseparen = ix; \
69cd2617 299 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
befca383 300 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
69cd2617
DM
301 depth, \
302 PTR2UV(rex), \
303 PTR2UV(rex->offs), \
8e9ed7cc
DM
304 (UV)ix, \
305 (IV)rex->offs[ix].start, \
3be2a9fd
DM
306 (IV)rex->offs[ix].end, \
307 (UV)rex->lastparen \
69cd2617
DM
308 ))
309
a8d1f4b4 310#define UNWIND_PAREN(lp, lcp) \
befca383
DM
311 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
312 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
313 depth, \
314 PTR2UV(rex), \
315 PTR2UV(rex->offs), \
316 (UV)(lp), \
317 (UV)(rex->lastparen), \
318 (UV)(lcp) \
319 )); \
a8d1f4b4
DM
320 for (n = rex->lastparen; n > lp; n--) \
321 rex->offs[n].end = -1; \
322 rex->lastparen = n; \
323 rex->lastcloseparen = lcp;
324
325
f067efbf 326STATIC void
21553840 327S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
a0d0e21e 328{
e0fa7e2b 329 UV i;
87c0511b 330 U32 paren;
271b36b1 331 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a3621e74 332
7918f24d
NC
333 PERL_ARGS_ASSERT_REGCPPOP;
334
b1ce53c5 335 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 336 i = SSPOPUV;
e0fa7e2b
NC
337 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
338 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
339 rex->lastcloseparen = SSPOPINT;
340 rex->lastparen = SSPOPINT;
92da3157 341 *maxopenparen_p = SSPOPINT;
b1ce53c5 342
620d5b66 343 i -= REGCP_OTHER_ELEMS;
b1ce53c5 344 /* Now restore the parentheses context. */
495f47a5
DM
345 DEBUG_BUFFERS_r(
346 if (i || rex->lastparen + 1 <= rex->nparens)
2b1a3689 347 Perl_re_exec_indentf( aTHX_
147e3846 348 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
2b1a3689
YO
349 depth,
350 PTR2UV(rex),
495f47a5
DM
351 PTR2UV(rex->offs)
352 );
353 );
92da3157 354 paren = *maxopenparen_p;
620d5b66 355 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 356 SSize_t tmps;
1ca2007e 357 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
358 rex->offs[paren].start = SSPOPIV;
359 tmps = SSPOPIV;
b93070ed
DM
360 if (paren <= rex->lastparen)
361 rex->offs[paren].end = tmps;
2b1a3689 362 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 363 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
2b1a3689
YO
364 depth,
365 (UV)paren,
495f47a5
DM
366 (IV)rex->offs[paren].start,
367 (IV)rex->offs[paren].start_tmp,
368 (IV)rex->offs[paren].end,
369 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 370 );
87c0511b 371 paren--;
a0d0e21e 372 }
daf18116 373#if 1
dafc8851
JH
374 /* It would seem that the similar code in regtry()
375 * already takes care of this, and in fact it is in
376 * a better location to since this code can #if 0-ed out
377 * but the code in regtry() is needed or otherwise tests
378 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
379 * (as of patchlevel 7877) will fail. Then again,
380 * this code seems to be necessary or otherwise
225593e1
DM
381 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
382 * --jhi updated by dapm */
b93070ed 383 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 384 if (i > *maxopenparen_p)
b93070ed
DM
385 rex->offs[i].start = -1;
386 rex->offs[i].end = -1;
2b1a3689 387 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 388 " \\%" UVuf ": %s ..-1 undeffing\n",
2b1a3689
YO
389 depth,
390 (UV)i,
92da3157 391 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 392 ));
a0d0e21e 393 }
dafc8851 394#endif
a0d0e21e
LW
395}
396
74088413
DM
397/* restore the parens and associated vars at savestack position ix,
398 * but without popping the stack */
399
400STATIC void
21553840 401S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
74088413
DM
402{
403 I32 tmpix = PL_savestack_ix;
85882954
YO
404 PERL_ARGS_ASSERT_REGCP_RESTORE;
405
74088413 406 PL_savestack_ix = ix;
21553840 407 regcppop(rex, maxopenparen_p);
74088413
DM
408 PL_savestack_ix = tmpix;
409}
410
02db2b7b 411#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 412
9637d2a5
CB
413#ifndef PERL_IN_XSUB_RE
414
24e16d7b
KW
415bool
416Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
31c7f561
KW
417{
418 /* Returns a boolean as to whether or not 'character' is a member of the
419 * Posix character class given by 'classnum' that should be equivalent to a
420 * value in the typedef '_char_class_number'.
421 *
422 * Ideally this could be replaced by a just an array of function pointers
423 * to the C library functions that implement the macros this calls.
424 * However, to compile, the precise function signatures are required, and
a3815e44 425 * these may vary from platform to platform. To avoid having to figure
31c7f561 426 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
427 * which adds an extra layer of function call overhead (unless the C
428 * optimizer strips it away). But we don't particularly care about
429 * performance with locales anyway. */
31c7f561
KW
430
431 switch ((_char_class_number) classnum) {
15861f94 432 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 433 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
434 case _CC_ENUM_ASCII: return isASCII_LC(character);
435 case _CC_ENUM_BLANK: return isBLANK_LC(character);
cee69f79 436 case _CC_ENUM_CASED: return isLOWER_LC(character)
b0d691b2 437 || isUPPER_LC(character);
e8d596e0 438 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
439 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
440 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
441 case _CC_ENUM_LOWER: return isLOWER_LC(character);
442 case _CC_ENUM_PRINT: return isPRINT_LC(character);
443 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 444 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
445 case _CC_ENUM_UPPER: return isUPPER_LC(character);
446 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 447 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
448 default: /* VERTSPACE should never occur in locales */
449 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
450 }
451
e5964223 452 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
453 return FALSE;
454}
455
9637d2a5
CB
456#endif
457
53362e85
KW
458PERL_STATIC_INLINE I32
459S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
460{
461 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
462 * folded. Works on all folds representable without UTF-8, except for
463 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
464 * check that the strings each have at least 'len' characters.
465 *
466 * There is almost an identical API function where s2 need not be folded:
467 * Perl_foldEQ_latin1() */
468
469 const U8 *a = (const U8 *)s1;
470 const U8 *b = (const U8 *)s2;
471
472 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
473
474 assert(len >= 0);
475
476 while (len--) {
477 assert(! isUPPER_L1(*b));
478 if (toLOWER_L1(*a) != *b) {
479 return 0;
480 }
481 a++, b++;
482 }
483 return 1;
484}
485
3018b823 486STATIC bool
a78c2fa6 487S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
3018b823
KW
488{
489 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
490 * 'character' is a member of the Posix character class given by 'classnum'
491 * that should be equivalent to a value in the typedef
492 * '_char_class_number'.
493 *
494 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 495 * the range 0-255. Outside that range, all characters use Unicode
3018b823 496 * rules, ignoring any locale. So use the Unicode function if this class
4c404f26 497 * requires an inversion list, and use the Unicode macro otherwise. */
3018b823 498
1565c085 499
3018b823
KW
500 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
501
502 if (UTF8_IS_INVARIANT(*character)) {
503 return isFOO_lc(classnum, *character);
504 }
505 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
506 return isFOO_lc(classnum,
a62b247b 507 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
508 }
509
a78c2fa6 510 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
613abc6d 511
3018b823 512 switch ((_char_class_number) classnum) {
779cf272 513 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
514 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
515 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
516 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
4d3d8522
KW
517 default:
518 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
a78c2fa6 519 utf8_to_uvchr_buf(character, e, NULL));
3018b823
KW
520 }
521
e1ee3960 522 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
523}
524
2b1f9c71
KW
525STATIC U8 *
526S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
ab1efbdc
KW
527{
528 /* Returns the position of the first byte in the sequence between 's' and
529 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
530 * */
531
532 PERL_ARGS_ASSERT_FIND_SPAN_END;
533
534 assert(send >= s);
535
536 if ((STRLEN) (send - s) >= PERL_WORDSIZE
537 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
538 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
539 {
540 PERL_UINTMAX_T span_word;
541
542 /* Process per-byte until reach word boundary. XXX This loop could be
543 * eliminated if we knew that this platform had fast unaligned reads */
544 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
545 if (*s != span_byte) {
546 return s;
547 }
548 s++;
549 }
550
551 /* Create a word filled with the bytes we are spanning */
552 span_word = PERL_COUNT_MULTIPLIER * span_byte;
553
554 /* Process per-word as long as we have at least a full word left */
555 do {
556
557 /* Keep going if the whole word is composed of 'span_byte's */
558 if ((* (PERL_UINTMAX_T *) s) == span_word) {
559 s += PERL_WORDSIZE;
560 continue;
561 }
562
23a7ee81
KW
563 /* Here, at least one byte in the word isn't 'span_byte'. */
564
565#ifdef EBCDIC
566
567 break;
568
569#else
570
571 /* This xor leaves 1 bits only in those non-matching bytes */
ab1efbdc
KW
572 span_word ^= * (PERL_UINTMAX_T *) s;
573
574 /* Make sure the upper bit of each non-matching byte is set. This
575 * makes each such byte look like an ASCII platform variant byte */
576 span_word |= span_word << 1;
577 span_word |= span_word << 2;
578 span_word |= span_word << 4;
579
580 /* That reduces the problem to what this function solves */
73f0a2eb 581 return s + variant_byte_number(span_word);
ab1efbdc 582
23a7ee81
KW
583#endif
584
ab1efbdc
KW
585 } while (s + PERL_WORDSIZE <= send);
586 }
587
588 /* Process the straggler bytes beyond the final word boundary */
589 while (s < send) {
590 if (*s != span_byte) {
591 return s;
592 }
593 s++;
594 }
595
596 return s;
597}
598
2b1f9c71
KW
599STATIC U8 *
600S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
2813d4ad
KW
601{
602 /* Returns the position of the first byte in the sequence between 's'
603 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
604 * returns 'send' if none found. It uses word-level operations instead of
605 * byte to speed up the process */
606
607 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
608
609 assert(send >= s);
610 assert((byte & mask) == byte);
611
23a7ee81
KW
612#ifndef EBCDIC
613
2813d4ad
KW
614 if ((STRLEN) (send - s) >= PERL_WORDSIZE
615 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
616 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
617 {
1d8aafa0 618 PERL_UINTMAX_T word, mask_word;
2813d4ad
KW
619
620 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
2b1f9c71 621 if (((*s) & mask) == byte) {
2813d4ad
KW
622 return s;
623 }
624 s++;
625 }
626
1d8aafa0
KW
627 word = PERL_COUNT_MULTIPLIER * byte;
628 mask_word = PERL_COUNT_MULTIPLIER * mask;
2813d4ad
KW
629
630 do {
631 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
632
1d8aafa0
KW
633 /* If 'masked' contains bytes with the bit pattern of 'byte' within
634 * it, xoring with 'word' will leave each of the 8 bits in such
635 * bytes be 0, and no byte containing any other bit pattern will be
636 * 0. */
637 masked ^= word;
2813d4ad
KW
638
639 /* This causes the most significant bit to be set to 1 for any
640 * bytes in the word that aren't completely 0 */
641 masked |= masked << 1;
642 masked |= masked << 2;
643 masked |= masked << 4;
644
645 /* The msbits are the same as what marks a byte as variant, so we
646 * can use this mask. If all msbits are 1, the word doesn't
647 * contain 'byte' */
648 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
649 s += PERL_WORDSIZE;
650 continue;
651 }
652
653 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
654 * and any that are, are 0. Complement and re-AND to swap that */
655 masked = ~ masked;
656 masked &= PERL_VARIANTS_WORD_MASK;
657
658 /* This reduces the problem to that solved by this function */
73f0a2eb 659 s += variant_byte_number(masked);
2813d4ad
KW
660 return s;
661
662 } while (s + PERL_WORDSIZE <= send);
663 }
664
23a7ee81
KW
665#endif
666
2813d4ad 667 while (s < send) {
2b1f9c71 668 if (((*s) & mask) == byte) {
2813d4ad
KW
669 return s;
670 }
671 s++;
672 }
673
674 return s;
675}
676
070e8b2e
KW
677STATIC U8 *
678S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
679{
680 /* Returns the position of the first byte in the sequence between 's' and
681 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
682 * 'span_byte' should have been ANDed with 'mask' in the call of this
683 * function. Returns 'send' if none found. Works like find_span_end(),
684 * except for the AND */
685
686 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
687
688 assert(send >= s);
689 assert((span_byte & mask) == span_byte);
690
691 if ((STRLEN) (send - s) >= PERL_WORDSIZE
692 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
693 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
694 {
695 PERL_UINTMAX_T span_word, mask_word;
696
697 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
2b1f9c71 698 if (((*s) & mask) != span_byte) {
070e8b2e
KW
699 return s;
700 }
701 s++;
702 }
703
704 span_word = PERL_COUNT_MULTIPLIER * span_byte;
705 mask_word = PERL_COUNT_MULTIPLIER * mask;
706
707 do {
708 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
709
710 if (masked == span_word) {
711 s += PERL_WORDSIZE;
712 continue;
713 }
714
23a7ee81
KW
715#ifdef EBCDIC
716
717 break;
718
719#else
720
070e8b2e
KW
721 masked ^= span_word;
722 masked |= masked << 1;
723 masked |= masked << 2;
724 masked |= masked << 4;
73f0a2eb 725 return s + variant_byte_number(masked);
070e8b2e 726
23a7ee81
KW
727#endif
728
070e8b2e
KW
729 } while (s + PERL_WORDSIZE <= send);
730 }
731
732 while (s < send) {
2b1f9c71 733 if (((*s) & mask) != span_byte) {
070e8b2e
KW
734 return s;
735 }
736 s++;
737 }
738
739 return s;
740}
741
a687059c 742/*
e50aee73 743 * pregexec and friends
a687059c
LW
744 */
745
76234dfb 746#ifndef PERL_IN_XSUB_RE
a687059c 747/*
c277df42 748 - pregexec - match a regexp against a string
a687059c 749 */
c277df42 750I32
5aaab254 751Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 752 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
753/* stringarg: the point in the string at which to begin matching */
754/* strend: pointer to null at end of string */
755/* strbeg: real beginning of string */
756/* minend: end of match must be >= minend bytes after stringarg. */
757/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
758 * itself is accessed via the pointers above */
759/* nosave: For optimizations. */
c277df42 760{
7918f24d
NC
761 PERL_ARGS_ASSERT_PREGEXEC;
762
c277df42 763 return
9041c2e3 764 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
765 nosave ? 0 : REXEC_COPY_STR);
766}
76234dfb 767#endif
22e551b9 768
cad2e5aa 769
6eb5f6b9 770
1a4edc3c
DM
771/* re_intuit_start():
772 *
773 * Based on some optimiser hints, try to find the earliest position in the
774 * string where the regex could match.
775 *
776 * rx: the regex to match against
777 * sv: the SV being matched: only used for utf8 flag; the string
778 * itself is accessed via the pointers below. Note that on
779 * something like an overloaded SV, SvPOK(sv) may be false
780 * and the string pointers may point to something unrelated to
781 * the SV itself.
782 * strbeg: real beginning of string
783 * strpos: the point in the string at which to begin matching
784 * strend: pointer to the byte following the last char of the string
785 * flags currently unused; set to 0
786 * data: currently unused; set to NULL
787 *
788 * The basic idea of re_intuit_start() is to use some known information
789 * about the pattern, namely:
790 *
791 * a) the longest known anchored substring (i.e. one that's at a
792 * constant offset from the beginning of the pattern; but not
793 * necessarily at a fixed offset from the beginning of the
794 * string);
795 * b) the longest floating substring (i.e. one that's not at a constant
796 * offset from the beginning of the pattern);
797 * c) Whether the pattern is anchored to the string; either
798 * an absolute anchor: /^../, or anchored to \n: /^.../m,
799 * or anchored to pos(): /\G/;
800 * d) A start class: a real or synthetic character class which
801 * represents which characters are legal at the start of the pattern;
802 *
803 * to either quickly reject the match, or to find the earliest position
804 * within the string at which the pattern might match, thus avoiding
805 * running the full NFA engine at those earlier locations, only to
806 * eventually fail and retry further along.
807 *
808 * Returns NULL if the pattern can't match, or returns the address within
809 * the string which is the earliest place the match could occur.
810 *
811 * The longest of the anchored and floating substrings is called 'check'
812 * and is checked first. The other is called 'other' and is checked
813 * second. The 'other' substring may not be present. For example,
814 *
815 * /(abc|xyz)ABC\d{0,3}DEFG/
816 *
817 * will have
818 *
819 * check substr (float) = "DEFG", offset 6..9 chars
820 * other substr (anchored) = "ABC", offset 3..3 chars
821 * stclass = [ax]
822 *
823 * Be aware that during the course of this function, sometimes 'anchored'
824 * refers to a substring being anchored relative to the start of the
825 * pattern, and sometimes to the pattern itself being anchored relative to
826 * the string. For example:
827 *
828 * /\dabc/: "abc" is anchored to the pattern;
829 * /^\dabc/: "abc" is anchored to the pattern and the string;
830 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
831 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
832 * but the pattern is anchored to the string.
52a21eb3
DM
833 */
834
cad2e5aa 835char *
52a21eb3
DM
836Perl_re_intuit_start(pTHX_
837 REGEXP * const rx,
838 SV *sv,
839 const char * const strbeg,
840 char *strpos,
841 char *strend,
842 const U32 flags,
843 re_scream_pos_data *data)
cad2e5aa 844{
8d919b0a 845 struct regexp *const prog = ReANY(rx);
b2ad2123
DM
846 SSize_t start_shift = prog->check_offset_min;
847 /* Should be nonnegative! */
848 SSize_t end_shift = 0;
0fc004dd
DM
849 /* current lowest pos in string where the regex can start matching */
850 char *rx_origin = strpos;
eb578fdb 851 SV *check;
f2ed9b32 852 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 853 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 854 bool ml_anch = 0;
8f4bf5fc 855 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 856 char *check_at = NULL; /* check substr found at this pos */
bbe252da 857 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 858 RXi_GET_DECL(prog,progi);
02d5137b
DM
859 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
860 regmatch_info *const reginfo = &reginfo_buf;
271b36b1 861 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a3621e74 862
7918f24d 863 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
864 PERL_UNUSED_ARG(flags);
865 PERL_UNUSED_ARG(data);
7918f24d 866
6ad9a8ab 867 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0
DM
868 "Intuit: trying to determine minimum start position...\n"));
869
fb9bbddb 870 /* for now, assume that all substr offsets are positive. If at some point
f67a5002 871 * in the future someone wants to do clever things with lookbehind and
fb9bbddb
DM
872 * -ve offsets, they'll need to fix up any code in this function
873 * which uses these offsets. See the thread beginning
874 * <20140113145929.GF27210@iabyn.com>
875 */
876 assert(prog->substrs->data[0].min_offset >= 0);
877 assert(prog->substrs->data[0].max_offset >= 0);
878 assert(prog->substrs->data[1].min_offset >= 0);
879 assert(prog->substrs->data[1].max_offset >= 0);
880 assert(prog->substrs->data[2].min_offset >= 0);
881 assert(prog->substrs->data[2].max_offset >= 0);
882
f7022b5a 883 /* for now, assume that if both present, that the floating substring
83f2232d 884 * doesn't start before the anchored substring.
f7022b5a
DM
885 * If you break this assumption (e.g. doing better optimisations
886 * with lookahead/behind), then you'll need to audit the code in this
887 * function carefully first
888 */
889 assert(
890 ! ( (prog->anchored_utf8 || prog->anchored_substr)
891 && (prog->float_utf8 || prog->float_substr))
892 || (prog->float_min_offset >= prog->anchored_offset));
893
1a4edc3c
DM
894 /* byte rather than char calculation for efficiency. It fails
895 * to quickly reject some cases that can't match, but will reject
896 * them later after doing full char arithmetic */
c344f387 897 if (prog->minlen > strend - strpos) {
6ad9a8ab 898 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 899 " String too short...\n"));
cad2e5aa 900 goto fail;
2c2d71f5 901 }
d8da0584 902
196a02af 903 RXp_MATCH_UTF8_set(prog, utf8_target);
6c3fea77 904 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 905 reginfo->info_aux = NULL;
9d9163fb 906 reginfo->strbeg = strbeg;
220db18a 907 reginfo->strend = strend;
aed7b151 908 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 909 reginfo->intuit = 1;
1cb48e53
DM
910 /* not actually used within intuit, but zero for safety anyway */
911 reginfo->poscache_maxiter = 0;
02d5137b 912
f2ed9b32 913 if (utf8_target) {
2814f4b3
HS
914 if ((!prog->anchored_utf8 && prog->anchored_substr)
915 || (!prog->float_utf8 && prog->float_substr))
33b8afdf
JH
916 to_utf8_substr(prog);
917 check = prog->check_utf8;
918 } else {
7e0d5ad7
KW
919 if (!prog->check_substr && prog->check_utf8) {
920 if (! to_byte_substr(prog)) {
6b54ddc5 921 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
922 }
923 }
33b8afdf
JH
924 check = prog->check_substr;
925 }
274cd312 926
1dc475d0
DM
927 /* dump the various substring data */
928 DEBUG_OPTIMISE_MORE_r({
929 int i;
930 for (i=0; i<=2; i++) {
931 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
932 : prog->substrs->data[i].substr);
933 if (!sv)
934 continue;
935
6ad9a8ab 936 Perl_re_printf( aTHX_
147e3846
KW
937 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
938 " useful=%" IVdf " utf8=%d [%s]\n",
1dc475d0
DM
939 i,
940 (IV)prog->substrs->data[i].min_offset,
941 (IV)prog->substrs->data[i].max_offset,
942 (IV)prog->substrs->data[i].end_shift,
943 BmUSEFUL(sv),
944 utf8_target ? 1 : 0,
945 SvPEEK(sv));
946 }
947 });
948
8e1490ee 949 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
950
951 /* ml_anch: check after \n?
952 *
0fa70a06 953 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
954 * with /.*.../, these flags will have been added by the
955 * compiler:
956 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
957 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
958 */
7d2d37f5
DM
959 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
960 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 961
343c8a29 962 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
963 /* we are only allowed to match at BOS or \G */
964
57fcbfa7 965 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
966 *
967 * Note that we don't try to do a similar quick reject for
968 * \G, since generally the caller will have calculated strpos
969 * based on pos() and gofs, so the string is already correctly
970 * anchored by definition; and handling the exceptions would
971 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 972 */
7bb3b9eb 973 if ( strpos != strbeg
d3d47aac 974 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8 975 {
6ad9a8ab 976 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 977 " Not at start...\n"));
c889ccc8
DM
978 goto fail;
979 }
980
a5d12a4b
DM
981 /* in the presence of an anchor, the anchored (relative to the
982 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
983 * to strpos. So quickly reject if substr isn't found there.
984 * This works for \G too, because the caller will already have
985 * subtracted gofs from pos, and gofs is the offset from the
986 * \G to the start of the regex. For example, in /.abc\Gdef/,
987 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
988 * caller will have set strpos=pos()-4; we look for the substr
989 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 990
33c28ab2 991 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 992 /* Substring at constant offset from beg-of-str... */
b2ad2123 993 SSize_t slen = SvCUR(check);
343c8a29 994 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 995
6ad9a8ab 996 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 997 " Looking for check substr at fixed offset %" IVdf "...\n",
1dc475d0
DM
998 (IV)prog->check_offset_min));
999
7742aa66
DM
1000 if (SvTAIL(check)) {
1001 /* In this case, the regex is anchored at the end too.
1002 * Unless it's a multiline match, the lengths must match
b2ad2123 1003 * exactly, give or take a \n. NB: slen >= 1 since
7742aa66
DM
1004 * the last char of check is \n */
1005 if (!multiline
b2ad2123
DM
1006 && ( strend - s > slen
1007 || strend - s < slen - 1
1008 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8 1009 {
6ad9a8ab 1010 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1011 " String too long...\n"));
c889ccc8
DM
1012 goto fail_finish;
1013 }
b2ad2123
DM
1014 /* Now should match s[0..slen-2] */
1015 slen--;
c889ccc8 1016 }
b2ad2123 1017 if (slen && (strend - s < slen
26fb2318 1018 || *SvPVX_const(check) != *s
b2ad2123 1019 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
d307bf57 1020 {
6ad9a8ab 1021 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1022 " String not equal...\n"));
d307bf57
DM
1023 goto fail_finish;
1024 }
c889ccc8
DM
1025
1026 check_at = s;
1027 goto success_at_start;
cad2e5aa 1028 }
cad2e5aa 1029 }
cad2e5aa 1030 }
0fc004dd 1031
b2ad2123 1032 end_shift = prog->check_end_shift;
cad2e5aa 1033
19188028 1034#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
b2ad2123 1035 if (end_shift < 0)
147e3846 1036 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
b2ad2123 1037 (IV)end_shift, RX_PRECOMP(rx));
2c2d71f5
JH
1038#endif
1039
2c2d71f5 1040 restart:
1de06328 1041
66b7ec5c
DM
1042 /* This is the (re)entry point of the main loop in this function.
1043 * The goal of this loop is to:
1044 * 1) find the "check" substring in the region rx_origin..strend
b2ad2123 1045 * (adjusted by start_shift / end_shift). If not found, reject
66b7ec5c
DM
1046 * immediately.
1047 * 2) If it exists, look for the "other" substr too if defined; for
1048 * example, if the check substr maps to the anchored substr, then
1049 * check the floating substr, and vice-versa. If not found, go
1050 * back to (1) with rx_origin suitably incremented.
1051 * 3) If we find an rx_origin position that doesn't contradict
1052 * either of the substrings, then check the possible additional
1053 * constraints on rx_origin of /^.../m or a known start class.
1054 * If these fail, then depending on which constraints fail, jump
1055 * back to here, or to various other re-entry points further along
1056 * that skip some of the first steps.
1057 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1058 * substring. If the start position was determined to be at the
1059 * beginning of the string - so, not rejected, but not optimised,
1060 * since we have to run regmatch from position 0 - decrement the
1061 * BmUSEFUL() count. Otherwise increment it.
1062 */
1063
1a4edc3c
DM
1064
1065 /* first, look for the 'check' substring */
1066
1de06328 1067 {
c33e64f0
FC
1068 U8* start_point;
1069 U8* end_point;
c889ccc8 1070
c889ccc8 1071 DEBUG_OPTIMISE_MORE_r({
6ad9a8ab 1072 Perl_re_printf( aTHX_
147e3846
KW
1073 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1074 " Start shift: %" IVdf " End shift %" IVdf
1075 " Real end Shift: %" IVdf "\n",
675e93ee 1076 (IV)(rx_origin - strbeg),
c889ccc8 1077 (IV)prog->check_offset_min,
b2ad2123
DM
1078 (IV)start_shift,
1079 (IV)end_shift,
c889ccc8
DM
1080 (IV)prog->check_end_shift);
1081 });
1de06328 1082
b2ad2123 1083 end_point = HOPBACK3(strend, end_shift, rx_origin);
bb152a4b
DM
1084 if (!end_point)
1085 goto fail_finish;
b2ad2123 1086 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
33c28ab2
DM
1087 if (!start_point)
1088 goto fail_finish;
c889ccc8 1089
557f47af 1090
e0362b86 1091 /* If the regex is absolutely anchored to either the start of the
d3d47aac 1092 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
1093 * check_offset_max represents an upper bound on the string where
1094 * the substr could start. For the ANCH_GPOS case, we assume that
1095 * the caller of intuit will have already set strpos to
1096 * pos()-gofs, so in this case strpos + offset_max will still be
1097 * an upper bound on the substr.
1098 */
c19c836a
DM
1099 if (!ml_anch
1100 && prog->intflags & PREGf_ANCH
e0362b86 1101 && prog->check_offset_max != SSize_t_MAX)
c19c836a 1102 {
b2ad2123 1103 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
1104 const char * const anchor =
1105 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
b2ad2123 1106 SSize_t targ_len = (char*)end_point - anchor;
2ce94a86 1107
b2ad2123 1108 if (check_len > targ_len) {
2ce94a86 1109 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
12453e29 1110 "Target string too short to match required substring...\n"));
2ce94a86
DM
1111 goto fail_finish;
1112 }
e0362b86
DM
1113
1114 /* do a bytes rather than chars comparison. It's conservative;
1115 * so it skips doing the HOP if the result can't possibly end
1116 * up earlier than the old value of end_point.
1117 */
b2ad2123
DM
1118 assert(anchor + check_len <= (char *)end_point);
1119 if (prog->check_offset_max + check_len < targ_len) {
e0362b86
DM
1120 end_point = HOP3lim((U8*)anchor,
1121 prog->check_offset_max,
b2ad2123 1122 end_point - check_len
2ce94a86 1123 )
b2ad2123 1124 + check_len;
12453e29
YO
1125 if (end_point < start_point)
1126 goto fail_finish;
e0362b86 1127 }
d6ef1678
DM
1128 }
1129
ae5d4331 1130 check_at = fbm_instr( start_point, end_point,
7fba1cd6 1131 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 1132
6ad9a8ab 1133 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1134 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
675e93ee
DM
1135 (IV)((char*)start_point - strbeg),
1136 (IV)((char*)end_point - strbeg),
1137 (IV)(check_at ? check_at - strbeg : -1)
1138 ));
1139
8fd34720
DM
1140 /* Update the count-of-usability, remove useless subpatterns,
1141 unshift s. */
1142
1143 DEBUG_EXECUTE_r({
1144 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1145 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
6ad9a8ab 1146 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
8fd34720
DM
1147 (check_at ? "Found" : "Did not find"),
1148 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1149 ? "anchored" : "floating"),
1150 quoted,
1151 RE_SV_TAIL(check),
1152 (check_at ? " at offset " : "...\n") );
1153 });
2c2d71f5 1154
8fd34720
DM
1155 if (!check_at)
1156 goto fail_finish;
8fd34720
DM
1157 /* set rx_origin to the minimum position where the regex could start
1158 * matching, given the constraint of the just-matched check substring.
1159 * But don't set it lower than previously.
1160 */
fdc003fd 1161
8fd34720
DM
1162 if (check_at - rx_origin > prog->check_offset_max)
1163 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee 1164 /* Finish the diagnostic message */
6ad9a8ab 1165 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1166 "%ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1167 (long)(check_at - strbeg),
1168 (IV)(rx_origin - strbeg)
1169 ));
8fd34720 1170 }
fdc003fd
DM
1171
1172
1a4edc3c 1173 /* now look for the 'other' substring if defined */
2c2d71f5 1174
fd8def15
HS
1175 if (prog->substrs->data[other_ix].utf8_substr
1176 || prog->substrs->data[other_ix].substr)
1de06328 1177 {
30944b6d 1178 /* Take into account the "other" substring. */
6c3343a6
DM
1179 char *last, *last1;
1180 char *s;
1181 SV* must;
1182 struct reg_substr_datum *other;
1183
1184 do_other_substr:
1185 other = &prog->substrs->data[other_ix];
fd8def15
HS
1186 if (!utf8_target && !other->substr) {
1187 if (!to_byte_substr(prog)) {
1188 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1189 }
1190 }
6c3343a6
DM
1191
1192 /* if "other" is anchored:
1193 * we've previously found a floating substr starting at check_at.
1194 * This means that the regex origin must lie somewhere
1195 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1196 * and max: HOP3(check_at, -check_offset_min)
1197 * (except that min will be >= strpos)
1198 * So the fixed substr must lie somewhere between
1199 * HOP3(min, anchored_offset)
1200 * HOP3(max, anchored_offset) + SvCUR(substr)
1201 */
1202
1203 /* if "other" is floating
1204 * Calculate last1, the absolute latest point where the
1205 * floating substr could start in the string, ignoring any
1206 * constraints from the earlier fixed match. It is calculated
1207 * as follows:
1208 *
1209 * strend - prog->minlen (in chars) is the absolute latest
1210 * position within the string where the origin of the regex
1211 * could appear. The latest start point for the floating
1212 * substr is float_min_offset(*) on from the start of the
1213 * regex. last1 simply combines thee two offsets.
1214 *
1215 * (*) You might think the latest start point should be
1216 * float_max_offset from the regex origin, and technically
1217 * you'd be correct. However, consider
1218 * /a\d{2,4}bcd\w/
1219 * Here, float min, max are 3,5 and minlen is 7.
1220 * This can match either
1221 * /a\d\dbcd\w/
1222 * /a\d\d\dbcd\w/
1223 * /a\d\d\d\dbcd\w/
1224 * In the first case, the regex matches minlen chars; in the
1225 * second, minlen+1, in the third, minlen+2.
1226 * In the first case, the floating offset is 3 (which equals
1227 * float_min), in the second, 4, and in the third, 5 (which
1228 * equals float_max). In all cases, the floating string bcd
1229 * can never start more than 4 chars from the end of the
1230 * string, which equals minlen - float_min. As the substring
1231 * starts to match more than float_min from the start of the
1232 * regex, it makes the regex match more than minlen chars,
1233 * and the two cancel each other out. So we can always use
1234 * float_min - minlen, rather than float_max - minlen for the
1235 * latest position in the string.
1236 *
1237 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1238 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1239 */
1240
e7a14a9c 1241 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1242 last1 = HOP3c(strend,
1243 other->min_offset - prog->minlen, strbeg);
1244
4d006249 1245 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1246 /* last is the latest point where the floating substr could
1247 * start, *given* any constraints from the earlier fixed
1248 * match. This constraint is that the floating string starts
1249 * <= float_max_offset chars from the regex origin (rx_origin).
1250 * If this value is less than last1, use it instead.
eb3831ce 1251 */
6c3343a6
DM
1252 assert(rx_origin <= last1);
1253 last =
1254 /* this condition handles the offset==infinity case, and
1255 * is a short-cut otherwise. Although it's comparing a
1256 * byte offset to a char length, it does so in a safe way,
1257 * since 1 char always occupies 1 or more bytes,
1258 * so if a string range is (last1 - rx_origin) bytes,
1259 * it will be less than or equal to (last1 - rx_origin)
1260 * chars; meaning it errs towards doing the accurate HOP3
1261 * rather than just using last1 as a short-cut */
1262 (last1 - rx_origin) < other->max_offset
1263 ? last1
1264 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1265 }
1266 else {
b2ad2123
DM
1267 assert(strpos + start_shift <= check_at);
1268 last = HOP4c(check_at, other->min_offset - start_shift,
6c3343a6
DM
1269 strbeg, strend);
1270 }
ead917d0 1271
6c3343a6
DM
1272 s = HOP3c(rx_origin, other->min_offset, strend);
1273 if (s < other_last) /* These positions already checked */
1274 s = other_last;
1275
1276 must = utf8_target ? other->utf8_substr : other->substr;
1277 assert(SvPOK(must));
675e93ee
DM
1278 {
1279 char *from = s;
1280 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1281
71a9d105
DM
1282 if (to > strend)
1283 to = strend;
88203927
DM
1284 if (from > to) {
1285 s = NULL;
6ad9a8ab 1286 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1287 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
88203927
DM
1288 (IV)(from - strbeg),
1289 (IV)(to - strbeg)
1290 ));
1291 }
1292 else {
1293 s = fbm_instr(
1294 (unsigned char*)from,
1295 (unsigned char*)to,
1296 must,
1297 multiline ? FBMrf_MULTILINE : 0
1298 );
6ad9a8ab 1299 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1300 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
88203927
DM
1301 (IV)(from - strbeg),
1302 (IV)(to - strbeg),
1303 (IV)(s ? s - strbeg : -1)
1304 ));
1305 }
675e93ee
DM
1306 }
1307
6c3343a6
DM
1308 DEBUG_EXECUTE_r({
1309 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1310 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 1311 Perl_re_printf( aTHX_ " %s %s substr %s%s",
6c3343a6
DM
1312 s ? "Found" : "Contradicts",
1313 other_ix ? "floating" : "anchored",
1314 quoted, RE_SV_TAIL(must));
1315 });
ead917d0 1316
ead917d0 1317
6c3343a6
DM
1318 if (!s) {
1319 /* last1 is latest possible substr location. If we didn't
1320 * find it before there, we never will */
1321 if (last >= last1) {
6ad9a8ab 1322 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee 1323 "; giving up...\n"));
6c3343a6 1324 goto fail_finish;
ead917d0
DM
1325 }
1326
6c3343a6
DM
1327 /* try to find the check substr again at a later
1328 * position. Maybe next time we'll find the "other" substr
1329 * in range too */
6c3343a6
DM
1330 other_last = HOP3c(last, 1, strend) /* highest failure */;
1331 rx_origin =
4d006249 1332 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1333 ? HOP3c(rx_origin, 1, strend)
1334 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
6ad9a8ab 1335 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1336 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1337 (other_ix ? "floating" : "anchored"),
1338 (long)(HOP3c(check_at, 1, strend) - strbeg),
1339 (IV)(rx_origin - strbeg)
1340 ));
6c3343a6
DM
1341 goto restart;
1342 }
1343 else {
4d006249 1344 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1345 /* other_last is set to s, not s+1, since its possible for
1346 * a floating substr to fail first time, then succeed
1347 * second time at the same floating position; e.g.:
1348 * "-AB--AABZ" =~ /\wAB\d*Z/
1349 * The first time round, anchored and float match at
1350 * "-(AB)--AAB(Z)" then fail on the initial \w character
1351 * class. Second time round, they match at "-AB--A(AB)(Z)".
1352 */
1353 other_last = s;
ead917d0
DM
1354 }
1355 else {
6c3343a6
DM
1356 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1357 other_last = HOP3c(s, 1, strend);
ead917d0 1358 }
6ad9a8ab 1359 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1360 " at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1361 (long)(s - strbeg),
1362 (IV)(rx_origin - strbeg)
1363 ));
1364
6c3343a6 1365 }
cad2e5aa 1366 }
acba93e8
DM
1367 else {
1368 DEBUG_OPTIMISE_MORE_r(
6ad9a8ab 1369 Perl_re_printf( aTHX_
147e3846
KW
1370 " Check-only match: offset min:%" IVdf " max:%" IVdf
1371 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1372 " strend:%" IVdf "\n",
acba93e8
DM
1373 (IV)prog->check_offset_min,
1374 (IV)prog->check_offset_max,
675e93ee
DM
1375 (IV)(check_at-strbeg),
1376 (IV)(rx_origin-strbeg),
1c1c599d 1377 (IV)(rx_origin-check_at),
675e93ee 1378 (IV)(strend-strbeg)
acba93e8
DM
1379 )
1380 );
1381 }
2c2d71f5 1382
acba93e8 1383 postprocess_substr_matches:
0991020e 1384
1a4edc3c 1385 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1386
7d2d37f5 1387 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1388 char *s;
1389
6ad9a8ab 1390 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a62659bd 1391 " looking for /^/m anchor"));
d0880ea7
DM
1392
1393 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1394 * Find the next \n, if any, even if it's beyond the current
1395 * anchored and/or floating substrings. Whether we should be
1396 * scanning ahead for the next \n or the next substr is debatable.
1397 * On the one hand you'd expect rare substrings to appear less
1398 * often than \n's. On the other hand, searching for \n means
675e93ee 1399 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1400 * iteration as the current "rarest" string candidate, which
1401 * means for example that we'll quickly reject the whole string if
1402 * hasn't got a \n, rather than trying every substr position
1403 * first
1404 */
d0880ea7 1405
4620cb61
DM
1406 s = HOP3c(strend, - prog->minlen, strpos);
1407 if (s <= rx_origin ||
1408 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1409 {
6ad9a8ab 1410 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d0880ea7
DM
1411 " Did not find /%s^%s/m...\n",
1412 PL_colors[0], PL_colors[1]));
a62659bd
DM
1413 goto fail_finish;
1414 }
d0880ea7 1415
4ada1233
DM
1416 /* earliest possible origin is 1 char after the \n.
1417 * (since *rx_origin == '\n', it's safe to ++ here rather than
1418 * HOP(rx_origin, 1)) */
1419 rx_origin++;
d0880ea7 1420
f4f115de 1421 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1422 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1423 {
d0880ea7
DM
1424 /* Position contradicts check-string; either because
1425 * check was anchored (and thus has no wiggle room),
4ada1233 1426 * or check was float and rx_origin is above the float range */
6ad9a8ab 1427 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1428 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1429 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1430 goto restart;
1431 }
1432
1433 /* if we get here, the check substr must have been float,
2e759faa 1434 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1435 * "other" substr which still contradicts */
1436 assert(prog->substrs->check_ix); /* check is float */
1437
1438 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1439 /* whoops, the anchored "other" substr exists, so we still
1440 * contradict. On the other hand, the float "check" substr
1441 * didn't contradict, so just retry the anchored "other"
1442 * substr */
6ad9a8ab 1443 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1444 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
d0880ea7 1445 PL_colors[0], PL_colors[1],
73e8ff00
DM
1446 (IV)(rx_origin - strbeg + prog->anchored_offset),
1447 (IV)(rx_origin - strbeg)
675e93ee 1448 ));
d0880ea7
DM
1449 goto do_other_substr;
1450 }
1451
1452 /* success: we don't contradict the found floating substring
1453 * (and there's no anchored substr). */
6ad9a8ab 1454 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1455 " Found /%s^%s/m with rx_origin %ld...\n",
1456 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1457 }
1458 else {
6ad9a8ab 1459 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
fe4f3442 1460 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1461 }
1462
ffad1e6a 1463 success_at_start:
e3c6feb0 1464
cad2e5aa 1465
dd170ff5
DM
1466 /* if we have a starting character class, then test that extra constraint.
1467 * (trie stclasses are too expensive to use here, we are better off to
1468 * leave it to regmatch itself) */
1469
f8fc2ecf 1470 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1471 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1472
b2ad2123 1473 /* XXX this value could be pre-computed */
7d769928 1474 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362 1475 ? (reginfo->is_utf8_pat
7d769928
DM
1476 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1477 : (SSize_t)STR_LEN(progi->regstclass))
66e933ab 1478 : 1);
1de06328 1479 char * endpos;
fa3bb21d 1480 char *s;
000dfd2d
DM
1481 /* latest pos that a matching float substr constrains rx start to */
1482 char *rx_max_float = NULL;
1483
c75a3985
DM
1484 /* if the current rx_origin is anchored, either by satisfying an
1485 * anchored substring constraint, or a /^.../m constraint, then we
1486 * can reject the current origin if the start class isn't found
1487 * at the current position. If we have a float-only match, then
1488 * rx_origin is constrained to a range; so look for the start class
1489 * in that range. if neither, then look for the start class in the
1490 * whole rest of the string */
1491
dd170ff5
DM
1492 /* XXX DAPM it's not clear what the minlen test is for, and why
1493 * it's not used in the floating case. Nothing in the test suite
1494 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1495 * Here are some old comments, which may or may not be correct:
1496 *
1497 * minlen == 0 is possible if regstclass is \b or \B,
1498 * and the fixed substr is ''$.
1499 * Since minlen is already taken into account, rx_origin+1 is
1500 * before strend; accidentally, minlen >= 1 guaranties no false
1501 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1502 * 0) below assumes that regstclass does not come from lookahead...
1503 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1504 * This leaves EXACTF-ish only, which are dealt with in
1505 * find_byclass().
1506 */
1507
7d2d37f5 1508 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
b2ad2123 1509 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d 1510 else if (prog->float_substr || prog->float_utf8) {
b2ad2123
DM
1511 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1512 endpos = HOP3clim(rx_max_float, cl_l, strend);
000dfd2d 1513 }
1de06328
YO
1514 else
1515 endpos= strend;
1516
6ad9a8ab 1517 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
b2ad2123 1518 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
147e3846 1519 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
b2ad2123 1520 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1521 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1522
c43b5520 1523 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1524 reginfo);
be778b1a 1525 if (!s) {
6eb5f6b9 1526 if (endpos == strend) {
6ad9a8ab 1527 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1528 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1529 goto fail;
1530 }
6ad9a8ab 1531 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1532 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1533 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1534 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1535 goto fail;
9fed8d02 1536
6eb5f6b9 1537 /* Contradict one of substrings */
97136c8a
DM
1538 if (prog->anchored_substr || prog->anchored_utf8) {
1539 if (prog->substrs->check_ix == 1) { /* check is float */
1540 /* Have both, check_string is floating */
b2ad2123
DM
1541 assert(rx_origin + start_shift <= check_at);
1542 if (rx_origin + start_shift != check_at) {
97136c8a 1543 /* not at latest position float substr could match:
c75a3985
DM
1544 * Recheck anchored substring, but not floating.
1545 * The condition above is in bytes rather than
1546 * chars for efficiency. It's conservative, in
1547 * that it errs on the side of doing 'goto
88203927
DM
1548 * do_other_substr'. In this case, at worst,
1549 * an extra anchored search may get done, but in
1550 * practice the extra fbm_instr() is likely to
1551 * get skipped anyway. */
6ad9a8ab 1552 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1553 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1554 (long)(other_last - strbeg),
1555 (IV)(rx_origin - strbeg)
1556 ));
97136c8a 1557 goto do_other_substr;
3369914b 1558 }
3369914b
DM
1559 }
1560 }
97136c8a 1561 else {
9fed8d02
DM
1562 /* float-only */
1563
7d2d37f5 1564 if (ml_anch) {
c75a3985
DM
1565 /* In the presence of ml_anch, we might be able to
1566 * find another \n without breaking the current float
1567 * constraint. */
1568
1569 /* strictly speaking this should be HOP3c(..., 1, ...),
1570 * but since we goto a block of code that's going to
1571 * search for the next \n if any, its safe here */
9fed8d02 1572 rx_origin++;
6ad9a8ab 1573 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
675e93ee 1574 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1575 PL_colors[0], PL_colors[1],
675e93ee 1576 (long)(rx_origin - strbeg)) );
9fed8d02 1577 goto postprocess_substr_matches;
ab60c45a 1578 }
c75a3985
DM
1579
1580 /* strictly speaking this can never be true; but might
1581 * be if we ever allow intuit without substrings */
1582 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1583 goto fail;
c75a3985 1584
000dfd2d 1585 rx_origin = rx_max_float;
9fed8d02
DM
1586 }
1587
c75a3985
DM
1588 /* at this point, any matching substrings have been
1589 * contradicted. Start again... */
1590
9fed8d02 1591 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1592
1593 /* uses bytes rather than char calculations for efficiency.
1594 * It's conservative: it errs on the side of doing 'goto restart',
1595 * where there is code that does a proper char-based test */
b2ad2123 1596 if (rx_origin + start_shift + end_shift > strend) {
6ad9a8ab 1597 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
9fed8d02
DM
1598 " Could not match STCLASS...\n") );
1599 goto fail;
1600 }
6ad9a8ab 1601 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1602 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
9fed8d02 1603 (prog->substrs->check_ix ? "floating" : "anchored"),
b2ad2123 1604 (long)(rx_origin + start_shift - strbeg),
675e93ee
DM
1605 (IV)(rx_origin - strbeg)
1606 ));
9fed8d02 1607 goto restart;
6eb5f6b9 1608 }
9fed8d02 1609
c75a3985
DM
1610 /* Success !!! */
1611
5f9c6575 1612 if (rx_origin != s) {
6ad9a8ab 1613 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1614 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1615 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1616 );
1617 }
1618 else {
6ad9a8ab 1619 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1620 " Does not contradict STCLASS...\n");
b7953727
JH
1621 );
1622 }
6eb5f6b9 1623 }
ffad1e6a
DM
1624
1625 /* Decide whether using the substrings helped */
1626
1627 if (rx_origin != strpos) {
1628 /* Fixed substring is found far enough so that the match
1629 cannot start at strpos. */
1630
6ad9a8ab 1631 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
ffad1e6a
DM
1632 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1633 }
1634 else {
70563e16
DM
1635 /* The found rx_origin position does not prohibit matching at
1636 * strpos, so calling intuit didn't gain us anything. Decrement
1637 * the BmUSEFUL() count on the check substring, and if we reach
1638 * zero, free it. */
1639 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1640 && (utf8_target ? (
1641 prog->check_utf8 /* Could be deleted already */
1642 && --BmUSEFUL(prog->check_utf8) < 0
1643 && (prog->check_utf8 == prog->float_utf8)
1644 ) : (
1645 prog->check_substr /* Could be deleted already */
1646 && --BmUSEFUL(prog->check_substr) < 0
1647 && (prog->check_substr == prog->float_substr)
1648 )))
1649 {
1650 /* If flags & SOMETHING - do not do it many times on the same match */
6ad9a8ab 1651 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
ffad1e6a
DM
1652 /* XXX Does the destruction order has to change with utf8_target? */
1653 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1654 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1655 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1656 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1657 check = NULL; /* abort */
ffad1e6a
DM
1658 /* XXXX This is a remnant of the old implementation. It
1659 looks wasteful, since now INTUIT can use many
1660 other heuristics. */
1661 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1662 }
1663 }
1664
6ad9a8ab 1665 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
ffad1e6a 1666 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1667 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1668
c765d6e0 1669 return rx_origin;
2c2d71f5
JH
1670
1671 fail_finish: /* Substring not found */
33b8afdf 1672 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1673 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1674 fail:
6ad9a8ab 1675 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
e4584336 1676 PL_colors[4], PL_colors[5]));
bd61b366 1677 return NULL;
cad2e5aa 1678}
9661b544 1679
70563e16 1680
a0a388a1 1681#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1682 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78 1683 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
3ed3004a 1684 trie_utf8l, trie_flu8, trie_flu8_latin } \
e7fd4aa1
KW
1685 trie_type = ((scan->flags == EXACT) \
1686 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1687 : (scan->flags == EXACTL) \
1688 ? (utf8_target ? trie_utf8l : trie_plain) \
89829bb5 1689 : (scan->flags == EXACTFAA) \
a4525e78
KW
1690 ? (utf8_target \
1691 ? trie_utf8_exactfa_fold \
1692 : trie_latin_utf8_exactfa_fold) \
1693 : (scan->flags == EXACTFLU8 \
3ed3004a
YO
1694 ? (utf8_target \
1695 ? trie_flu8 \
1696 : trie_flu8_latin) \
a4525e78
KW
1697 : (utf8_target \
1698 ? trie_utf8_fold \
3ed3004a 1699 : trie_latin_utf8_fold)))
fab2782b 1700
9ad8cac4
KW
1701/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1702 * 'foldbuf+sizeof(foldbuf)' */
1703#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1704STMT_START { \
fab2782b 1705 STRLEN skiplen; \
baa60164 1706 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1707 switch (trie_type) { \
a4525e78 1708 case trie_flu8: \
780fcc9f 1709 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3ed3004a 1710 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
e1a2878a 1711 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
613abc6d 1712 } \
a4525e78 1713 goto do_trie_utf8_fold; \
31f05a37 1714 case trie_utf8_exactfa_fold: \
baa60164 1715 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1716 /* FALLTHROUGH */ \
fab2782b 1717 case trie_utf8_fold: \
a4525e78 1718 do_trie_utf8_fold: \
fab2782b 1719 if ( foldlen>0 ) { \
9ad8cac4 1720 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
fab2782b
YO
1721 foldlen -= len; \
1722 uscan += len; \
1723 len=0; \
1724 } else { \
9ad8cac4 1725 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
a1a5ec35 1726 flags); \
695717ad 1727 len = UTF8_SAFE_SKIP(uc, uc_end); \
5f560d8a 1728 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1729 foldlen -= skiplen; \
1730 uscan = foldbuf + skiplen; \
1731 } \
1732 break; \
3ed3004a
YO
1733 case trie_flu8_latin: \
1734 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1735 goto do_trie_latin_utf8_fold; \
baa60164
KW
1736 case trie_latin_utf8_exactfa_fold: \
1737 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1738 /* FALLTHROUGH */ \
fab2782b 1739 case trie_latin_utf8_fold: \
3ed3004a 1740 do_trie_latin_utf8_fold: \
fab2782b 1741 if ( foldlen>0 ) { \
9ad8cac4 1742 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
fab2782b
YO
1743 foldlen -= len; \
1744 uscan += len; \
1745 len=0; \
1746 } else { \
1747 len = 1; \
31f05a37 1748 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1749 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1750 foldlen -= skiplen; \
1751 uscan = foldbuf + skiplen; \
1752 } \
1753 break; \
a4525e78 1754 case trie_utf8l: \
780fcc9f 1755 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d 1756 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
14f657d4 1757 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
613abc6d 1758 } \
780fcc9f 1759 /* FALLTHROUGH */ \
fab2782b 1760 case trie_utf8: \
9ad8cac4 1761 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
fab2782b
YO
1762 break; \
1763 case trie_plain: \
1764 uvc = (UV)*uc; \
1765 len = 1; \
1766 } \
1767 if (uvc < 256) { \
1768 charid = trie->charmap[ uvc ]; \
1769 } \
1770 else { \
1771 charid = 0; \
1772 if (widecharmap) { \
1773 SV** const svpp = hv_fetch(widecharmap, \
1774 (char*)&uvc, sizeof(UV), 0); \
1775 if (svpp) \
1776 charid = (U16)SvIV(*svpp); \
1777 } \
1778 } \
4cadc6a9
YO
1779} STMT_END
1780
cb41e5d6 1781#define DUMP_EXEC_POS(li,s,doutf8,depth) \
ae7c5b9b 1782 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
cb41e5d6 1783 startpos, doutf8, depth)
ae7c5b9b 1784
da10aa09
KW
1785#define REXEC_FBC_SCAN(UTF8, CODE) \
1786 STMT_START { \
1787 while (s < strend) { \
1788 CODE \
8c9c2723
KW
1789 s += ((UTF8) \
1790 ? UTF8_SAFE_SKIP(s, reginfo->strend) \
1791 : 1); \
da10aa09
KW
1792 } \
1793 } STMT_END
4cadc6a9 1794
d990bd30
KW
1795#define REXEC_FBC_CLASS_SCAN(UTF8, COND) \
1796 STMT_START { \
1797 while (s < strend) { \
1798 REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
1799 } \
1800 } STMT_END
4cadc6a9 1801
d990bd30 1802#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
05bd126c 1803 if (COND) { \
19719145 1804 FBC_CHECK_AND_TRY \
8c9c2723 1805 s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
21d1ed54 1806 previous_occurrence_end = s; \
05bd126c 1807 } \
21d1ed54
KW
1808 else { \
1809 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1810 }
4cadc6a9 1811
c84a03c5 1812#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1813 if (utf8_target) { \
da10aa09 1814 REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \
e1d1eefb
YO
1815 } \
1816 else { \
da10aa09 1817 REXEC_FBC_CLASS_SCAN(0, COND); \
d981ef24 1818 }
05bd126c 1819
a9448551
KW
1820/* We keep track of where the next character should start after an occurrence
1821 * of the one we're looking for. Knowing that, we can see right away if the
1822 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1823 * don't accept the 2nd and succeeding adjacent occurrences */
8c9c2723
KW
1824#define FBC_CHECK_AND_TRY \
1825 if ( ( doevery \
1826 || s != previous_occurrence_end) \
1827 && ( reginfo->intuit \
1828 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1829 { \
1830 goto got_it; \
19719145
KW
1831 }
1832
a9448551
KW
1833
1834/* This differs from the above macros in that it calls a function which returns
1835 * the next occurrence of the thing being looked for in 's'; and 'strend' if
1836 * there is no such occurrence. */
1837#define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \
1838 while (s < strend) { \
2b1f9c71 1839 s = (f); \
a9448551
KW
1840 if (s >= strend) { \
1841 break; \
1842 } \
1843 \
1844 FBC_CHECK_AND_TRY \
1845 s += (UTF8) ? UTF8SKIP(s) : 1; \
1846 previous_occurrence_end = s; \
1847 }
1848
ff8b8b42
KW
1849/* This differs from the above macros in that it is passed a single byte that
1850 * is known to begin the next occurrence of the thing being looked for in 's'.
1851 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1852 * at that position. */
1853#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
1854 while (s < strend) { \
1855 s = (char *) memchr(s, byte, strend -s); \
1856 if (s == NULL) { \
1857 s = (char *) strend; \
1858 break; \
1859 } \
1860 \
04532561
KW
1861 if (COND) { \
1862 FBC_CHECK_AND_TRY \
8c9c2723 1863 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
04532561
KW
1864 previous_occurrence_end = s; \
1865 } \
1866 else { \
1867 s += UTF8SKIP(s); \
1868 } \
ff8b8b42
KW
1869 }
1870
05bd126c
KW
1871/* The three macros below are slightly different versions of the same logic.
1872 *
1873 * The first is for /a and /aa when the target string is UTF-8. This can only
1874 * match ascii, but it must advance based on UTF-8. The other two handle the
1875 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1876 * for the boundary (or non-boundary) between a word and non-word character.
1877 * The utf8 and non-utf8 cases have the same logic, but the details must be
1878 * different. Find the "wordness" of the character just prior to this one, and
1879 * compare it with the wordness of this one. If they differ, we have a
1880 * boundary. At the beginning of the string, pretend that the previous
1881 * character was a new-line.
1882 *
1883 * All these macros uncleanly have side-effects with each other and outside
1884 * variables. So far it's been too much trouble to clean-up
1885 *
1886 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1887 * a word character or not.
1888 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1889 * word/non-word
1890 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1891 *
1892 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1893 * are looking for a boundary or for a non-boundary. If we are looking for a
1894 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1895 * see if this tentative match actually works, and if so, to quit the loop
1896 * here. And vice-versa if we are looking for a non-boundary.
1897 *
1898 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
da10aa09 1899 * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
05bd126c
KW
1900 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1901 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1902 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1903 * complement. But in that branch we complement tmp, meaning that at the
1904 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1905 * which means at the top of the loop in the next iteration, it is
1906 * TEST_NON_UTF8(s-1) */
b2f4e957 1907#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1908 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1909 tmp = TEST_NON_UTF8(tmp); \
da10aa09 1910 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
05bd126c
KW
1911 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1912 tmp = !tmp; \
1913 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1914 } \
1915 else { \
1916 IF_FAIL; \
1917 } \
1918 ); \
1919
1920/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1921 * TEST_UTF8 is a macro that for the same input code points returns identically
92c5bde9
KW
1922 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1923 * end pointer as well) */
236d82fd 1924#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1925 if (s == reginfo->strbeg) { \
1926 tmp = '\n'; \
1927 } \
1928 else { /* Back-up to the start of the previous character */ \
1929 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1930 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1931 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1932 } \
1933 tmp = TEST_UV(tmp); \
da10aa09 1934 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
7a207065 1935 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
05bd126c
KW
1936 tmp = !tmp; \
1937 IF_SUCCESS; \
1938 } \
1939 else { \
1940 IF_FAIL; \
1941 } \
1942 );
cfaf538b 1943
05bd126c
KW
1944/* Like the above two macros. UTF8_CODE is the complete code for handling
1945 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1946 * macros below */
baa60164 1947#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1948 if (utf8_target) { \
05bd126c 1949 UTF8_CODE \
63ac0dad
KW
1950 } \
1951 else { /* Not utf8 */ \
9d9163fb 1952 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1953 tmp = TEST_NON_UTF8(tmp); \
da10aa09 1954 REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \
63ac0dad 1955 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1956 IF_SUCCESS; \
760cfa8e 1957 tmp = !tmp; \
63ac0dad
KW
1958 } \
1959 else { \
1960 IF_FAIL; \
1961 } \
1962 ); \
1963 } \
c8519dc7
KW
1964 /* Here, things have been set up by the previous code so that tmp is the \
1965 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1966 * utf8ness of the target). We also have to check if this matches against \
1967 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1968 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1969 * string */ \
1970 if (tmp == ! TEST_NON_UTF8('\n')) { \
1971 IF_SUCCESS; \
1972 } \
1973 else { \
1974 IF_FAIL; \
1975 }
63ac0dad 1976
ae7c5b9b 1977/* This is the macro to use when we want to see if something that looks like it
b92b2705
KW
1978 * could match, actually does, and if so exits the loop. It needs to be used
1979 * only for bounds checking macros, as it allows for matching beyond the end of
1980 * string (which should be zero length without having to look at the string
1981 * contents) */
1982#define REXEC_FBC_TRYIT \
8c9c2723 1983 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
ae7c5b9b
KW
1984 goto got_it
1985
1986/* The only difference between the BOUND and NBOUND cases is that
1987 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1988 * NBOUND. This is accomplished by passing it as either the if or else clause,
1989 * with the other one being empty (PLACEHOLDER is defined as empty).
1990 *
1991 * The TEST_FOO parameters are for operating on different forms of input, but
1992 * all should be ones that return identically for the same underlying code
1993 * points */
1994#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1995 FBC_BOUND_COMMON( \
1996 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1997 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1998
44129e46 1999#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
2000 FBC_BOUND_COMMON( \
2001 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2002 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2003
2004#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2005 FBC_BOUND_COMMON( \
2006 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2007 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2008
44129e46 2009#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
2010 FBC_BOUND_COMMON( \
2011 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2012 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2013
8bde5eaf
JH
2014#ifdef DEBUGGING
2015static IV
2016S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
9ffbcbcf 2017 IV cp_out = _invlist_search(invlist, cp_in);
8bde5eaf
JH
2018 assert(cp_out >= 0);
2019 return cp_out;
2020}
2021# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2022 invmap[S_get_break_val_cp_checked(invlist, cp)]
2023#else
2024# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2025 invmap[_invlist_search(invlist, cp)]
2026#endif
2027
64935bc6
KW
2028/* Takes a pointer to an inversion list, a pointer to its corresponding
2029 * inversion map, and a code point, and returns the code point's value
2030 * according to the two arrays. It assumes that all code points have a value.
2031 * This is used as the base macro for macros for particular properties */
2032#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
8bde5eaf 2033 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
64935bc6
KW
2034
2035/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2036 * of a code point, returning the value for the first code point in the string.
2037 * And it takes the particular macro name that finds the desired value given a
2038 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2039#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2040 (__ASSERT_(pos < strend) \
2041 /* Note assumes is valid UTF-8 */ \
2042 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2043
2044/* Returns the GCB value for the input code point */
2045#define getGCB_VAL_CP(cp) \
2046 _generic_GET_BREAK_VAL_CP( \
2047 PL_GCB_invlist, \
02f811dd 2048 _Perl_GCB_invmap, \
64935bc6
KW
2049 (cp))
2050
2051/* Returns the GCB value for the first code point in the UTF-8 encoded string
2052 * bounded by pos and strend */
2053#define getGCB_VAL_UTF8(pos, strend) \
2054 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 2055
6b659339
KW
2056/* Returns the LB value for the input code point */
2057#define getLB_VAL_CP(cp) \
2058 _generic_GET_BREAK_VAL_CP( \
2059 PL_LB_invlist, \
2060 _Perl_LB_invmap, \
2061 (cp))
2062
2063/* Returns the LB value for the first code point in the UTF-8 encoded string
2064 * bounded by pos and strend */
2065#define getLB_VAL_UTF8(pos, strend) \
2066 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2067
06ae2722
KW
2068
2069/* Returns the SB value for the input code point */
2070#define getSB_VAL_CP(cp) \
2071 _generic_GET_BREAK_VAL_CP( \
2072 PL_SB_invlist, \
bf4268fa 2073 _Perl_SB_invmap, \
06ae2722
KW
2074 (cp))
2075
2076/* Returns the SB value for the first code point in the UTF-8 encoded string
2077 * bounded by pos and strend */
2078#define getSB_VAL_UTF8(pos, strend) \
2079 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2080
ae3bb8ea
KW
2081/* Returns the WB value for the input code point */
2082#define getWB_VAL_CP(cp) \
2083 _generic_GET_BREAK_VAL_CP( \
2084 PL_WB_invlist, \
bf4268fa 2085 _Perl_WB_invmap, \
ae3bb8ea
KW
2086 (cp))
2087
2088/* Returns the WB value for the first code point in the UTF-8 encoded string
2089 * bounded by pos and strend */
2090#define getWB_VAL_UTF8(pos, strend) \
2091 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2092
786e8c11 2093/* We know what class REx starts with. Try to find this position... */
02d5137b 2094/* if reginfo->intuit, its a dryrun */
786e8c11
YO
2095/* annoyingly all the vars in this routine have different names from their counterparts
2096 in regmatch. /grrr */
3c3eec57 2097STATIC char *
07be1b83 2098S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 2099 const char *strend, regmatch_info *reginfo)
a687059c 2100{
a9448551
KW
2101
2102 /* TRUE if x+ need not match at just the 1st pos of run of x's */
73104a1b 2103 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
a9448551 2104
73104a1b
KW
2105 char *pat_string; /* The pattern's exactish string */
2106 char *pat_end; /* ptr to end char of pat_string */
2107 re_fold_t folder; /* Function for computing non-utf8 folds */
2108 const U8 *fold_array; /* array for folding ords < 256 */
2109 STRLEN ln;
2110 STRLEN lnc;
73104a1b
KW
2111 U8 c1;
2112 U8 c2;
743dd5b8 2113 char *e = NULL;
21d1ed54
KW
2114
2115 /* In some cases we accept only the first occurence of 'x' in a sequence of
2116 * them. This variable points to just beyond the end of the previous
2117 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2118 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2119 * hop back.) */
2120 char * previous_occurrence_end = 0;
2121
3b6c52ce 2122 I32 tmp; /* Scratch variable */
ba44c216 2123 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 2124 UV utf8_fold_flags = 0;
f9176b44 2125 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
2126 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2127 with a result inverts that result, as 0^1 =
2128 1 and 1^1 = 0 */
2129 _char_class_number classnum;
2130
73104a1b 2131 RXi_GET_DECL(prog,progi);
2f7f8cb1 2132
73104a1b 2133 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 2134
73104a1b
KW
2135 /* We know what class it must start with. */
2136 switch (OP(c)) {
3edce4f5 2137 case ANYOFPOSIXL:
a4525e78 2138 case ANYOFL:
780fcc9f 2139 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 2140
d1c40ef5 2141 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
a0bd1a30
KW
2142 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2143 }
2144
780fcc9f 2145 /* FALLTHROUGH */
ac44c12e 2146 case ANYOFD:
73104a1b
KW
2147 case ANYOF:
2148 if (utf8_target) {
da10aa09 2149 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
3db24e1e 2150 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b 2151 }
daced5ae
KW
2152 else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2153 /* We know that s is in the bitmap range since the target isn't
2154 * UTF-8, so what happens for out-of-range values is not relevant,
2155 * so exclude that from the flags */
da10aa09 2156 REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
1451f692 2157 }
73104a1b 2158 else {
da10aa09 2159 REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
73104a1b
KW
2160 }
2161 break;
73104a1b 2162
2813d4ad 2163 case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */
0a5ed81e
KW
2164 /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
2165 * use 0 */
2813d4ad 2166 REXEC_FBC_FIND_NEXT_SCAN(0,
12475f8b
KW
2167 (char *) find_next_masked((U8 *) s, (U8 *) strend,
2168 (U8) ARG(c), FLAGS(c)));
2813d4ad
KW
2169 break;
2170
0a5ed81e
KW
2171 case NANYOFM: /* UTF-8ness does matter because can match UTF-8 variants.
2172 */
2173 REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
3db0bccc
KW
2174 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2175 (U8) ARG(c), FLAGS(c)));
2176 break;
2177
c316b824 2178 case ANYOFH:
ff8b8b42 2179 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
6966a05b 2180 REXEC_FBC_CLASS_SCAN(TRUE,
29a889ef
KW
2181 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2182 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
6966a05b
KW
2183 }
2184 break;
2185
2186 case ANYOFHb:
2187 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
2188
2189 /* We know what the first byte of any matched string should be */
ff8b8b42
KW
2190 U8 first_byte = FLAGS(c);
2191
6966a05b 2192 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
c316b824 2193 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
ff8b8b42 2194 }
c316b824
KW
2195 break;
2196
3146c00a
KW
2197 case ANYOFHr:
2198 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
2199 REXEC_FBC_CLASS_SCAN(TRUE,
6af2b2bc 2200 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
3146c00a
KW
2201 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2202 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2203 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2204 }
2205 break;
2206
34924db0
KW
2207 case ANYOFHs:
2208 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
2209 REXEC_FBC_CLASS_SCAN(TRUE,
2210 ( strend -s >= FLAGS(c)
2211 && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
2212 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2213 }
2214 break;
2215
13fcf652
KW
2216 case ANYOFR:
2217 if (utf8_target) {
2218 REXEC_FBC_CLASS_SCAN(TRUE,
2219 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2220 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2221 (U8 *) strend,
2222 NULL),
2223 ANYOFRbase(c), ANYOFRdelta(c))));
2224 }
2225 else {
2226 REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
2227 ANYOFRbase(c), ANYOFRdelta(c)));
2228 }
2229 break;
2230
2d5613be
KW
2231 case ANYOFRb:
2232 if (utf8_target) {
2233
2234 /* We know what the first byte of any matched string should be */
2235 U8 first_byte = FLAGS(c);
2236
2237 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2238 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2239 (U8 *) strend,
2240 NULL),
2241 ANYOFRbase(c), ANYOFRdelta(c)));
2242 }
2243 else {
2244 REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
2245 ANYOFRbase(c), ANYOFRdelta(c)));
2246 }
2247 break;
2248
89829bb5 2249 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
098b07d5 2250 assert(! is_utf8_pat);
924ba076 2251 /* FALLTHROUGH */
89829bb5 2252 case EXACTFAA:
aa419ff3
KW
2253 if (is_utf8_pat) {
2254 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2255 |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
2256 goto do_exactf_utf8;
2257 }
2258 else if (utf8_target) {
2259
2260 /* Here, and elsewhere in this file, the reason we can't consider a
2261 * non-UTF-8 pattern already folded in the presence of a UTF-8
2262 * target is because any MICRO SIGN in the pattern won't be folded.
2263 * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
2264 * can consider a non-UTF-8 pattern folded when matching a
2265 * non-UTF-8 target */
73104a1b
KW
2266 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2267 goto do_exactf_utf8;
2268 }
aa419ff3
KW
2269
2270 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2271 * which these functions don't handle anyway */
2272 fold_array = PL_fold_latin1;
2273 folder = foldEQ_latin1_s2_folded;
2274 goto do_exactf_non_utf8;
77a6d856 2275
2fdb7295
KW
2276 case EXACTF: /* This node only generated for non-utf8 patterns */
2277 assert(! is_utf8_pat);
73104a1b 2278 if (utf8_target) {
73104a1b
KW
2279 goto do_exactf_utf8;
2280 }
2281 fold_array = PL_fold;
2282 folder = foldEQ;
2283 goto do_exactf_non_utf8;
2284
2285 case EXACTFL:
780fcc9f 2286 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 2287 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 2288 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
2289 goto do_exactf_utf8;
2290 }
2291 fold_array = PL_fold_locale;
2292 folder = foldEQ_locale;
2293 goto do_exactf_non_utf8;
3c760661 2294
627a7895
KW
2295 case EXACTFUP: /* Problematic even though pattern isn't UTF-8. Use
2296 full functionality normally not done except for
2297 UTF-8 */
a56633d1 2298 assert(! is_utf8_pat);
73104a1b 2299 goto do_exactf_utf8;
16d951b7 2300
a4525e78
KW
2301 case EXACTFLU8:
2302 if (! utf8_target) { /* All code points in this node require
2303 UTF-8 to express. */
2304 break;
2305 }
613abc6d
KW
2306 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2307 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
2308 goto do_exactf_utf8;
2309
3f2416ae 2310 case EXACTFU_REQ8:
a9f8c7ac
KW
2311 if (! utf8_target) {
2312 break;
2313 }
2314 assert(is_utf8_pat);
2315 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2316 goto do_exactf_utf8;
2317
73104a1b 2318 case EXACTFU:
984e6dd1 2319 if (is_utf8_pat || utf8_target) {
0fbec7cf 2320 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
73104a1b
KW
2321 goto do_exactf_utf8;
2322 }
fac1af77 2323
73104a1b
KW
2324 /* Any 'ss' in the pattern should have been replaced by regcomp,
2325 * so we don't have to worry here about this single special case
2326 * in the Latin1 range */
2327 fold_array = PL_fold_latin1;
0fbec7cf 2328 folder = foldEQ_latin1_s2_folded;
73104a1b 2329
924ba076 2330 /* FALLTHROUGH */
73104a1b 2331
c52b8b12 2332 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
2333 are no glitches with fold-length differences
2334 between the target string and pattern */
2335
2336 /* The idea in the non-utf8 EXACTF* cases is to first find the
2337 * first character of the EXACTF* node and then, if necessary,
2338 * case-insensitively compare the full text of the node. c1 is the
2339 * first character. c2 is its fold. This logic will not work for
2340 * Unicode semantics and the german sharp ss, which hence should
2341 * not be compiled into a node that gets here. */
ae06e581
KW
2342 pat_string = STRINGs(c);
2343 ln = STR_LENs(c); /* length to match in octets/bytes */
73104a1b
KW
2344
2345 /* We know that we have to match at least 'ln' bytes (which is the
2346 * same as characters, since not utf8). If we have to match 3
2347 * characters, and there are only 2 availabe, we know without
2348 * trying that it will fail; so don't start a match past the
2349 * required minimum number from the far end */
ea3daa5d 2350 e = HOP3c(strend, -((SSize_t)ln), s);
dda01918
HS
2351 if (e < s)
2352 break;
fac1af77 2353
73104a1b
KW
2354 c1 = *pat_string;
2355 c2 = fold_array[c1];
2356 if (c1 == c2) { /* If char and fold are the same */
c05cc3b6
KW
2357 while (s <= e) {
2358 s = (char *) memchr(s, c1, e + 1 - s);
2359 if (s == NULL) {
2360 break;
2361 }
2362
2363 /* Check that the rest of the node matches */
2364 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2365 && (reginfo->intuit || regtry(reginfo, &s)) )
2366 {
2367 goto got_it;
2368 }
2369 s++;
2370 }
73104a1b
KW
2371 }
2372 else {
c05cc3b6
KW
2373 U8 bits_differing = c1 ^ c2;
2374
2375 /* If the folds differ in one bit position only, we can mask to
2376 * match either of them, and can use this faster find method. Both
2377 * ASCII and EBCDIC tend to have their case folds differ in only
2378 * one position, so this is very likely */
2379 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2380 bits_differing = ~ bits_differing;
2381 while (s <= e) {
2b1f9c71 2382 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
c05cc3b6
KW
2383 (c1 & bits_differing), bits_differing);
2384 if (s > e) {
2385 break;
2386 }
2387
2388 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2389 && (reginfo->intuit || regtry(reginfo, &s)) )
2390 {
2391 goto got_it;
2392 }
2393 s++;
2394 }
2395 }
2396 else { /* Otherwise, stuck with looking byte-at-a-time. This
2397 should actually happen only in EXACTFL nodes */
2398 while (s <= e) {
2399 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2400 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2401 && (reginfo->intuit || regtry(reginfo, &s)) )
2402 {
2403 goto got_it;
2404 }
2405 s++;
2406 }
2407 }
73104a1b
KW
2408 }
2409 break;
fac1af77 2410
c52b8b12
KW
2411 do_exactf_utf8:
2412 {
73104a1b
KW
2413 unsigned expansion;
2414
2415 /* If one of the operands is in utf8, we can't use the simpler folding
2416 * above, due to the fact that many different characters can have the
2417 * same fold, or portion of a fold, or different- length fold */
ae06e581
KW
2418 pat_string = STRINGs(c);
2419 ln = STR_LENs(c); /* length to match in octets/bytes */
73104a1b 2420 pat_end = pat_string + ln;
984e6dd1 2421 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
2422 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2423 : ln;
2424
2425 /* We have 'lnc' characters to match in the pattern, but because of
2426 * multi-character folding, each character in the target can match
2427 * up to 3 characters (Unicode guarantees it will never exceed
2428 * this) if it is utf8-encoded; and up to 2 if not (based on the
2429 * fact that the Latin 1 folds are already determined, and the
2430 * only multi-char fold in that range is the sharp-s folding to
2431 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2432 * string character. Adjust lnc accordingly, rounding up, so that
2433 * if we need to match at least 4+1/3 chars, that really is 5. */
2434 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2435 lnc = (lnc + expansion - 1) / expansion;
2436
2437 /* As in the non-UTF8 case, if we have to match 3 characters, and
2438 * only 2 are left, it's guaranteed to fail, so don't start a
2439 * match that would require us to go beyond the end of the string
2440 */
ea3daa5d 2441 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 2442
73104a1b
KW
2443 /* XXX Note that we could recalculate e to stop the loop earlier,
2444 * as the worst case expansion above will rarely be met, and as we
2445 * go along we would usually find that e moves further to the left.
2446 * This would happen only after we reached the point in the loop
2447 * where if there were no expansion we should fail. Unclear if
2448 * worth the expense */
2449
2450 while (s <= e) {
2451 char *my_strend= (char *)strend;
2452 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 2453 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 2454 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
2455 {
2456 goto got_it;
2457 }
8c9c2723 2458 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
73104a1b
KW
2459 }
2460 break;
2461 }
236d82fd 2462
73104a1b 2463 case BOUNDL:
780fcc9f 2464 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2465 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2466 if (! IN_UTF8_CTYPE_LOCALE) {
2467 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2468 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2469 }
64935bc6
KW
2470 goto do_boundu;
2471 }
2472
7a207065 2473 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2474 break;
64935bc6 2475
73104a1b 2476 case NBOUNDL:
780fcc9f 2477 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2478 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2479 if (! IN_UTF8_CTYPE_LOCALE) {
2480 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2481 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2482 }
64935bc6
KW
2483 goto do_nboundu;
2484 }
2485
7a207065 2486 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2487 break;
64935bc6
KW
2488
2489 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2490 meaning */
2491 assert(FLAGS(c) == TRADITIONAL_BOUND);
2492
7a207065 2493 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2494 break;
64935bc6
KW
2495
2496 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2497 meaning */
2498 assert(FLAGS(c) == TRADITIONAL_BOUND);
2499
44129e46 2500 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2501 break;
64935bc6
KW
2502
2503 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2504 meaning */
2505 assert(FLAGS(c) == TRADITIONAL_BOUND);
2506
7a207065 2507 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2508 break;
64935bc6
KW
2509
2510 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2511 meaning */
2512 assert(FLAGS(c) == TRADITIONAL_BOUND);
2513
44129e46 2514 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2515 break;
64935bc6 2516
73104a1b 2517 case NBOUNDU:
64935bc6 2518 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
7a207065 2519 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2520 break;
2521 }
2522
2523 do_nboundu:
2524
2525 to_complement = 1;
6ebdcce0 2526 goto do_boundu;
64935bc6
KW
2527
2528 case BOUNDU:
6ebdcce0
KW
2529 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2530 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2531 break;
2532 }
2533
64935bc6 2534 do_boundu:
a7a8bd1e 2535 if (s == reginfo->strbeg) {
67481c39 2536 if (reginfo->intuit || regtry(reginfo, &s))
64935bc6
KW
2537 {
2538 goto got_it;
2539 }
a7a8bd1e
KW
2540
2541 /* Didn't match. Try at the next position (if there is one) */
8c9c2723 2542 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
a7a8bd1e
KW
2543 if (UNLIKELY(s >= reginfo->strend)) {
2544 break;
2545 }
64935bc6 2546 }
6ebdcce0
KW
2547 switch((bound_type) FLAGS(c)) {
2548 case TRADITIONAL_BOUND: /* Should have already been handled */
2549 assert(0);
2550 break;
64935bc6 2551
6ebdcce0 2552 case GCB_BOUND:
64935bc6 2553 if (utf8_target) {
85e5f08b 2554 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2555 reghop3((U8*)s, -1,
2556 (U8*)(reginfo->strbeg)),
2557 (U8*) reginfo->strend);
2558 while (s < strend) {
85e5f08b 2559 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6 2560 (U8*) reginfo->strend);
b0e24409
KW
2561 if ( (to_complement ^ isGCB(before,
2562 after,
2563 (U8*) reginfo->strbeg,
2564 (U8*) s,
2565 utf8_target))
00e3344b
KW
2566 && (reginfo->intuit || regtry(reginfo, &s)))
2567 {
2568 goto got_it;
64935bc6 2569 }
43a7bd62 2570 before = after;
8c9c2723 2571 s += UTF8_SAFE_SKIP(s, reginfo->strend);
64935bc6
KW
2572 }
2573 }
2574 else { /* Not utf8. Everything is a GCB except between CR and
2575 LF */
2576 while (s < strend) {
00e3344b
KW
2577 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2578 || UCHARAT(s) != '\n'))
2579 && (reginfo->intuit || regtry(reginfo, &s)))
64935bc6 2580 {
00e3344b 2581 goto got_it;
64935bc6 2582 }
43a7bd62 2583 s++;
64935bc6
KW
2584 }
2585 }
2586
64935bc6 2587 break;
ae3bb8ea 2588
6b659339 2589 case LB_BOUND:
6b659339
KW
2590 if (utf8_target) {
2591 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2592 -1,
2593 (U8*)(reginfo->strbeg)),
2594 (U8*) reginfo->strend);
2595 while (s < strend) {
2596 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2597 if (to_complement ^ isLB(before,
2598 after,
2599 (U8*) reginfo->strbeg,
2600 (U8*) s,
2601 (U8*) reginfo->strend,
2602 utf8_target)
2603 && (reginfo->intuit || regtry(reginfo, &s)))
2604 {
2605 goto got_it;
2606 }
2607 before = after;
8c9c2723 2608 s += UTF8_SAFE_SKIP(s, reginfo->strend);
6b659339
KW
2609 }
2610 }
2611 else { /* Not utf8. */
2612 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2613 while (s < strend) {
2614 LB_enum after = getLB_VAL_CP((U8) *s);
2615 if (to_complement ^ isLB(before,
2616 after,
2617 (U8*) reginfo->strbeg,
2618 (U8*) s,
2619 (U8*) reginfo->strend,
2620 utf8_target)
2621 && (reginfo->intuit || regtry(reginfo, &s)))
2622 {
2623 goto got_it;
2624 }
2625 before = after;
2626 s++;
2627 }
2628 }
2629
6b659339
KW
2630 break;
2631
06ae2722 2632 case SB_BOUND:
06ae2722 2633 if (utf8_target) {
85e5f08b 2634 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2635 -1,
2636 (U8*)(reginfo->strbeg)),
2637 (U8*) reginfo->strend);
2638 while (s < strend) {
85e5f08b 2639 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722 2640 (U8*) reginfo->strend);
00e3344b
KW
2641 if ((to_complement ^ isSB(before,
2642 after,
2643 (U8*) reginfo->strbeg,
2644 (U8*) s,
2645 (U8*) reginfo->strend,
2646 utf8_target))
2647 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2648 {
00e3344b 2649 goto got_it;
06ae2722 2650 }
43a7bd62 2651 before = after;
8c9c2723 2652 s += UTF8_SAFE_SKIP(s, reginfo->strend);
06ae2722
KW
2653 }
2654 }
2655 else { /* Not utf8. */
85e5f08b 2656 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2657 while (s < strend) {
85e5f08b 2658 SB_enum after = getSB_VAL_CP((U8) *s);
00e3344b
KW
2659 if ((to_complement ^ isSB(before,
2660 after,
2661 (U8*) reginfo->strbeg,
2662 (U8*) s,
2663 (U8*) reginfo->strend,
2664 utf8_target))
2665 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2666 {
00e3344b 2667 goto got_it;
06ae2722 2668 }
43a7bd62 2669 before = after;
06ae2722
KW
2670 s++;
2671 }
2672 }
2673
06ae2722
KW
2674 break;
2675
ae3bb8ea 2676 case WB_BOUND:
ae3bb8ea
KW
2677 if (utf8_target) {
2678 /* We are at a boundary between char_sub_0 and char_sub_1.
2679 * We also keep track of the value for char_sub_-1 as we
2680 * loop through the line. Context may be needed to make a
2681 * determination, and if so, this can save having to
2682 * recalculate it */
85e5f08b
KW
2683 WB_enum previous = WB_UNKNOWN;
2684 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2685 reghop3((U8*)s,
2686 -1,
2687 (U8*)(reginfo->strbeg)),
2688 (U8*) reginfo->strend);
2689 while (s < strend) {
85e5f08b 2690 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea 2691 (U8*) reginfo->strend);
00e3344b
KW
2692 if ((to_complement ^ isWB(previous,
2693 before,
2694 after,
2695 (U8*) reginfo->strbeg,
2696 (U8*) s,
2697 (U8*) reginfo->strend,
2698 utf8_target))
2699 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2700 {
00e3344b 2701 goto got_it;
ae3bb8ea 2702 }
43a7bd62
KW
2703 previous = before;
2704 before = after;
8c9c2723 2705 s += UTF8_SAFE_SKIP(s, reginfo->strend);
ae3bb8ea
KW
2706 }
2707 }
2708 else { /* Not utf8. */
85e5f08b
KW
2709 WB_enum previous = WB_UNKNOWN;
2710 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2711 while (s < strend) {
85e5f08b 2712 WB_enum after = getWB_VAL_CP((U8) *s);
00e3344b
KW
2713 if ((to_complement ^ isWB(previous,
2714 before,
2715 after,
2716 (U8*) reginfo->strbeg,
2717 (U8*) s,
2718 (U8*) reginfo->strend,
2719 utf8_target))
2720 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2721 {
00e3344b 2722 goto got_it;
ae3bb8ea 2723 }
43a7bd62
KW
2724 previous = before;
2725 before = after;
ae3bb8ea
KW
2726 s++;
2727 }
2728 }
6ebdcce0
KW
2729 }
2730
2731 /* Here are at the final position in the target string, which is a
2732 * boundary by definition, so matches, depending on other constraints.
2733 * */
ae3bb8ea 2734
8c9c2723
KW
2735 if ( reginfo->intuit
2736 || (s <= reginfo->strend && regtry(reginfo, &s)))
2737 {
ae3bb8ea
KW
2738 goto got_it;
2739 }
73104a1b 2740 break;
64935bc6 2741
73104a1b
KW
2742 case LNBREAK:
2743 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2744 is_LNBREAK_latin1_safe(s, strend)
2745 );
2746 break;
3018b823
KW
2747
2748 /* The argument to all the POSIX node types is the class number to pass to
2749 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2750
2751 case NPOSIXL:
2752 to_complement = 1;
2753 /* FALLTHROUGH */
2754
2755 case POSIXL:
780fcc9f 2756 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a78c2fa6 2757 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
3018b823 2758 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2759 break;
3018b823
KW
2760
2761 case NPOSIXD:
2762 to_complement = 1;
2763 /* FALLTHROUGH */
2764
2765 case POSIXD:
2766 if (utf8_target) {
2767 goto posix_utf8;
2768 }
2769 goto posixa;
2770
2771 case NPOSIXA:
2772 if (utf8_target) {
2773 /* The complement of something that matches only ASCII matches all
837226c8 2774 * non-ASCII, plus everything in ASCII that isn't in the class. */
da10aa09
KW
2775 REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend)
2776 || ! _generic_isCC_A(*s, FLAGS(c)));
3018b823
KW
2777 break;
2778 }
2779
2780 to_complement = 1;
4a6c6db5 2781 goto posixa;
3018b823 2782
73104a1b
KW
2783 case POSIXA:
2784 /* Don't need to worry about utf8, as it can match only a single
4a6c6db5
KW
2785 * byte invariant character. But we do anyway for performance reasons,
2786 * as otherwise we would have to examine all the continuation
2787 * characters */
2788 if (utf8_target) {
da10aa09 2789 REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
4a6c6db5
KW
2790 break;
2791 }
2792
2793 posixa:
da10aa09 2794 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
3018b823 2795 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2796 break;
3018b823
KW
2797
2798 case NPOSIXU:
2799 to_complement = 1;
2800 /* FALLTHROUGH */
2801
2802 case POSIXU:
2803 if (! utf8_target) {
da10aa09
KW
2804 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2805 to_complement ^ cBOOL(_generic_isCC(*s,
3018b823
KW
2806 FLAGS(c))));
2807 }
2808 else {
2809
c52b8b12 2810 posix_utf8:
3018b823 2811 classnum = (_char_class_number) FLAGS(c);
8d692afb
KW
2812 switch (classnum) {
2813 default:
2814 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2815 to_complement ^ cBOOL(_invlist_contains_cp(
2816 PL_XPosix_ptrs[classnum],
2817 utf8_to_uvchr_buf((U8 *) s,
2818 (U8 *) strend,
2819 NULL))));
2820 break;
779cf272 2821 case _CC_ENUM_SPACE:
da10aa09 2822 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
7a207065 2823 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3018b823
KW
2824 break;
2825
2826 case _CC_ENUM_BLANK:
da10aa09 2827 REXEC_FBC_CLASS_SCAN(1,
7a207065 2828 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3018b823
KW
2829 break;
2830
2831 case _CC_ENUM_XDIGIT:
da10aa09 2832 REXEC_FBC_CLASS_SCAN(1,
7a207065 2833 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3018b823
KW
2834 break;
2835
2836 case _CC_ENUM_VERTSPACE:
da10aa09 2837 REXEC_FBC_CLASS_SCAN(1,
7a207065 2838 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3018b823
KW
2839 break;
2840
2841 case _CC_ENUM_CNTRL:
da10aa09 2842 REXEC_FBC_CLASS_SCAN(1,
7a207065 2843 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3018b823 2844 break;
3018b823
KW
2845 }
2846 }
2847 break;
2848
73104a1b
KW
2849 case AHOCORASICKC:
2850 case AHOCORASICK:
2851 {
2852 DECL_TRIE_TYPE(c);
2853 /* what trie are we using right now */
2854 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2855 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2856 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2857
2858 const char *last_start = strend - trie->minlen;
6148ee25 2859#ifdef DEBUGGING
73104a1b 2860 const char *real_start = s;
6148ee25 2861#endif
73104a1b
KW
2862 STRLEN maxlen = trie->maxlen;
2863 SV *sv_points;
2864 U8 **points; /* map of where we were in the input string
2865 when reading a given char. For ASCII this
2866 is unnecessary overhead as the relationship
2867 is always 1:1, but for Unicode, especially
2868 case folded Unicode this is not true. */
2869 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2870 U8 *bitmap=NULL;
2871
2872
271b36b1 2873 DECLARE_AND_GET_RE_DEBUG_FLAGS;
73104a1b
KW
2874
2875 /* We can't just allocate points here. We need to wrap it in
2876 * an SV so it gets freed properly if there is a croak while
2877 * running the match */
2878 ENTER;
2879 SAVETMPS;
2880 sv_points=newSV(maxlen * sizeof(U8 *));
2881 SvCUR_set(sv_points,
2882 maxlen * sizeof(U8 *));
2883 SvPOK_on(sv_points);
2884 sv_2mortal(sv_points);
2885 points=(U8**)SvPV_nolen(sv_points );
2886 if ( trie_type != trie_utf8_fold
2887 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2888 {
2889 if (trie->bitmap)
2890 bitmap=(U8*)trie->bitmap;
2891 else
2892 bitmap=(U8*)ANYOF_BITMAP(c);
2893 }
2894 /* this is the Aho-Corasick algorithm modified a touch
2895 to include special handling for long "unknown char" sequences.
2896 The basic idea being that we use AC as long as we are dealing
2897 with a possible matching char, when we encounter an unknown char
2898 (and we have not encountered an accepting state) we scan forward
2899 until we find a legal starting char.
2900 AC matching is basically that of trie matching, except that when
2901 we encounter a failing transition, we fall back to the current
2902 states "fail state", and try the current char again, a process
2903 we repeat until we reach the root state, state 1, or a legal
2904 transition. If we fail on the root state then we can either
2905 terminate if we have reached an accepting state previously, or
2906 restart the entire process from the beginning if we have not.
2907
2908 */
2909 while (s <= last_start) {
2910 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2911 U8 *uc = (U8*)s;
2912 U16 charid = 0;
2913 U32 base = 1;
2914 U32 state = 1;
2915 UV uvc = 0;
2916 STRLEN len = 0;
2917 STRLEN foldlen = 0;
2918 U8 *uscan = (U8*)NULL;
2919 U8 *leftmost = NULL;
2920#ifdef DEBUGGING
2921 U32 accepted_word= 0;
786e8c11 2922#endif
73104a1b
KW
2923 U32 pointpos = 0;
2924
2925 while ( state && uc <= (U8*)strend ) {
2926 int failed=0;
2927 U32 word = aho->states[ state ].wordnum;
2928
2929 if( state==1 ) {
2930 if ( bitmap ) {
2931 DEBUG_TRIE_EXECUTE_r(
2932 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2933 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2934 (char *)uc, utf8_target, 0 );
6ad9a8ab 2935 Perl_re_printf( aTHX_
73104a1b
KW
2936 " Scanning for legal start char...\n");
2937 }
2938 );
2939 if (utf8_target) {
2940 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2941 uc += UTF8SKIP(uc);
2942 }
2943 } else {
2944 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2945 uc++;
2946 }
786e8c11 2947 }
73104a1b 2948 s= (char *)uc;
07be1b83 2949 }
73104a1b
KW
2950 if (uc >(U8*)last_start) break;
2951 }
2952
2953 if ( word ) {
2954 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2955 if (!leftmost || lpos < leftmost) {
2956 DEBUG_r(accepted_word=word);
2957 leftmost= lpos;
7016d6eb 2958 }
73104a1b 2959 if (base==0) break;
7016d6eb 2960
73104a1b
KW
2961 }
2962 points[pointpos++ % maxlen]= uc;
2963 if (foldlen || uc < (U8*)strend) {
9ad8cac4
KW
2964 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2965 (U8 *) strend, uscan, len, uvc,
2966 charid, foldlen, foldbuf,
2967 uniflags);
73104a1b
KW
2968 DEBUG_TRIE_EXECUTE_r({
2969 dump_exec_pos( (char *)uc, c, strend,
cb41e5d6 2970 real_start, s, utf8_target, 0);
6ad9a8ab 2971 Perl_re_printf( aTHX_
147e3846 2972 " Charid:%3u CP:%4" UVxf " ",
73104a1b
KW
2973 charid, uvc);
2974 });
2975 }
2976 else {
2977 len = 0;
2978 charid = 0;
2979 }
07be1b83 2980
73104a1b
KW
2981
2982 do {
6148ee25 2983#ifdef DEBUGGING
73104a1b 2984 word = aho->states[ state ].wordnum;
6148ee25 2985#endif
73104a1b
KW
2986 base = aho->states[ state ].trans.base;
2987
2988 DEBUG_TRIE_EXECUTE_r({
2989 if (failed)
2990 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2991 s, utf8_target, 0 );
6ad9a8ab 2992 Perl_re_printf( aTHX_
147e3846 2993 "%sState: %4" UVxf ", word=%" UVxf,
73104a1b
KW
2994 failed ? " Fail transition to " : "",
2995 (UV)state, (UV)word);
2996 });
2997 if ( base ) {
2998 U32 tmp;
2999 I32 offset;
3000 if (charid &&
3001 ( ((offset = base + charid
3002 - 1 - trie->uniquecharcount)) >= 0)
3003 && ((U32)offset < trie->lasttrans)
3004 && trie->trans[offset].check == state
3005 && (tmp=trie->trans[offset].next))
3006 {
3007 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3008 Perl_re_printf( aTHX_ " - legal\n"));
73104a1b
KW
3009 state = tmp;
3010 break;
07be1b83
YO
3011 }
3012 else {
786e8c11 3013 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3014 Perl_re_printf( aTHX_ " - fail\n"));
786e8c11 3015 failed = 1;
73104a1b 3016 state = aho->fail[state];
07be1b83 3017 }
07be1b83 3018 }
73104a1b
KW
3019 else {
3020 /* we must be accepting here */
3021 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3022 Perl_re_printf( aTHX_ " - accepting\n"));
73104a1b
KW
3023 failed = 1;
3024 break;
786e8c11 3025 }
73104a1b
KW
3026 } while(state);
3027 uc += len;
3028 if (failed) {
3029 if (leftmost)
3030 break;
3031 if (!state) state = 1;
07be1b83 3032 }
73104a1b
KW
3033 }
3034 if ( aho->states[ state ].wordnum ) {
3035 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3036 if (!leftmost || lpos < leftmost) {
3037 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3038 leftmost = lpos;
07be1b83
YO
3039 }
3040 }
73104a1b
KW
3041 if (leftmost) {
3042 s = (char*)leftmost;
3043 DEBUG_TRIE_EXECUTE_r({
147e3846 3044 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
73104a1b
KW
3045 (UV)accepted_word, (IV)(s - real_start)
3046 );
3047 });
02d5137b 3048 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
3049 FREETMPS;
3050 LEAVE;
3051 goto got_it;
3052 }
8c9c2723
KW
3053 if (s < reginfo->strend) {
3054 s = HOPc(s,1);
3055 }
73104a1b 3056 DEBUG_TRIE_EXECUTE_r({
6ad9a8ab 3057 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
73104a1b
KW
3058 });
3059 } else {
3060 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3061 Perl_re_printf( aTHX_ "No match.\n"));
73104a1b
KW
3062 break;
3063 }
3064 }
3065 FREETMPS;
3066 LEAVE;
3067 }
3068 break;
3069 default:
3070 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
3071 }
3072 return 0;
3073 got_it:
3074 return s;
6eb5f6b9
JH
3075}
3076
60165aa4
DM
3077/* set RX_SAVED_COPY, RX_SUBBEG etc.
3078 * flags have same meanings as with regexec_flags() */
3079
749f4950
DM
3080static void
3081S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
3082 char *strbeg,
3083 char *strend,
3084 SV *sv,
3085 U32 flags,
3086 bool utf8_target)
3087{
3088 struct regexp *const prog = ReANY(rx);
3089
60165aa4
DM
3090 if (flags & REXEC_COPY_STR) {
3091#ifdef PERL_ANY_COW
3092 if (SvCANCOW(sv)) {
eb8fc9fe 3093 DEBUG_C(Perl_re_printf( aTHX_
60165aa4 3094 "Copy on write: regexp capture, type %d\n",
eb8fc9fe 3095 (int) SvTYPE(sv)));
5411a0e5
DM
3096 /* Create a new COW SV to share the match string and store
3097 * in saved_copy, unless the current COW SV in saved_copy
3098 * is valid and suitable for our purpose */
3099 if (( prog->saved_copy
3100 && SvIsCOW(prog->saved_copy)
3101 && SvPOKp(prog->saved_copy)
3102 && SvIsCOW(sv)
3103 && SvPOKp(sv)
3104 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 3105 {
5411a0e5
DM
3106 /* just reuse saved_copy SV */
3107 if (RXp_MATCH_COPIED(prog)) {
3108 Safefree(prog->subbeg);
3109 RXp_MATCH_COPIED_off(prog);
3110 }
3111 }
3112 else {
3113 /* create new COW SV to share string */
196a02af 3114 RXp_MATCH_COPY_FREE(prog);
a76b0e90 3115 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 3116 }
5411a0e5
DM
3117 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3118 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
3119 prog->sublen = strend - strbeg;
3120 prog->suboffset = 0;
3121 prog->subcoffset = 0;
3122 } else
3123#endif
3124 {
99a90e59
FC
3125 SSize_t min = 0;
3126 SSize_t max = strend - strbeg;
ea3daa5d 3127 SSize_t sublen;
60165aa4
DM
3128
3129 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 3130 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3131 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3132 ) { /* don't copy $' part of string */
3133 U32 n = 0;
3134 max = -1;
3135 /* calculate the right-most part of the string covered
f67a5002 3136 * by a capture. Due to lookahead, this may be to
60165aa4
DM
3137 * the right of $&, so we have to scan all captures */
3138 while (n <= prog->lastparen) {
3139 if (prog->offs[n].end > max)
3140 max = prog->offs[n].end;
3141 n++;
3142 }
3143 if (max == -1)
3144 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3145 ? prog->offs[0].start
3146 : 0;
3147 assert(max >= 0 && max <= strend - strbeg);
3148 }
3149
3150 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 3151 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3152 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3153 ) { /* don't copy $` part of string */
3154 U32 n = 0;
3155 min = max;
3156 /* calculate the left-most part of the string covered
f67a5002 3157 * by a capture. Due to lookbehind, this may be to
60165aa4
DM
3158 * the left of $&, so we have to scan all captures */
3159 while (min && n <= prog->lastparen) {
3160 if ( prog->offs[n].start != -1
3161 && prog->offs[n].start < min)
3162 {
3163 min = prog->offs[n].start;
3164 }
3165 n++;
3166 }
3167 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3168 && min > prog->offs[0].end
3169 )
3170 min = prog->offs[0].end;
3171
3172 }
3173
3174 assert(min >= 0 && min <= max && min <= strend - strbeg);
3175 sublen = max - min;
3176
196a02af 3177 if (RXp_MATCH_COPIED(prog)) {
60165aa4
DM
3178 if (sublen > prog->sublen)
3179 prog->subbeg =
3180 (char*)saferealloc(prog->subbeg, sublen+1);
3181 }
3182 else
3183 prog->subbeg = (char*)safemalloc(sublen+1);
3184 Copy(strbeg + min, prog->subbeg, sublen, char);
3185 prog->subbeg[sublen] = '\0';
3186 prog->suboffset = min;
3187 prog->sublen = sublen;
196a02af 3188 RXp_MATCH_COPIED_on(prog);
60165aa4
DM
3189 }
3190 prog->subcoffset = prog->suboffset;
3191 if (prog->suboffset && utf8_target) {
3192 /* Convert byte offset to chars.
3193 * XXX ideally should only compute this if @-/@+
3194 * has been seen, a la PL_sawampersand ??? */
3195
3196 /* If there's a direct correspondence between the
3197 * string which we're matching and the original SV,
3198 * then we can use the utf8 len cache associated with
3199 * the SV. In particular, it means that under //g,
3200 * sv_pos_b2u() will use the previously cached
3201 * position to speed up working out the new length of
3202 * subcoffset, rather than counting from the start of
3203 * the string each time. This stops
3204 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3205 * from going quadratic */
3206 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
3207 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3208 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
3209 else
3210 prog->subcoffset = utf8_length((U8*)strbeg,
3211 (U8*)(strbeg+prog->suboffset));
3212 }
3213 }
3214 else {
196a02af 3215 RXp_MATCH_COPY_FREE(prog);
60165aa4
DM
3216 prog->subbeg = strbeg;
3217 prog->suboffset = 0;
3218 prog->subcoffset = 0;
3219 prog->sublen = strend - strbeg;
3220 }
3221}
3222
3223
3224
fae667d5 3225
6eb5f6b9
JH
3226/*
3227 - regexec_flags - match a regexp against a string
3228 */
3229I32
5aaab254 3230Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 3231 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
3232/* stringarg: the point in the string at which to begin matching */
3233/* strend: pointer to null at end of string */
3234/* strbeg: real beginning of string */
3235/* minend: end of match must be >= minend bytes after stringarg. */
3236/* sv: SV being matched: only used for utf8 flag, pos() etc; string
3237 * itself is accessed via the pointers above */
3238/* data: May be used for some additional optimizations.
d058ec57 3239 Currently unused. */
a340edde 3240/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 3241
6eb5f6b9 3242{
8d919b0a 3243 struct regexp *const prog = ReANY(rx);
5aaab254 3244 char *s;
eb578fdb 3245 regnode *c;
03c83e26 3246 char *startpos;
ea3daa5d
FC
3247 SSize_t minlen; /* must match at least this many chars */
3248 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 3249 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 3250 I32 multiline;
f8fc2ecf 3251 RXi_GET_DECL(prog,progi);
02d5137b
DM
3252 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3253 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 3254 regexp_paren_pair *swap = NULL;
006f26b2 3255 I32 oldsave;
271b36b1 3256 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a3621e74 3257
7918f24d 3258 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 3259 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
3260
3261 /* Be paranoid... */
3dc78631 3262 if (prog == NULL) {
6eb5f6b9 3263 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
3264 }
3265
6c3fea77 3266 DEBUG_EXECUTE_r(
03c83e26 3267 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
3268 "Matching");
3269 );
8adc0f72 3270
b342a604
DM
3271 startpos = stringarg;
3272
4cf1a867
DM
3273 /* set these early as they may be used by the HOP macros below */
3274 reginfo->strbeg = strbeg;
3275 reginfo->strend = strend;
3276 reginfo->is_utf8_target = cBOOL(utf8_target);
3277
58430ea8 3278 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
3279 MAGIC *mg;
3280
fef7148b
DM
3281 /* set reginfo->ganch, the position where \G can match */
3282
3283 reginfo->ganch =
3284 (flags & REXEC_IGNOREPOS)
3285 ? stringarg /* use start pos rather than pos() */
3dc78631 3286 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
3287 /* Defined pos(): */
3288 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
3289 : strbeg; /* pos() not defined; use start of string */
3290
6ad9a8ab 3291 DEBUG_GPOS_r(Perl_re_printf( aTHX_
147e3846 3292 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 3293
03c83e26
DM
3294 /* in the presence of \G, we may need to start looking earlier in
3295 * the string than the suggested start point of stringarg:
0b2c2a84 3296 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
3297 * offset, such as
3298 * /..\G/: gofs = 2
3299 * /ab|c\G/: gofs = 1
3300 * or if the minimum offset isn't known, then we have to go back
3301 * to the start of the string, e.g. /w+\G/
3302 */
2bfbe302 3303
8e1490ee 3304 if (prog->intflags & PREGf_ANCH_GPOS) {
4cf1a867
DM
3305 if (prog->gofs) {
3306 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3307 if (!startpos ||
3308 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3309 {
6d4548a7 3310 DEBUG_GPOS_r(Perl_re_printf( aTHX_
4cf1a867
DM
3311 "fail: ganch-gofs before earliest possible start\n"));
3312 return 0;
3313 }
2bfbe302 3314 }
4cf1a867
DM
3315 else
3316 startpos = reginfo->ganch;
2bfbe302
DM
3317 }
3318 else if (prog->gofs) {
4cf1a867
DM
3319 startpos = HOPBACKc(startpos, prog->gofs);
3320 if (!startpos)
b342a604 3321 startpos = strbeg;
03c83e26 3322 }
58430ea8 3323 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 3324 startpos = strbeg;
03c83e26
DM
3325 }
3326
3327 minlen = prog->minlen;
b342a604 3328 if ((startpos + minlen) > strend || startpos < strbeg) {
6a86f90c
KW
3329 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3330 "Regex match can't succeed, so not even tried\n"));
03c83e26
DM
3331 return 0;
3332 }
3333
63a3746a
DM
3334 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3335 * which will call destuctors to reset PL_regmatch_state, free higher
3336 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3337 * regmatch_info_aux_eval */
3338
3339 oldsave = PL_savestack_ix;
3340
dfa77d06
DM
3341 s = startpos;
3342
e322109a 3343 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
3344 && !(flags & REXEC_CHECKED))
3345 {
dfa77d06 3346 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 3347 flags, NULL);
dfa77d06 3348 if (!s)
7fadf4a7
DM
3349 return 0;
3350
e322109a 3351 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
3352 /* we can match based purely on the result of INTUIT.
3353 * Set up captures etc just for $& and $-[0]
3354 * (an intuit-only match wont have $1,$2,..) */
3355 assert(!prog->nparens);
d5e7783a
DM
3356
3357 /* s/// doesn't like it if $& is earlier than where we asked it to
3358 * start searching (which can happen on something like /.\G/) */
3359 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3360 && (s < stringarg))
3361 {
3362 /* this should only be possible under \G */
58430ea8 3363 assert(prog->intflags & PREGf_GPOS_SEEN);
6ad9a8ab 3364 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d5e7783a
DM
3365 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3366 goto phooey;
3367 }
3368
7fadf4a7
DM
3369 /* match via INTUIT shouldn't have any captures.
3370 * Let @-, @+, $^N know */
3371 prog->lastparen = prog->lastcloseparen = 0;
196a02af 3372 RXp_MATCH_UTF8_set(prog, utf8_target);
3ff69bd6
DM
3373 prog->offs[0].start = s - strbeg;
3374 prog->offs[0].end = utf8_target
8875b6de 3375 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3ff69bd6 3376 : s - strbeg + prog->minlenret;
7fadf4a7 3377 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3378 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
3379 strbeg, strend,
3380 sv, flags, utf8_target);
3381
7fadf4a7
DM
3382 return 1;
3383 }
3384 }
3385
6c3fea77 3386 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 3387
dfa77d06 3388 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
6ad9a8ab 3389 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a72c7584
JH
3390 "String too short [regexec_flags]...\n"));
3391 goto phooey;
1aa99e6b 3392 }
1de06328 3393
6eb5f6b9 3394 /* Check validity of program. */
f8fc2ecf 3395 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
3396 Perl_croak(aTHX_ "corrupted regexp program");
3397 }
3398
196a02af
DM
3399 RXp_MATCH_TAINTED_off(prog);
3400 RXp_MATCH_UTF8_set(prog, utf8_target);
1738e041 3401
6c3fea77
DM
3402 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3403 reginfo->intuit = 0;
02d5137b
DM
3404 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3405 reginfo->warned = FALSE;
02d5137b 3406 reginfo->sv = sv;
1cb48e53 3407 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
6eb5f6b9 3408 /* see how far we have to get to not match where we matched before */
fe3974be 3409 reginfo->till = stringarg + minend;
6eb5f6b9 3410
60779a30 3411 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
3412 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3413 S_cleanup_regmatch_info_aux has executed (registered by
3414 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3415 magic belonging to this SV.
3416 Not newSVsv, either, as it does not COW.
3417 */
3418 reginfo->sv = newSV(0);
4cba5ac0 3419 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
3420 SAVEFREESV(reginfo->sv);
3421 }
3422
331b2dcc
DM
3423 /* reserve next 2 or 3 slots in PL_regmatch_state:
3424 * slot N+0: may currently be in use: skip it
3425 * slot N+1: use for regmatch_info_aux struct
3426 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3427 * slot N+3: ready for use by regmatch()
3428 */
bf2039a9 3429
331b2dcc
DM
3430 {
3431 regmatch_state *old_regmatch_state;
3432 regmatch_slab *old_regmatch_slab;
3433 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3434
3435 /* on first ever match, allocate first slab */
3436 if (!PL_regmatch_slab) {
3437 Newx(PL_regmatch_slab, 1, regmatch_slab);
3438 PL_regmatch_slab->prev = NULL;
3439 PL_regmatch_slab->next = NULL;
3440 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3441 }
bf2039a9 3442
331b2dcc
DM
3443 old_regmatch_state = PL_regmatch_state;
3444 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 3445
331b2dcc
DM
3446 for (i=0; i <= max; i++) {
3447 if (i == 1)
3448 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3449 else if (i ==2)
3450 reginfo->info_aux_eval =
3451 reginfo->info_aux->info_aux_eval =
3452 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3453
331b2dcc
DM
3454 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3455 PL_regmatch_state = S_push_slab(aTHX);
3456 }
bf2039a9 3457
331b2dcc
DM
3458 /* note initial PL_regmatch_state position; at end of match we'll
3459 * pop back to there and free any higher slabs */
bf2039a9 3460
331b2dcc
DM
3461 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3462 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3463 reginfo->info_aux->poscache = NULL;