This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: S_find_byclass(): utf8ness in switch()
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
e7a474c0 95static const char non_utf8_target_but_utf8_required[]
e1cf74e3
CB
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
7b031478 99#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
6ad9a8ab 100 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
7b031478 101 goto target; \
e1cf74e3
CB
102} STMT_END
103
c74f6de9
KW
104#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 115
3dab1dad 116#define HOPc(pos,off) \
ba44c216 117 (char *)(reginfo->is_utf8_target \
220db18a 118 ? reghop3((U8*)pos, off, \
9d9163fb 119 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 120 : (U8*)(pos + off))
557f47af 121
bb152a4b
DM
122/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
123#define HOPBACK3(pos, off, lim) \
124 (reginfo->is_utf8_target \
125 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
126 : (pos - off >= lim) \
127 ? (U8*)pos - off \
3dab1dad 128 : NULL)
efb30f32 129
bb152a4b
DM
130#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
131
ba44c216 132#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 133#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 134
557f47af
DM
135/* lim must be +ve. Returns NULL on overshoot */
136#define HOPMAYBE3(pos,off,lim) \
137 (reginfo->is_utf8_target \
138 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
139 : ((U8*)pos + off <= lim) \
140 ? (U8*)pos + off \
141 : NULL)
142
8e9f2289
DM
143/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
144 * off must be >=0; args should be vars rather than expressions */
145#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
146 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
147 : (U8*)((pos + off) > lim ? lim : (pos + off)))
67853908 148#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
8e9f2289 149
2974eaec
DM
150#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
151 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
152 : (U8*)(pos + off))
153#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb 154
c7304fe2 155#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
156/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
157
5f80c4cf 158/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
159/* it would be nice to rework regcomp.sym to generate this stuff. sigh
160 *
161 * NOTE that *nothing* that affects backtracking should be in here, specifically
162 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
163 * node that is in between two EXACT like nodes when ascertaining what the required
164 * "follow" character is. This should probably be moved to regex compile time
165 * although it may be done at run time beause of the REF possibility - more
166 * investigation required. -- demerphq
167*/
baa60164
KW
168#define JUMPABLE(rn) ( \
169 OP(rn) == OPEN || \
24be3102
YO
170 (OP(rn) == CLOSE && \
171 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
baa60164
KW
172 OP(rn) == EVAL || \
173 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
174 OP(rn) == PLUS || OP(rn) == MINMOD || \
175 OP(rn) == KEEPS || \
176 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 177)
ee9b8eae 178#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 179
ee9b8eae
YO
180#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
181
a84d97b6
HS
182/*
183 Search for mandatory following text node; for lookahead, the text must
184 follow but for lookbehind (rn->flags != 0) we skip to the next step.
185*/
baa60164 186#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
187 while (JUMPABLE(rn)) { \
188 const OPCODE type = OP(rn); \
189 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 190 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 191 else if (type == PLUS) \
cca55fe3 192 rn = NEXTOPER(rn); \
3dab1dad 193 else if (type == IFMATCH) \
a84d97b6 194 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 195 else rn += NEXT_OFF(rn); \
3dab1dad 196 } \
5f80c4cf 197} STMT_END
74750237 198
006f26b2
DM
199#define SLAB_FIRST(s) (&(s)->states[0])
200#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
201
a75351a1 202static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 203static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 204static regmatch_state * S_push_slab(pTHX);
51371543 205
87c0511b 206#define REGCP_PAREN_ELEMS 3
f067efbf 207#define REGCP_OTHER_ELEMS 3
e0fa7e2b 208#define REGCP_FRAME_ELEMS 1
620d5b66
NC
209/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
210 * are needed for the regexp context stack bookkeeping. */
211
76e3520e 212STATIC CHECKPOINT
21553840 213S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
a0d0e21e 214{
a3b680e6 215 const int retval = PL_savestack_ix;
92da3157
DM
216 const int paren_elems_to_push =
217 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
218 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
219 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 220 I32 p;
271b36b1 221 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a0d0e21e 222
b93070ed
DM
223 PERL_ARGS_ASSERT_REGCPPUSH;
224
e49a9654 225 if (paren_elems_to_push < 0)
e8a85d26
JH
226 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
227 (int)paren_elems_to_push, (int)maxopenparen,
228 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 229
e0fa7e2b 230 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
147e3846 231 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
5df417d0 232 " out of range (%lu-%ld)",
92da3157
DM
233 total_elems,
234 (unsigned long)maxopenparen,
235 (long)parenfloor);
e0fa7e2b 236
620d5b66 237 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 238
495f47a5 239 DEBUG_BUFFERS_r(
92da3157 240 if ((int)maxopenparen > (int)parenfloor)
2b1a3689 241 Perl_re_exec_indentf( aTHX_
147e3846 242 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
2b1a3689
YO
243 depth,
244 PTR2UV(rex),
495f47a5
DM
245 PTR2UV(rex->offs)
246 );
247 );
92da3157 248 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 249/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
250 SSPUSHIV(rex->offs[p].end);
251 SSPUSHIV(rex->offs[p].start);
1ca2007e 252 SSPUSHINT(rex->offs[p].start_tmp);
2b1a3689 253 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 254 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
2b1a3689
YO
255 depth,
256 (UV)p,
495f47a5
DM
257 (IV)rex->offs[p].start,
258 (IV)rex->offs[p].start_tmp,
259 (IV)rex->offs[p].end
40a82448 260 ));
a0d0e21e 261 }
b1ce53c5 262/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 263 SSPUSHINT(maxopenparen);
b93070ed
DM
264 SSPUSHINT(rex->lastparen);
265 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 266 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 267
a0d0e21e
LW
268 return retval;
269}
270
c277df42 271/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
272#define REGCP_SET(cp) \
273 DEBUG_STATE_r( \
147e3846
KW
274 Perl_re_exec_indentf( aTHX_ \
275 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
cb41e5d6
YO
276 depth, (IV)PL_savestack_ix \
277 ) \
278 ); \
ab3bbdeb 279 cp = PL_savestack_ix
c3464db5 280
ab3bbdeb 281#define REGCP_UNWIND(cp) \
e4f74956 282 DEBUG_STATE_r( \
cb41e5d6 283 if (cp != PL_savestack_ix) \
147e3846
KW
284 Perl_re_exec_indentf( aTHX_ \
285 "Clearing an EVAL scope, savestack=%" \
286 IVdf "..%" IVdf "\n", \
cb41e5d6
YO
287 depth, (IV)(cp), (IV)PL_savestack_ix \
288 ) \
289 ); \
ab3bbdeb 290 regcpblow(cp)
c277df42 291
8e9ed7cc
DM
292/* set the start and end positions of capture ix */
293#define CLOSE_CAPTURE(ix, s, e) \
294 rex->offs[ix].start = s; \
295 rex->offs[ix].end = e; \
3be2a9fd
DM
296 if (ix > rex->lastparen) \
297 rex->lastparen = ix; \
298 rex->lastcloseparen = ix; \
69cd2617 299 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
befca383 300 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
69cd2617
DM
301 depth, \
302 PTR2UV(rex), \
303 PTR2UV(rex->offs), \
8e9ed7cc
DM
304 (UV)ix, \
305 (IV)rex->offs[ix].start, \
3be2a9fd
DM
306 (IV)rex->offs[ix].end, \
307 (UV)rex->lastparen \
69cd2617
DM
308 ))
309
a8d1f4b4 310#define UNWIND_PAREN(lp, lcp) \
befca383
DM
311 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
312 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
313 depth, \
314 PTR2UV(rex), \
315 PTR2UV(rex->offs), \
316 (UV)(lp), \
317 (UV)(rex->lastparen), \
318 (UV)(lcp) \
319 )); \
a8d1f4b4
DM
320 for (n = rex->lastparen; n > lp; n--) \
321 rex->offs[n].end = -1; \
322 rex->lastparen = n; \
323 rex->lastcloseparen = lcp;
324
325
f067efbf 326STATIC void
21553840 327S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
a0d0e21e 328{
e0fa7e2b 329 UV i;
87c0511b 330 U32 paren;
271b36b1 331 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a3621e74 332
7918f24d
NC
333 PERL_ARGS_ASSERT_REGCPPOP;
334
b1ce53c5 335 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 336 i = SSPOPUV;
e0fa7e2b
NC
337 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
338 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
339 rex->lastcloseparen = SSPOPINT;
340 rex->lastparen = SSPOPINT;
92da3157 341 *maxopenparen_p = SSPOPINT;
b1ce53c5 342
620d5b66 343 i -= REGCP_OTHER_ELEMS;
b1ce53c5 344 /* Now restore the parentheses context. */
495f47a5
DM
345 DEBUG_BUFFERS_r(
346 if (i || rex->lastparen + 1 <= rex->nparens)
2b1a3689 347 Perl_re_exec_indentf( aTHX_
147e3846 348 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
2b1a3689
YO
349 depth,
350 PTR2UV(rex),
495f47a5
DM
351 PTR2UV(rex->offs)
352 );
353 );
92da3157 354 paren = *maxopenparen_p;
620d5b66 355 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 356 SSize_t tmps;
1ca2007e 357 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
358 rex->offs[paren].start = SSPOPIV;
359 tmps = SSPOPIV;
b93070ed
DM
360 if (paren <= rex->lastparen)
361 rex->offs[paren].end = tmps;
2b1a3689 362 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 363 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
2b1a3689
YO
364 depth,
365 (UV)paren,
495f47a5
DM
366 (IV)rex->offs[paren].start,
367 (IV)rex->offs[paren].start_tmp,
368 (IV)rex->offs[paren].end,
369 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 370 );
87c0511b 371 paren--;
a0d0e21e 372 }
daf18116 373#if 1
dafc8851
JH
374 /* It would seem that the similar code in regtry()
375 * already takes care of this, and in fact it is in
376 * a better location to since this code can #if 0-ed out
377 * but the code in regtry() is needed or otherwise tests
378 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
379 * (as of patchlevel 7877) will fail. Then again,
380 * this code seems to be necessary or otherwise
225593e1
DM
381 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
382 * --jhi updated by dapm */
b93070ed 383 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 384 if (i > *maxopenparen_p)
b93070ed
DM
385 rex->offs[i].start = -1;
386 rex->offs[i].end = -1;
2b1a3689 387 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 388 " \\%" UVuf ": %s ..-1 undeffing\n",
2b1a3689
YO
389 depth,
390 (UV)i,
92da3157 391 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 392 ));
a0d0e21e 393 }
dafc8851 394#endif
a0d0e21e
LW
395}
396
74088413
DM
397/* restore the parens and associated vars at savestack position ix,
398 * but without popping the stack */
399
400STATIC void
21553840 401S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
74088413
DM
402{
403 I32 tmpix = PL_savestack_ix;
85882954
YO
404 PERL_ARGS_ASSERT_REGCP_RESTORE;
405
74088413 406 PL_savestack_ix = ix;
21553840 407 regcppop(rex, maxopenparen_p);
74088413
DM
408 PL_savestack_ix = tmpix;
409}
410
02db2b7b 411#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 412
9637d2a5
CB
413#ifndef PERL_IN_XSUB_RE
414
24e16d7b
KW
415bool
416Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
31c7f561
KW
417{
418 /* Returns a boolean as to whether or not 'character' is a member of the
419 * Posix character class given by 'classnum' that should be equivalent to a
420 * value in the typedef '_char_class_number'.
421 *
422 * Ideally this could be replaced by a just an array of function pointers
423 * to the C library functions that implement the macros this calls.
424 * However, to compile, the precise function signatures are required, and
a3815e44 425 * these may vary from platform to platform. To avoid having to figure
31c7f561 426 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
427 * which adds an extra layer of function call overhead (unless the C
428 * optimizer strips it away). But we don't particularly care about
429 * performance with locales anyway. */
31c7f561
KW
430
431 switch ((_char_class_number) classnum) {
15861f94 432 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 433 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
434 case _CC_ENUM_ASCII: return isASCII_LC(character);
435 case _CC_ENUM_BLANK: return isBLANK_LC(character);
cee69f79 436 case _CC_ENUM_CASED: return isLOWER_LC(character)
b0d691b2 437 || isUPPER_LC(character);
e8d596e0 438 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
439 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
440 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
441 case _CC_ENUM_LOWER: return isLOWER_LC(character);
442 case _CC_ENUM_PRINT: return isPRINT_LC(character);
443 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 444 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
445 case _CC_ENUM_UPPER: return isUPPER_LC(character);
446 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 447 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
448 default: /* VERTSPACE should never occur in locales */
449 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
450 }
451
e5964223 452 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
453 return FALSE;
454}
455
9637d2a5
CB
456#endif
457
53362e85
KW
458PERL_STATIC_INLINE I32
459S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
460{
461 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
462 * folded. Works on all folds representable without UTF-8, except for
463 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
464 * check that the strings each have at least 'len' characters.
465 *
466 * There is almost an identical API function where s2 need not be folded:
467 * Perl_foldEQ_latin1() */
468
469 const U8 *a = (const U8 *)s1;
470 const U8 *b = (const U8 *)s2;
471
472 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
473
474 assert(len >= 0);
475
476 while (len--) {
477 assert(! isUPPER_L1(*b));
478 if (toLOWER_L1(*a) != *b) {
479 return 0;
480 }
481 a++, b++;
482 }
483 return 1;
484}
485
3018b823 486STATIC bool
a78c2fa6 487S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
3018b823
KW
488{
489 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
490 * 'character' is a member of the Posix character class given by 'classnum'
491 * that should be equivalent to a value in the typedef
492 * '_char_class_number'.
493 *
494 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 495 * the range 0-255. Outside that range, all characters use Unicode
3018b823 496 * rules, ignoring any locale. So use the Unicode function if this class
4c404f26 497 * requires an inversion list, and use the Unicode macro otherwise. */
3018b823 498
1565c085 499
3018b823
KW
500 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
501
502 if (UTF8_IS_INVARIANT(*character)) {
503 return isFOO_lc(classnum, *character);
504 }
505 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
506 return isFOO_lc(classnum,
a62b247b 507 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
508 }
509
a78c2fa6 510 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
613abc6d 511
3018b823 512 switch ((_char_class_number) classnum) {
779cf272 513 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
514 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
515 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
516 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
4d3d8522
KW
517 default:
518 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
a78c2fa6 519 utf8_to_uvchr_buf(character, e, NULL));
3018b823
KW
520 }
521
e1ee3960 522 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
523}
524
2b1f9c71
KW
525STATIC U8 *
526S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
ab1efbdc
KW
527{
528 /* Returns the position of the first byte in the sequence between 's' and
529 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
530 * */
531
532 PERL_ARGS_ASSERT_FIND_SPAN_END;
533
534 assert(send >= s);
535
536 if ((STRLEN) (send - s) >= PERL_WORDSIZE
537 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
538 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
539 {
540 PERL_UINTMAX_T span_word;
541
542 /* Process per-byte until reach word boundary. XXX This loop could be
543 * eliminated if we knew that this platform had fast unaligned reads */
544 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
545 if (*s != span_byte) {
546 return s;
547 }
548 s++;
549 }
550
551 /* Create a word filled with the bytes we are spanning */
552 span_word = PERL_COUNT_MULTIPLIER * span_byte;
553
554 /* Process per-word as long as we have at least a full word left */
555 do {
556
557 /* Keep going if the whole word is composed of 'span_byte's */
558 if ((* (PERL_UINTMAX_T *) s) == span_word) {
559 s += PERL_WORDSIZE;
560 continue;
561 }
562
23a7ee81
KW
563 /* Here, at least one byte in the word isn't 'span_byte'. */
564
565#ifdef EBCDIC
566
567 break;
568
569#else
570
571 /* This xor leaves 1 bits only in those non-matching bytes */
ab1efbdc
KW
572 span_word ^= * (PERL_UINTMAX_T *) s;
573
574 /* Make sure the upper bit of each non-matching byte is set. This
575 * makes each such byte look like an ASCII platform variant byte */
576 span_word |= span_word << 1;
577 span_word |= span_word << 2;
578 span_word |= span_word << 4;
579
580 /* That reduces the problem to what this function solves */
73f0a2eb 581 return s + variant_byte_number(span_word);
ab1efbdc 582
23a7ee81
KW
583#endif
584
ab1efbdc
KW
585 } while (s + PERL_WORDSIZE <= send);
586 }
587
588 /* Process the straggler bytes beyond the final word boundary */
589 while (s < send) {
590 if (*s != span_byte) {
591 return s;
592 }
593 s++;
594 }
595
596 return s;
597}
598
2b1f9c71
KW
599STATIC U8 *
600S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
2813d4ad
KW
601{
602 /* Returns the position of the first byte in the sequence between 's'
603 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
604 * returns 'send' if none found. It uses word-level operations instead of
605 * byte to speed up the process */
606
607 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
608
609 assert(send >= s);
610 assert((byte & mask) == byte);
611
23a7ee81
KW
612#ifndef EBCDIC
613
2813d4ad
KW
614 if ((STRLEN) (send - s) >= PERL_WORDSIZE
615 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
616 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
617 {
1d8aafa0 618 PERL_UINTMAX_T word, mask_word;
2813d4ad
KW
619
620 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
2b1f9c71 621 if (((*s) & mask) == byte) {
2813d4ad
KW
622 return s;
623 }
624 s++;
625 }
626
1d8aafa0
KW
627 word = PERL_COUNT_MULTIPLIER * byte;
628 mask_word = PERL_COUNT_MULTIPLIER * mask;
2813d4ad
KW
629
630 do {
631 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
632
1d8aafa0
KW
633 /* If 'masked' contains bytes with the bit pattern of 'byte' within
634 * it, xoring with 'word' will leave each of the 8 bits in such
635 * bytes be 0, and no byte containing any other bit pattern will be
636 * 0. */
637 masked ^= word;
2813d4ad
KW
638
639 /* This causes the most significant bit to be set to 1 for any
640 * bytes in the word that aren't completely 0 */
641 masked |= masked << 1;
642 masked |= masked << 2;
643 masked |= masked << 4;
644
645 /* The msbits are the same as what marks a byte as variant, so we
646 * can use this mask. If all msbits are 1, the word doesn't
647 * contain 'byte' */
648 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
649 s += PERL_WORDSIZE;
650 continue;
651 }
652
653 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
654 * and any that are, are 0. Complement and re-AND to swap that */
655 masked = ~ masked;
656 masked &= PERL_VARIANTS_WORD_MASK;
657
658 /* This reduces the problem to that solved by this function */
73f0a2eb 659 s += variant_byte_number(masked);
2813d4ad
KW
660 return s;
661
662 } while (s + PERL_WORDSIZE <= send);
663 }
664
23a7ee81
KW
665#endif
666
2813d4ad 667 while (s < send) {
2b1f9c71 668 if (((*s) & mask) == byte) {
2813d4ad
KW
669 return s;
670 }
671 s++;
672 }
673
674 return s;
675}
676
070e8b2e
KW
677STATIC U8 *
678S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
679{
680 /* Returns the position of the first byte in the sequence between 's' and
681 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
682 * 'span_byte' should have been ANDed with 'mask' in the call of this
683 * function. Returns 'send' if none found. Works like find_span_end(),
684 * except for the AND */
685
686 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
687
688 assert(send >= s);
689 assert((span_byte & mask) == span_byte);
690
691 if ((STRLEN) (send - s) >= PERL_WORDSIZE
692 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
693 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
694 {
695 PERL_UINTMAX_T span_word, mask_word;
696
697 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
2b1f9c71 698 if (((*s) & mask) != span_byte) {
070e8b2e
KW
699 return s;
700 }
701 s++;
702 }
703
704 span_word = PERL_COUNT_MULTIPLIER * span_byte;
705 mask_word = PERL_COUNT_MULTIPLIER * mask;
706
707 do {
708 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
709
710 if (masked == span_word) {
711 s += PERL_WORDSIZE;
712 continue;
713 }
714
23a7ee81
KW
715#ifdef EBCDIC
716
717 break;
718
719#else
720
070e8b2e
KW
721 masked ^= span_word;
722 masked |= masked << 1;
723 masked |= masked << 2;
724 masked |= masked << 4;
73f0a2eb 725 return s + variant_byte_number(masked);
070e8b2e 726
23a7ee81
KW
727#endif
728
070e8b2e
KW
729 } while (s + PERL_WORDSIZE <= send);
730 }
731
732 while (s < send) {
2b1f9c71 733 if (((*s) & mask) != span_byte) {
070e8b2e
KW
734 return s;
735 }
736 s++;
737 }
738
739 return s;
740}
741
a687059c 742/*
e50aee73 743 * pregexec and friends
a687059c
LW
744 */
745
76234dfb 746#ifndef PERL_IN_XSUB_RE
a687059c 747/*
c277df42 748 - pregexec - match a regexp against a string
a687059c 749 */
c277df42 750I32
5aaab254 751Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 752 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
753/* stringarg: the point in the string at which to begin matching */
754/* strend: pointer to null at end of string */
755/* strbeg: real beginning of string */
756/* minend: end of match must be >= minend bytes after stringarg. */
757/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
758 * itself is accessed via the pointers above */
759/* nosave: For optimizations. */
c277df42 760{
7918f24d
NC
761 PERL_ARGS_ASSERT_PREGEXEC;
762
c277df42 763 return
9041c2e3 764 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
765 nosave ? 0 : REXEC_COPY_STR);
766}
76234dfb 767#endif
22e551b9 768
cad2e5aa 769
6eb5f6b9 770
1a4edc3c
DM
771/* re_intuit_start():
772 *
773 * Based on some optimiser hints, try to find the earliest position in the
774 * string where the regex could match.
775 *
776 * rx: the regex to match against
777 * sv: the SV being matched: only used for utf8 flag; the string
778 * itself is accessed via the pointers below. Note that on
779 * something like an overloaded SV, SvPOK(sv) may be false
780 * and the string pointers may point to something unrelated to
781 * the SV itself.
782 * strbeg: real beginning of string
783 * strpos: the point in the string at which to begin matching
784 * strend: pointer to the byte following the last char of the string
785 * flags currently unused; set to 0
786 * data: currently unused; set to NULL
787 *
788 * The basic idea of re_intuit_start() is to use some known information
789 * about the pattern, namely:
790 *
791 * a) the longest known anchored substring (i.e. one that's at a
792 * constant offset from the beginning of the pattern; but not
793 * necessarily at a fixed offset from the beginning of the
794 * string);
795 * b) the longest floating substring (i.e. one that's not at a constant
796 * offset from the beginning of the pattern);
797 * c) Whether the pattern is anchored to the string; either
798 * an absolute anchor: /^../, or anchored to \n: /^.../m,
799 * or anchored to pos(): /\G/;
800 * d) A start class: a real or synthetic character class which
801 * represents which characters are legal at the start of the pattern;
802 *
803 * to either quickly reject the match, or to find the earliest position
804 * within the string at which the pattern might match, thus avoiding
805 * running the full NFA engine at those earlier locations, only to
806 * eventually fail and retry further along.
807 *
808 * Returns NULL if the pattern can't match, or returns the address within
809 * the string which is the earliest place the match could occur.
810 *
811 * The longest of the anchored and floating substrings is called 'check'
812 * and is checked first. The other is called 'other' and is checked
813 * second. The 'other' substring may not be present. For example,
814 *
815 * /(abc|xyz)ABC\d{0,3}DEFG/
816 *
817 * will have
818 *
819 * check substr (float) = "DEFG", offset 6..9 chars
820 * other substr (anchored) = "ABC", offset 3..3 chars
821 * stclass = [ax]
822 *
823 * Be aware that during the course of this function, sometimes 'anchored'
824 * refers to a substring being anchored relative to the start of the
825 * pattern, and sometimes to the pattern itself being anchored relative to
826 * the string. For example:
827 *
828 * /\dabc/: "abc" is anchored to the pattern;
829 * /^\dabc/: "abc" is anchored to the pattern and the string;
830 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
831 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
832 * but the pattern is anchored to the string.
52a21eb3
DM
833 */
834
cad2e5aa 835char *
52a21eb3
DM
836Perl_re_intuit_start(pTHX_
837 REGEXP * const rx,
838 SV *sv,
839 const char * const strbeg,
840 char *strpos,
841 char *strend,
842 const U32 flags,
843 re_scream_pos_data *data)
cad2e5aa 844{
8d919b0a 845 struct regexp *const prog = ReANY(rx);
b2ad2123
DM
846 SSize_t start_shift = prog->check_offset_min;
847 /* Should be nonnegative! */
848 SSize_t end_shift = 0;
0fc004dd
DM
849 /* current lowest pos in string where the regex can start matching */
850 char *rx_origin = strpos;
eb578fdb 851 SV *check;
f2ed9b32 852 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 853 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 854 bool ml_anch = 0;
8f4bf5fc 855 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 856 char *check_at = NULL; /* check substr found at this pos */
bbe252da 857 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 858 RXi_GET_DECL(prog,progi);
02d5137b
DM
859 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
860 regmatch_info *const reginfo = &reginfo_buf;
271b36b1 861 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a3621e74 862
7918f24d 863 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
864 PERL_UNUSED_ARG(flags);
865 PERL_UNUSED_ARG(data);
7918f24d 866
6ad9a8ab 867 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0
DM
868 "Intuit: trying to determine minimum start position...\n"));
869
fb9bbddb 870 /* for now, assume that all substr offsets are positive. If at some point
f67a5002 871 * in the future someone wants to do clever things with lookbehind and
fb9bbddb
DM
872 * -ve offsets, they'll need to fix up any code in this function
873 * which uses these offsets. See the thread beginning
874 * <20140113145929.GF27210@iabyn.com>
875 */
876 assert(prog->substrs->data[0].min_offset >= 0);
877 assert(prog->substrs->data[0].max_offset >= 0);
878 assert(prog->substrs->data[1].min_offset >= 0);
879 assert(prog->substrs->data[1].max_offset >= 0);
880 assert(prog->substrs->data[2].min_offset >= 0);
881 assert(prog->substrs->data[2].max_offset >= 0);
882
f7022b5a 883 /* for now, assume that if both present, that the floating substring
83f2232d 884 * doesn't start before the anchored substring.
f7022b5a
DM
885 * If you break this assumption (e.g. doing better optimisations
886 * with lookahead/behind), then you'll need to audit the code in this
887 * function carefully first
888 */
889 assert(
890 ! ( (prog->anchored_utf8 || prog->anchored_substr)
891 && (prog->float_utf8 || prog->float_substr))
892 || (prog->float_min_offset >= prog->anchored_offset));
893
1a4edc3c
DM
894 /* byte rather than char calculation for efficiency. It fails
895 * to quickly reject some cases that can't match, but will reject
896 * them later after doing full char arithmetic */
c344f387 897 if (prog->minlen > strend - strpos) {
6ad9a8ab 898 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 899 " String too short...\n"));
cad2e5aa 900 goto fail;
2c2d71f5 901 }
d8da0584 902
196a02af 903 RXp_MATCH_UTF8_set(prog, utf8_target);
6c3fea77 904 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 905 reginfo->info_aux = NULL;
9d9163fb 906 reginfo->strbeg = strbeg;
220db18a 907 reginfo->strend = strend;
aed7b151 908 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 909 reginfo->intuit = 1;
1cb48e53
DM
910 /* not actually used within intuit, but zero for safety anyway */
911 reginfo->poscache_maxiter = 0;
02d5137b 912
f2ed9b32 913 if (utf8_target) {
2814f4b3
HS
914 if ((!prog->anchored_utf8 && prog->anchored_substr)
915 || (!prog->float_utf8 && prog->float_substr))
33b8afdf
JH
916 to_utf8_substr(prog);
917 check = prog->check_utf8;
918 } else {
7e0d5ad7
KW
919 if (!prog->check_substr && prog->check_utf8) {
920 if (! to_byte_substr(prog)) {
6b54ddc5 921 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
922 }
923 }
33b8afdf
JH
924 check = prog->check_substr;
925 }
274cd312 926
1dc475d0
DM
927 /* dump the various substring data */
928 DEBUG_OPTIMISE_MORE_r({
929 int i;
930 for (i=0; i<=2; i++) {
931 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
932 : prog->substrs->data[i].substr);
933 if (!sv)
934 continue;
935
6ad9a8ab 936 Perl_re_printf( aTHX_
147e3846
KW
937 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
938 " useful=%" IVdf " utf8=%d [%s]\n",
1dc475d0
DM
939 i,
940 (IV)prog->substrs->data[i].min_offset,
941 (IV)prog->substrs->data[i].max_offset,
942 (IV)prog->substrs->data[i].end_shift,
943 BmUSEFUL(sv),
944 utf8_target ? 1 : 0,
945 SvPEEK(sv));
946 }
947 });
948
8e1490ee 949 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
950
951 /* ml_anch: check after \n?
952 *
0fa70a06 953 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
954 * with /.*.../, these flags will have been added by the
955 * compiler:
956 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
957 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
958 */
7d2d37f5
DM
959 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
960 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 961
343c8a29 962 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
963 /* we are only allowed to match at BOS or \G */
964
57fcbfa7 965 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
966 *
967 * Note that we don't try to do a similar quick reject for
968 * \G, since generally the caller will have calculated strpos
969 * based on pos() and gofs, so the string is already correctly
970 * anchored by definition; and handling the exceptions would
971 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 972 */
7bb3b9eb 973 if ( strpos != strbeg
d3d47aac 974 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8 975 {
6ad9a8ab 976 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 977 " Not at start...\n"));
c889ccc8
DM
978 goto fail;
979 }
980
a5d12a4b
DM
981 /* in the presence of an anchor, the anchored (relative to the
982 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
983 * to strpos. So quickly reject if substr isn't found there.
984 * This works for \G too, because the caller will already have
985 * subtracted gofs from pos, and gofs is the offset from the
986 * \G to the start of the regex. For example, in /.abc\Gdef/,
987 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
988 * caller will have set strpos=pos()-4; we look for the substr
989 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 990
33c28ab2 991 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 992 /* Substring at constant offset from beg-of-str... */
b2ad2123 993 SSize_t slen = SvCUR(check);
343c8a29 994 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 995
6ad9a8ab 996 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 997 " Looking for check substr at fixed offset %" IVdf "...\n",
1dc475d0
DM
998 (IV)prog->check_offset_min));
999
7742aa66
DM
1000 if (SvTAIL(check)) {
1001 /* In this case, the regex is anchored at the end too.
1002 * Unless it's a multiline match, the lengths must match
b2ad2123 1003 * exactly, give or take a \n. NB: slen >= 1 since
7742aa66
DM
1004 * the last char of check is \n */
1005 if (!multiline
b2ad2123
DM
1006 && ( strend - s > slen
1007 || strend - s < slen - 1
1008 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8 1009 {
6ad9a8ab 1010 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1011 " String too long...\n"));
c889ccc8
DM
1012 goto fail_finish;
1013 }
b2ad2123
DM
1014 /* Now should match s[0..slen-2] */
1015 slen--;
c889ccc8 1016 }
b2ad2123 1017 if (slen && (strend - s < slen
26fb2318 1018 || *SvPVX_const(check) != *s
b2ad2123 1019 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
d307bf57 1020 {
6ad9a8ab 1021 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1022 " String not equal...\n"));
d307bf57
DM
1023 goto fail_finish;
1024 }
c889ccc8
DM
1025
1026 check_at = s;
1027 goto success_at_start;
cad2e5aa 1028 }
cad2e5aa 1029 }
cad2e5aa 1030 }
0fc004dd 1031
b2ad2123 1032 end_shift = prog->check_end_shift;
cad2e5aa 1033
19188028 1034#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
b2ad2123 1035 if (end_shift < 0)
147e3846 1036 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
b2ad2123 1037 (IV)end_shift, RX_PRECOMP(rx));
2c2d71f5
JH
1038#endif
1039
2c2d71f5 1040 restart:
1de06328 1041
66b7ec5c
DM
1042 /* This is the (re)entry point of the main loop in this function.
1043 * The goal of this loop is to:
1044 * 1) find the "check" substring in the region rx_origin..strend
b2ad2123 1045 * (adjusted by start_shift / end_shift). If not found, reject
66b7ec5c
DM
1046 * immediately.
1047 * 2) If it exists, look for the "other" substr too if defined; for
1048 * example, if the check substr maps to the anchored substr, then
1049 * check the floating substr, and vice-versa. If not found, go
1050 * back to (1) with rx_origin suitably incremented.
1051 * 3) If we find an rx_origin position that doesn't contradict
1052 * either of the substrings, then check the possible additional
1053 * constraints on rx_origin of /^.../m or a known start class.
1054 * If these fail, then depending on which constraints fail, jump
1055 * back to here, or to various other re-entry points further along
1056 * that skip some of the first steps.
1057 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1058 * substring. If the start position was determined to be at the
1059 * beginning of the string - so, not rejected, but not optimised,
1060 * since we have to run regmatch from position 0 - decrement the
1061 * BmUSEFUL() count. Otherwise increment it.
1062 */
1063
1a4edc3c
DM
1064
1065 /* first, look for the 'check' substring */
1066
1de06328 1067 {
c33e64f0
FC
1068 U8* start_point;
1069 U8* end_point;
c889ccc8 1070
c889ccc8 1071 DEBUG_OPTIMISE_MORE_r({
6ad9a8ab 1072 Perl_re_printf( aTHX_
147e3846
KW
1073 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1074 " Start shift: %" IVdf " End shift %" IVdf
1075 " Real end Shift: %" IVdf "\n",
675e93ee 1076 (IV)(rx_origin - strbeg),
c889ccc8 1077 (IV)prog->check_offset_min,
b2ad2123
DM
1078 (IV)start_shift,
1079 (IV)end_shift,
c889ccc8
DM
1080 (IV)prog->check_end_shift);
1081 });
1de06328 1082
b2ad2123 1083 end_point = HOPBACK3(strend, end_shift, rx_origin);
bb152a4b
DM
1084 if (!end_point)
1085 goto fail_finish;
b2ad2123 1086 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
33c28ab2
DM
1087 if (!start_point)
1088 goto fail_finish;
c889ccc8 1089
557f47af 1090
e0362b86 1091 /* If the regex is absolutely anchored to either the start of the
d3d47aac 1092 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
1093 * check_offset_max represents an upper bound on the string where
1094 * the substr could start. For the ANCH_GPOS case, we assume that
1095 * the caller of intuit will have already set strpos to
1096 * pos()-gofs, so in this case strpos + offset_max will still be
1097 * an upper bound on the substr.
1098 */
c19c836a
DM
1099 if (!ml_anch
1100 && prog->intflags & PREGf_ANCH
e0362b86 1101 && prog->check_offset_max != SSize_t_MAX)
c19c836a 1102 {
b2ad2123 1103 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
1104 const char * const anchor =
1105 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
b2ad2123 1106 SSize_t targ_len = (char*)end_point - anchor;
2ce94a86 1107
b2ad2123 1108 if (check_len > targ_len) {
2ce94a86 1109 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
12453e29 1110 "Target string too short to match required substring...\n"));
2ce94a86
DM
1111 goto fail_finish;
1112 }
e0362b86
DM
1113
1114 /* do a bytes rather than chars comparison. It's conservative;
1115 * so it skips doing the HOP if the result can't possibly end
1116 * up earlier than the old value of end_point.
1117 */
b2ad2123
DM
1118 assert(anchor + check_len <= (char *)end_point);
1119 if (prog->check_offset_max + check_len < targ_len) {
e0362b86
DM
1120 end_point = HOP3lim((U8*)anchor,
1121 prog->check_offset_max,
b2ad2123 1122 end_point - check_len
2ce94a86 1123 )
b2ad2123 1124 + check_len;
12453e29
YO
1125 if (end_point < start_point)
1126 goto fail_finish;
e0362b86 1127 }
d6ef1678
DM
1128 }
1129
ae5d4331 1130 check_at = fbm_instr( start_point, end_point,
7fba1cd6 1131 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 1132
6ad9a8ab 1133 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1134 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
675e93ee
DM
1135 (IV)((char*)start_point - strbeg),
1136 (IV)((char*)end_point - strbeg),
1137 (IV)(check_at ? check_at - strbeg : -1)
1138 ));
1139
8fd34720
DM
1140 /* Update the count-of-usability, remove useless subpatterns,
1141 unshift s. */
1142
1143 DEBUG_EXECUTE_r({
1144 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1145 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
6ad9a8ab 1146 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
8fd34720
DM
1147 (check_at ? "Found" : "Did not find"),
1148 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1149 ? "anchored" : "floating"),
1150 quoted,
1151 RE_SV_TAIL(check),
1152 (check_at ? " at offset " : "...\n") );
1153 });
2c2d71f5 1154
8fd34720
DM
1155 if (!check_at)
1156 goto fail_finish;
8fd34720
DM
1157 /* set rx_origin to the minimum position where the regex could start
1158 * matching, given the constraint of the just-matched check substring.
1159 * But don't set it lower than previously.
1160 */
fdc003fd 1161
8fd34720
DM
1162 if (check_at - rx_origin > prog->check_offset_max)
1163 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee 1164 /* Finish the diagnostic message */
6ad9a8ab 1165 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1166 "%ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1167 (long)(check_at - strbeg),
1168 (IV)(rx_origin - strbeg)
1169 ));
8fd34720 1170 }
fdc003fd
DM
1171
1172
1a4edc3c 1173 /* now look for the 'other' substring if defined */
2c2d71f5 1174
fd8def15
HS
1175 if (prog->substrs->data[other_ix].utf8_substr
1176 || prog->substrs->data[other_ix].substr)
1de06328 1177 {
30944b6d 1178 /* Take into account the "other" substring. */
6c3343a6
DM
1179 char *last, *last1;
1180 char *s;
1181 SV* must;
1182 struct reg_substr_datum *other;
1183
1184 do_other_substr:
1185 other = &prog->substrs->data[other_ix];
fd8def15
HS
1186 if (!utf8_target && !other->substr) {
1187 if (!to_byte_substr(prog)) {
1188 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1189 }
1190 }
6c3343a6
DM
1191
1192 /* if "other" is anchored:
1193 * we've previously found a floating substr starting at check_at.
1194 * This means that the regex origin must lie somewhere
1195 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1196 * and max: HOP3(check_at, -check_offset_min)
1197 * (except that min will be >= strpos)
1198 * So the fixed substr must lie somewhere between
1199 * HOP3(min, anchored_offset)
1200 * HOP3(max, anchored_offset) + SvCUR(substr)
1201 */
1202
1203 /* if "other" is floating
1204 * Calculate last1, the absolute latest point where the
1205 * floating substr could start in the string, ignoring any
1206 * constraints from the earlier fixed match. It is calculated
1207 * as follows:
1208 *
1209 * strend - prog->minlen (in chars) is the absolute latest
1210 * position within the string where the origin of the regex
1211 * could appear. The latest start point for the floating
1212 * substr is float_min_offset(*) on from the start of the
1213 * regex. last1 simply combines thee two offsets.
1214 *
1215 * (*) You might think the latest start point should be
1216 * float_max_offset from the regex origin, and technically
1217 * you'd be correct. However, consider
1218 * /a\d{2,4}bcd\w/
1219 * Here, float min, max are 3,5 and minlen is 7.
1220 * This can match either
1221 * /a\d\dbcd\w/
1222 * /a\d\d\dbcd\w/
1223 * /a\d\d\d\dbcd\w/
1224 * In the first case, the regex matches minlen chars; in the
1225 * second, minlen+1, in the third, minlen+2.
1226 * In the first case, the floating offset is 3 (which equals
1227 * float_min), in the second, 4, and in the third, 5 (which
1228 * equals float_max). In all cases, the floating string bcd
1229 * can never start more than 4 chars from the end of the
1230 * string, which equals minlen - float_min. As the substring
1231 * starts to match more than float_min from the start of the
1232 * regex, it makes the regex match more than minlen chars,
1233 * and the two cancel each other out. So we can always use
1234 * float_min - minlen, rather than float_max - minlen for the
1235 * latest position in the string.
1236 *
1237 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1238 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1239 */
1240
e7a14a9c 1241 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1242 last1 = HOP3c(strend,
1243 other->min_offset - prog->minlen, strbeg);
1244
4d006249 1245 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1246 /* last is the latest point where the floating substr could
1247 * start, *given* any constraints from the earlier fixed
1248 * match. This constraint is that the floating string starts
1249 * <= float_max_offset chars from the regex origin (rx_origin).
1250 * If this value is less than last1, use it instead.
eb3831ce 1251 */
6c3343a6
DM
1252 assert(rx_origin <= last1);
1253 last =
1254 /* this condition handles the offset==infinity case, and
1255 * is a short-cut otherwise. Although it's comparing a
1256 * byte offset to a char length, it does so in a safe way,
1257 * since 1 char always occupies 1 or more bytes,
1258 * so if a string range is (last1 - rx_origin) bytes,
1259 * it will be less than or equal to (last1 - rx_origin)
1260 * chars; meaning it errs towards doing the accurate HOP3
1261 * rather than just using last1 as a short-cut */
1262 (last1 - rx_origin) < other->max_offset
1263 ? last1
1264 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1265 }
1266 else {
b2ad2123
DM
1267 assert(strpos + start_shift <= check_at);
1268 last = HOP4c(check_at, other->min_offset - start_shift,
6c3343a6
DM
1269 strbeg, strend);
1270 }
ead917d0 1271
6c3343a6
DM
1272 s = HOP3c(rx_origin, other->min_offset, strend);
1273 if (s < other_last) /* These positions already checked */
1274 s = other_last;
1275
1276 must = utf8_target ? other->utf8_substr : other->substr;
1277 assert(SvPOK(must));
675e93ee
DM
1278 {
1279 char *from = s;
1280 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1281
71a9d105
DM
1282 if (to > strend)
1283 to = strend;
88203927
DM
1284 if (from > to) {
1285 s = NULL;
6ad9a8ab 1286 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1287 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
88203927
DM
1288 (IV)(from - strbeg),
1289 (IV)(to - strbeg)
1290 ));
1291 }
1292 else {
1293 s = fbm_instr(
1294 (unsigned char*)from,
1295 (unsigned char*)to,
1296 must,
1297 multiline ? FBMrf_MULTILINE : 0
1298 );
6ad9a8ab 1299 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1300 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
88203927
DM
1301 (IV)(from - strbeg),
1302 (IV)(to - strbeg),
1303 (IV)(s ? s - strbeg : -1)
1304 ));
1305 }
675e93ee
DM
1306 }
1307
6c3343a6
DM
1308 DEBUG_EXECUTE_r({
1309 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1310 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 1311 Perl_re_printf( aTHX_ " %s %s substr %s%s",
6c3343a6
DM
1312 s ? "Found" : "Contradicts",
1313 other_ix ? "floating" : "anchored",
1314 quoted, RE_SV_TAIL(must));
1315 });
ead917d0 1316
ead917d0 1317
6c3343a6
DM
1318 if (!s) {
1319 /* last1 is latest possible substr location. If we didn't
1320 * find it before there, we never will */
1321 if (last >= last1) {
6ad9a8ab 1322 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee 1323 "; giving up...\n"));
6c3343a6 1324 goto fail_finish;
ead917d0
DM
1325 }
1326
6c3343a6
DM
1327 /* try to find the check substr again at a later
1328 * position. Maybe next time we'll find the "other" substr
1329 * in range too */
6c3343a6
DM
1330 other_last = HOP3c(last, 1, strend) /* highest failure */;
1331 rx_origin =
4d006249 1332 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1333 ? HOP3c(rx_origin, 1, strend)
1334 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
6ad9a8ab 1335 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1336 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1337 (other_ix ? "floating" : "anchored"),
1338 (long)(HOP3c(check_at, 1, strend) - strbeg),
1339 (IV)(rx_origin - strbeg)
1340 ));
6c3343a6
DM
1341 goto restart;
1342 }
1343 else {
4d006249 1344 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1345 /* other_last is set to s, not s+1, since its possible for
1346 * a floating substr to fail first time, then succeed
1347 * second time at the same floating position; e.g.:
1348 * "-AB--AABZ" =~ /\wAB\d*Z/
1349 * The first time round, anchored and float match at
1350 * "-(AB)--AAB(Z)" then fail on the initial \w character
1351 * class. Second time round, they match at "-AB--A(AB)(Z)".
1352 */
1353 other_last = s;
ead917d0
DM
1354 }
1355 else {
6c3343a6
DM
1356 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1357 other_last = HOP3c(s, 1, strend);
ead917d0 1358 }
6ad9a8ab 1359 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1360 " at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1361 (long)(s - strbeg),
1362 (IV)(rx_origin - strbeg)
1363 ));
1364
6c3343a6 1365 }
cad2e5aa 1366 }
acba93e8
DM
1367 else {
1368 DEBUG_OPTIMISE_MORE_r(
6ad9a8ab 1369 Perl_re_printf( aTHX_
147e3846
KW
1370 " Check-only match: offset min:%" IVdf " max:%" IVdf
1371 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1372 " strend:%" IVdf "\n",
acba93e8
DM
1373 (IV)prog->check_offset_min,
1374 (IV)prog->check_offset_max,
675e93ee
DM
1375 (IV)(check_at-strbeg),
1376 (IV)(rx_origin-strbeg),
1c1c599d 1377 (IV)(rx_origin-check_at),
675e93ee 1378 (IV)(strend-strbeg)
acba93e8
DM
1379 )
1380 );
1381 }
2c2d71f5 1382
acba93e8 1383 postprocess_substr_matches:
0991020e 1384
1a4edc3c 1385 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1386
7d2d37f5 1387 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1388 char *s;
1389
6ad9a8ab 1390 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a62659bd 1391 " looking for /^/m anchor"));
d0880ea7
DM
1392
1393 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1394 * Find the next \n, if any, even if it's beyond the current
1395 * anchored and/or floating substrings. Whether we should be
1396 * scanning ahead for the next \n or the next substr is debatable.
1397 * On the one hand you'd expect rare substrings to appear less
1398 * often than \n's. On the other hand, searching for \n means
675e93ee 1399 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1400 * iteration as the current "rarest" string candidate, which
1401 * means for example that we'll quickly reject the whole string if
1402 * hasn't got a \n, rather than trying every substr position
1403 * first
1404 */
d0880ea7 1405
4620cb61
DM
1406 s = HOP3c(strend, - prog->minlen, strpos);
1407 if (s <= rx_origin ||
1408 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1409 {
6ad9a8ab 1410 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d0880ea7
DM
1411 " Did not find /%s^%s/m...\n",
1412 PL_colors[0], PL_colors[1]));
a62659bd
DM
1413 goto fail_finish;
1414 }
d0880ea7 1415
4ada1233
DM
1416 /* earliest possible origin is 1 char after the \n.
1417 * (since *rx_origin == '\n', it's safe to ++ here rather than
1418 * HOP(rx_origin, 1)) */
1419 rx_origin++;
d0880ea7 1420
f4f115de 1421 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1422 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1423 {
d0880ea7
DM
1424 /* Position contradicts check-string; either because
1425 * check was anchored (and thus has no wiggle room),
4ada1233 1426 * or check was float and rx_origin is above the float range */
6ad9a8ab 1427 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1428 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1429 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1430 goto restart;
1431 }
1432
1433 /* if we get here, the check substr must have been float,
2e759faa 1434 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1435 * "other" substr which still contradicts */
1436 assert(prog->substrs->check_ix); /* check is float */
1437
1438 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1439 /* whoops, the anchored "other" substr exists, so we still
1440 * contradict. On the other hand, the float "check" substr
1441 * didn't contradict, so just retry the anchored "other"
1442 * substr */
6ad9a8ab 1443 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1444 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
d0880ea7 1445 PL_colors[0], PL_colors[1],
73e8ff00
DM
1446 (IV)(rx_origin - strbeg + prog->anchored_offset),
1447 (IV)(rx_origin - strbeg)
675e93ee 1448 ));
d0880ea7
DM
1449 goto do_other_substr;
1450 }
1451
1452 /* success: we don't contradict the found floating substring
1453 * (and there's no anchored substr). */
6ad9a8ab 1454 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1455 " Found /%s^%s/m with rx_origin %ld...\n",
1456 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1457 }
1458 else {
6ad9a8ab 1459 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
fe4f3442 1460 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1461 }
1462
ffad1e6a 1463 success_at_start:
e3c6feb0 1464
cad2e5aa 1465
dd170ff5
DM
1466 /* if we have a starting character class, then test that extra constraint.
1467 * (trie stclasses are too expensive to use here, we are better off to
1468 * leave it to regmatch itself) */
1469
f8fc2ecf 1470 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1471 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1472
b2ad2123 1473 /* XXX this value could be pre-computed */
7d769928 1474 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362 1475 ? (reginfo->is_utf8_pat
7d769928
DM
1476 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1477 : (SSize_t)STR_LEN(progi->regstclass))
66e933ab 1478 : 1);
1de06328 1479 char * endpos;
fa3bb21d 1480 char *s;
000dfd2d
DM
1481 /* latest pos that a matching float substr constrains rx start to */
1482 char *rx_max_float = NULL;
1483
c75a3985
DM
1484 /* if the current rx_origin is anchored, either by satisfying an
1485 * anchored substring constraint, or a /^.../m constraint, then we
1486 * can reject the current origin if the start class isn't found
1487 * at the current position. If we have a float-only match, then
1488 * rx_origin is constrained to a range; so look for the start class
1489 * in that range. if neither, then look for the start class in the
1490 * whole rest of the string */
1491
dd170ff5
DM
1492 /* XXX DAPM it's not clear what the minlen test is for, and why
1493 * it's not used in the floating case. Nothing in the test suite
1494 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1495 * Here are some old comments, which may or may not be correct:
1496 *
1497 * minlen == 0 is possible if regstclass is \b or \B,
1498 * and the fixed substr is ''$.
1499 * Since minlen is already taken into account, rx_origin+1 is
1500 * before strend; accidentally, minlen >= 1 guaranties no false
1501 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1502 * 0) below assumes that regstclass does not come from lookahead...
1503 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1504 * This leaves EXACTF-ish only, which are dealt with in
1505 * find_byclass().
1506 */
1507
7d2d37f5 1508 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
b2ad2123 1509 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d 1510 else if (prog->float_substr || prog->float_utf8) {
b2ad2123
DM
1511 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1512 endpos = HOP3clim(rx_max_float, cl_l, strend);
000dfd2d 1513 }
1de06328
YO
1514 else
1515 endpos= strend;
1516
6ad9a8ab 1517 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
b2ad2123 1518 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
147e3846 1519 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
b2ad2123 1520 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1521 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1522
c43b5520 1523 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1524 reginfo);
be778b1a 1525 if (!s) {
6eb5f6b9 1526 if (endpos == strend) {
6ad9a8ab 1527 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1528 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1529 goto fail;
1530 }
6ad9a8ab 1531 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1532 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1533 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1534 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1535 goto fail;
9fed8d02 1536
6eb5f6b9 1537 /* Contradict one of substrings */
97136c8a
DM
1538 if (prog->anchored_substr || prog->anchored_utf8) {
1539 if (prog->substrs->check_ix == 1) { /* check is float */
1540 /* Have both, check_string is floating */
b2ad2123
DM
1541 assert(rx_origin + start_shift <= check_at);
1542 if (rx_origin + start_shift != check_at) {
97136c8a 1543 /* not at latest position float substr could match:
c75a3985
DM
1544 * Recheck anchored substring, but not floating.
1545 * The condition above is in bytes rather than
1546 * chars for efficiency. It's conservative, in
1547 * that it errs on the side of doing 'goto
88203927
DM
1548 * do_other_substr'. In this case, at worst,
1549 * an extra anchored search may get done, but in
1550 * practice the extra fbm_instr() is likely to
1551 * get skipped anyway. */
6ad9a8ab 1552 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1553 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1554 (long)(other_last - strbeg),
1555 (IV)(rx_origin - strbeg)
1556 ));
97136c8a 1557 goto do_other_substr;
3369914b 1558 }
3369914b
DM
1559 }
1560 }
97136c8a 1561 else {
9fed8d02
DM
1562 /* float-only */
1563
7d2d37f5 1564 if (ml_anch) {
c75a3985
DM
1565 /* In the presence of ml_anch, we might be able to
1566 * find another \n without breaking the current float
1567 * constraint. */
1568
1569 /* strictly speaking this should be HOP3c(..., 1, ...),
1570 * but since we goto a block of code that's going to
1571 * search for the next \n if any, its safe here */
9fed8d02 1572 rx_origin++;
6ad9a8ab 1573 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
675e93ee 1574 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1575 PL_colors[0], PL_colors[1],
675e93ee 1576 (long)(rx_origin - strbeg)) );
9fed8d02 1577 goto postprocess_substr_matches;
ab60c45a 1578 }
c75a3985
DM
1579
1580 /* strictly speaking this can never be true; but might
1581 * be if we ever allow intuit without substrings */
1582 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1583 goto fail;
c75a3985 1584
000dfd2d 1585 rx_origin = rx_max_float;
9fed8d02
DM
1586 }
1587
c75a3985
DM
1588 /* at this point, any matching substrings have been
1589 * contradicted. Start again... */
1590
9fed8d02 1591 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1592
1593 /* uses bytes rather than char calculations for efficiency.
1594 * It's conservative: it errs on the side of doing 'goto restart',
1595 * where there is code that does a proper char-based test */
b2ad2123 1596 if (rx_origin + start_shift + end_shift > strend) {
6ad9a8ab 1597 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
9fed8d02
DM
1598 " Could not match STCLASS...\n") );
1599 goto fail;
1600 }
6ad9a8ab 1601 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1602 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
9fed8d02 1603 (prog->substrs->check_ix ? "floating" : "anchored"),
b2ad2123 1604 (long)(rx_origin + start_shift - strbeg),
675e93ee
DM
1605 (IV)(rx_origin - strbeg)
1606 ));
9fed8d02 1607 goto restart;
6eb5f6b9 1608 }
9fed8d02 1609
c75a3985
DM
1610 /* Success !!! */
1611
5f9c6575 1612 if (rx_origin != s) {
6ad9a8ab 1613 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1614 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1615 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1616 );
1617 }
1618 else {
6ad9a8ab 1619 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1620 " Does not contradict STCLASS...\n");
b7953727
JH
1621 );
1622 }
6eb5f6b9 1623 }
ffad1e6a
DM
1624
1625 /* Decide whether using the substrings helped */
1626
1627 if (rx_origin != strpos) {
1628 /* Fixed substring is found far enough so that the match
1629 cannot start at strpos. */
1630
6ad9a8ab 1631 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
ffad1e6a
DM
1632 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1633 }
1634 else {
70563e16
DM
1635 /* The found rx_origin position does not prohibit matching at
1636 * strpos, so calling intuit didn't gain us anything. Decrement
1637 * the BmUSEFUL() count on the check substring, and if we reach
1638 * zero, free it. */
1639 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1640 && (utf8_target ? (
1641 prog->check_utf8 /* Could be deleted already */
1642 && --BmUSEFUL(prog->check_utf8) < 0
1643 && (prog->check_utf8 == prog->float_utf8)
1644 ) : (
1645 prog->check_substr /* Could be deleted already */
1646 && --BmUSEFUL(prog->check_substr) < 0
1647 && (prog->check_substr == prog->float_substr)
1648 )))
1649 {
1650 /* If flags & SOMETHING - do not do it many times on the same match */
6ad9a8ab 1651 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
ffad1e6a
DM
1652 /* XXX Does the destruction order has to change with utf8_target? */
1653 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1654 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1655 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1656 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1657 check = NULL; /* abort */
ffad1e6a
DM
1658 /* XXXX This is a remnant of the old implementation. It
1659 looks wasteful, since now INTUIT can use many
1660 other heuristics. */
1661 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1662 }
1663 }
1664
6ad9a8ab 1665 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
ffad1e6a 1666 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1667 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1668
c765d6e0 1669 return rx_origin;
2c2d71f5
JH
1670
1671 fail_finish: /* Substring not found */
33b8afdf 1672 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1673 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1674 fail:
6ad9a8ab 1675 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
e4584336 1676 PL_colors[4], PL_colors[5]));
bd61b366 1677 return NULL;
cad2e5aa 1678}
9661b544 1679
70563e16 1680
a0a388a1 1681#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1682 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78 1683 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
3ed3004a 1684 trie_utf8l, trie_flu8, trie_flu8_latin } \
e7fd4aa1
KW
1685 trie_type = ((scan->flags == EXACT) \
1686 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1687 : (scan->flags == EXACTL) \
1688 ? (utf8_target ? trie_utf8l : trie_plain) \
89829bb5 1689 : (scan->flags == EXACTFAA) \
a4525e78
KW
1690 ? (utf8_target \
1691 ? trie_utf8_exactfa_fold \
1692 : trie_latin_utf8_exactfa_fold) \
1693 : (scan->flags == EXACTFLU8 \
3ed3004a
YO
1694 ? (utf8_target \
1695 ? trie_flu8 \
1696 : trie_flu8_latin) \
a4525e78
KW
1697 : (utf8_target \
1698 ? trie_utf8_fold \
3ed3004a 1699 : trie_latin_utf8_fold)))
fab2782b 1700
9ad8cac4
KW
1701/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1702 * 'foldbuf+sizeof(foldbuf)' */
1703#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1704STMT_START { \
fab2782b 1705 STRLEN skiplen; \
baa60164 1706 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1707 switch (trie_type) { \
a4525e78 1708 case trie_flu8: \
780fcc9f 1709 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3ed3004a 1710 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
e1a2878a 1711 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
613abc6d 1712 } \
a4525e78 1713 goto do_trie_utf8_fold; \
31f05a37 1714 case trie_utf8_exactfa_fold: \
baa60164 1715 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1716 /* FALLTHROUGH */ \
fab2782b 1717 case trie_utf8_fold: \
a4525e78 1718 do_trie_utf8_fold: \
fab2782b 1719 if ( foldlen>0 ) { \
9ad8cac4 1720 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
fab2782b
YO
1721 foldlen -= len; \
1722 uscan += len; \
1723 len=0; \
1724 } else { \
9ad8cac4 1725 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
a1a5ec35 1726 flags); \
695717ad 1727 len = UTF8_SAFE_SKIP(uc, uc_end); \
5f560d8a 1728 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1729 foldlen -= skiplen; \
1730 uscan = foldbuf + skiplen; \
1731 } \
1732 break; \
3ed3004a
YO
1733 case trie_flu8_latin: \
1734 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1735 goto do_trie_latin_utf8_fold; \
baa60164
KW
1736 case trie_latin_utf8_exactfa_fold: \
1737 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1738 /* FALLTHROUGH */ \
fab2782b 1739 case trie_latin_utf8_fold: \
3ed3004a 1740 do_trie_latin_utf8_fold: \
fab2782b 1741 if ( foldlen>0 ) { \
9ad8cac4 1742 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
fab2782b
YO
1743 foldlen -= len; \
1744 uscan += len; \
1745 len=0; \
1746 } else { \
1747 len = 1; \
31f05a37 1748 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1749 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1750 foldlen -= skiplen; \
1751 uscan = foldbuf + skiplen; \
1752 } \
1753 break; \
a4525e78 1754 case trie_utf8l: \
780fcc9f 1755 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d 1756 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
14f657d4 1757 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
613abc6d 1758 } \
780fcc9f 1759 /* FALLTHROUGH */ \
fab2782b 1760 case trie_utf8: \
9ad8cac4 1761 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
fab2782b
YO
1762 break; \
1763 case trie_plain: \
1764 uvc = (UV)*uc; \
1765 len = 1; \
1766 } \
1767 if (uvc < 256) { \
1768 charid = trie->charmap[ uvc ]; \
1769 } \
1770 else { \
1771 charid = 0; \
1772 if (widecharmap) { \
1773 SV** const svpp = hv_fetch(widecharmap, \
1774 (char*)&uvc, sizeof(UV), 0); \
1775 if (svpp) \
1776 charid = (U16)SvIV(*svpp); \
1777 } \
1778 } \
4cadc6a9
YO
1779} STMT_END
1780
cb41e5d6 1781#define DUMP_EXEC_POS(li,s,doutf8,depth) \
ae7c5b9b 1782 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
cb41e5d6 1783 startpos, doutf8, depth)
ae7c5b9b 1784
da10aa09
KW
1785#define REXEC_FBC_SCAN(UTF8, CODE) \
1786 STMT_START { \
1787 while (s < strend) { \
1788 CODE \
8c9c2723
KW
1789 s += ((UTF8) \
1790 ? UTF8_SAFE_SKIP(s, reginfo->strend) \
1791 : 1); \
da10aa09
KW
1792 } \
1793 } STMT_END
4cadc6a9 1794
d990bd30
KW
1795#define REXEC_FBC_CLASS_SCAN(UTF8, COND) \
1796 STMT_START { \
1797 while (s < strend) { \
1798 REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
1799 } \
1800 } STMT_END
4cadc6a9 1801
d990bd30 1802#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
05bd126c 1803 if (COND) { \
19719145 1804 FBC_CHECK_AND_TRY \
8c9c2723 1805 s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
21d1ed54 1806 previous_occurrence_end = s; \
05bd126c 1807 } \
21d1ed54
KW
1808 else { \
1809 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1810 }
4cadc6a9 1811
c84a03c5 1812#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1813 if (utf8_target) { \
da10aa09 1814 REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \
e1d1eefb
YO
1815 } \
1816 else { \
da10aa09 1817 REXEC_FBC_CLASS_SCAN(0, COND); \
d981ef24 1818 }
05bd126c 1819
a9448551
KW
1820/* We keep track of where the next character should start after an occurrence
1821 * of the one we're looking for. Knowing that, we can see right away if the
1822 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1823 * don't accept the 2nd and succeeding adjacent occurrences */
8c9c2723
KW
1824#define FBC_CHECK_AND_TRY \
1825 if ( ( doevery \
1826 || s != previous_occurrence_end) \
1827 && ( reginfo->intuit \
1828 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1829 { \
1830 goto got_it; \
19719145
KW
1831 }
1832
a9448551
KW
1833
1834/* This differs from the above macros in that it calls a function which returns
1835 * the next occurrence of the thing being looked for in 's'; and 'strend' if
1836 * there is no such occurrence. */
1837#define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \
1838 while (s < strend) { \
2b1f9c71 1839 s = (f); \
a9448551
KW
1840 if (s >= strend) { \
1841 break; \
1842 } \
1843 \
1844 FBC_CHECK_AND_TRY \
1845 s += (UTF8) ? UTF8SKIP(s) : 1; \
1846 previous_occurrence_end = s; \
1847 }
1848
ff8b8b42
KW
1849/* This differs from the above macros in that it is passed a single byte that
1850 * is known to begin the next occurrence of the thing being looked for in 's'.
1851 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1852 * at that position. */
1853#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
1854 while (s < strend) { \
1855 s = (char *) memchr(s, byte, strend -s); \
1856 if (s == NULL) { \
1857 s = (char *) strend; \
1858 break; \
1859 } \
1860 \
04532561
KW
1861 if (COND) { \
1862 FBC_CHECK_AND_TRY \
8c9c2723 1863 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
04532561
KW
1864 previous_occurrence_end = s; \
1865 } \
1866 else { \
1867 s += UTF8SKIP(s); \
1868 } \
ff8b8b42
KW
1869 }
1870
05bd126c
KW
1871/* The three macros below are slightly different versions of the same logic.
1872 *
1873 * The first is for /a and /aa when the target string is UTF-8. This can only
1874 * match ascii, but it must advance based on UTF-8. The other two handle the
1875 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1876 * for the boundary (or non-boundary) between a word and non-word character.
1877 * The utf8 and non-utf8 cases have the same logic, but the details must be
1878 * different. Find the "wordness" of the character just prior to this one, and
1879 * compare it with the wordness of this one. If they differ, we have a
1880 * boundary. At the beginning of the string, pretend that the previous
1881 * character was a new-line.
1882 *
1883 * All these macros uncleanly have side-effects with each other and outside
1884 * variables. So far it's been too much trouble to clean-up
1885 *
1886 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1887 * a word character or not.
1888 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1889 * word/non-word
1890 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1891 *
1892 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1893 * are looking for a boundary or for a non-boundary. If we are looking for a
1894 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1895 * see if this tentative match actually works, and if so, to quit the loop
1896 * here. And vice-versa if we are looking for a non-boundary.
1897 *
1898 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
da10aa09 1899 * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
05bd126c
KW
1900 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1901 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1902 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1903 * complement. But in that branch we complement tmp, meaning that at the
1904 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1905 * which means at the top of the loop in the next iteration, it is
1906 * TEST_NON_UTF8(s-1) */
b2f4e957 1907#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1908 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1909 tmp = TEST_NON_UTF8(tmp); \
da10aa09 1910 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
05bd126c
KW
1911 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1912 tmp = !tmp; \
1913 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1914 } \
1915 else { \
1916 IF_FAIL; \
1917 } \
1918 ); \
1919
1920/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1921 * TEST_UTF8 is a macro that for the same input code points returns identically
92c5bde9
KW
1922 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1923 * end pointer as well) */
236d82fd 1924#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1925 if (s == reginfo->strbeg) { \
1926 tmp = '\n'; \
1927 } \
1928 else { /* Back-up to the start of the previous character */ \
1929 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1930 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1931 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1932 } \
1933 tmp = TEST_UV(tmp); \
da10aa09 1934 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
7a207065 1935 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
05bd126c
KW
1936 tmp = !tmp; \
1937 IF_SUCCESS; \
1938 } \
1939 else { \
1940 IF_FAIL; \
1941 } \
1942 );
cfaf538b 1943
05bd126c
KW
1944/* Like the above two macros. UTF8_CODE is the complete code for handling
1945 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1946 * macros below */
baa60164 1947#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1948 if (utf8_target) { \
05bd126c 1949 UTF8_CODE \
63ac0dad
KW
1950 } \
1951 else { /* Not utf8 */ \
9d9163fb 1952 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1953 tmp = TEST_NON_UTF8(tmp); \
da10aa09 1954 REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \
63ac0dad 1955 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1956 IF_SUCCESS; \
760cfa8e 1957 tmp = !tmp; \
63ac0dad
KW
1958 } \
1959 else { \
1960 IF_FAIL; \
1961 } \
1962 ); \
1963 } \
c8519dc7
KW
1964 /* Here, things have been set up by the previous code so that tmp is the \
1965 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1966 * utf8ness of the target). We also have to check if this matches against \
1967 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1968 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1969 * string */ \
1970 if (tmp == ! TEST_NON_UTF8('\n')) { \
1971 IF_SUCCESS; \
1972 } \
1973 else { \
1974 IF_FAIL; \
1975 }
63ac0dad 1976
ae7c5b9b 1977/* This is the macro to use when we want to see if something that looks like it
b92b2705
KW
1978 * could match, actually does, and if so exits the loop. It needs to be used
1979 * only for bounds checking macros, as it allows for matching beyond the end of
1980 * string (which should be zero length without having to look at the string
1981 * contents) */
1982#define REXEC_FBC_TRYIT \
8c9c2723 1983 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
ae7c5b9b
KW
1984 goto got_it
1985
1986/* The only difference between the BOUND and NBOUND cases is that
1987 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1988 * NBOUND. This is accomplished by passing it as either the if or else clause,
1989 * with the other one being empty (PLACEHOLDER is defined as empty).
1990 *
1991 * The TEST_FOO parameters are for operating on different forms of input, but
1992 * all should be ones that return identically for the same underlying code
1993 * points */
1994#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1995 FBC_BOUND_COMMON( \
1996 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1997 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1998
44129e46 1999#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
2000 FBC_BOUND_COMMON( \
2001 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2002 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2003
2004#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2005 FBC_BOUND_COMMON( \
2006 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2007 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2008
44129e46 2009#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
2010 FBC_BOUND_COMMON( \
2011 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2012 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2013
8bde5eaf
JH
2014#ifdef DEBUGGING
2015static IV
2016S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
9ffbcbcf 2017 IV cp_out = _invlist_search(invlist, cp_in);
8bde5eaf
JH
2018 assert(cp_out >= 0);
2019 return cp_out;
2020}
2021# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2022 invmap[S_get_break_val_cp_checked(invlist, cp)]
2023#else
2024# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2025 invmap[_invlist_search(invlist, cp)]
2026#endif
2027
64935bc6
KW
2028/* Takes a pointer to an inversion list, a pointer to its corresponding
2029 * inversion map, and a code point, and returns the code point's value
2030 * according to the two arrays. It assumes that all code points have a value.
2031 * This is used as the base macro for macros for particular properties */
2032#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
8bde5eaf 2033 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
64935bc6
KW
2034
2035/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2036 * of a code point, returning the value for the first code point in the string.
2037 * And it takes the particular macro name that finds the desired value given a
2038 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2039#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2040 (__ASSERT_(pos < strend) \
2041 /* Note assumes is valid UTF-8 */ \
2042 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2043
2044/* Returns the GCB value for the input code point */
2045#define getGCB_VAL_CP(cp) \
2046 _generic_GET_BREAK_VAL_CP( \
2047 PL_GCB_invlist, \
02f811dd 2048 _Perl_GCB_invmap, \
64935bc6
KW
2049 (cp))
2050
2051/* Returns the GCB value for the first code point in the UTF-8 encoded string
2052 * bounded by pos and strend */
2053#define getGCB_VAL_UTF8(pos, strend) \
2054 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 2055
6b659339
KW
2056/* Returns the LB value for the input code point */
2057#define getLB_VAL_CP(cp) \
2058 _generic_GET_BREAK_VAL_CP( \
2059 PL_LB_invlist, \
2060 _Perl_LB_invmap, \
2061 (cp))
2062
2063/* Returns the LB value for the first code point in the UTF-8 encoded string
2064 * bounded by pos and strend */
2065#define getLB_VAL_UTF8(pos, strend) \
2066 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2067
06ae2722
KW
2068
2069/* Returns the SB value for the input code point */
2070#define getSB_VAL_CP(cp) \
2071 _generic_GET_BREAK_VAL_CP( \
2072 PL_SB_invlist, \
bf4268fa 2073 _Perl_SB_invmap, \
06ae2722
KW
2074 (cp))
2075
2076/* Returns the SB value for the first code point in the UTF-8 encoded string
2077 * bounded by pos and strend */
2078#define getSB_VAL_UTF8(pos, strend) \
2079 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2080
ae3bb8ea
KW
2081/* Returns the WB value for the input code point */
2082#define getWB_VAL_CP(cp) \
2083 _generic_GET_BREAK_VAL_CP( \
2084 PL_WB_invlist, \
bf4268fa 2085 _Perl_WB_invmap, \
ae3bb8ea
KW
2086 (cp))
2087
2088/* Returns the WB value for the first code point in the UTF-8 encoded string
2089 * bounded by pos and strend */
2090#define getWB_VAL_UTF8(pos, strend) \
2091 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2092
786e8c11 2093/* We know what class REx starts with. Try to find this position... */
02d5137b 2094/* if reginfo->intuit, its a dryrun */
786e8c11
YO
2095/* annoyingly all the vars in this routine have different names from their counterparts
2096 in regmatch. /grrr */
3c3eec57 2097STATIC char *
07be1b83 2098S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 2099 const char *strend, regmatch_info *reginfo)
a687059c 2100{
a9448551
KW
2101
2102 /* TRUE if x+ need not match at just the 1st pos of run of x's */
73104a1b 2103 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
a9448551 2104
73104a1b
KW
2105 char *pat_string; /* The pattern's exactish string */
2106 char *pat_end; /* ptr to end char of pat_string */
2107 re_fold_t folder; /* Function for computing non-utf8 folds */
2108 const U8 *fold_array; /* array for folding ords < 256 */
2109 STRLEN ln;
2110 STRLEN lnc;
73104a1b
KW
2111 U8 c1;
2112 U8 c2;
743dd5b8 2113 char *e = NULL;
21d1ed54
KW
2114
2115 /* In some cases we accept only the first occurence of 'x' in a sequence of
2116 * them. This variable points to just beyond the end of the previous
2117 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2118 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2119 * hop back.) */
2120 char * previous_occurrence_end = 0;
2121
3b6c52ce 2122 I32 tmp; /* Scratch variable */
ba44c216 2123 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 2124 UV utf8_fold_flags = 0;
f9176b44 2125 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
2126 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2127 with a result inverts that result, as 0^1 =
2128 1 and 1^1 = 0 */
2129 _char_class_number classnum;
2130
73104a1b 2131 RXi_GET_DECL(prog,progi);
2f7f8cb1 2132
73104a1b 2133 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 2134
73104a1b 2135 /* We know what class it must start with. */
dd8dc88c
KW
2136 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2137 case ANYOFPOSIXL_t8_pb: case ANYOFPOSIXL_t8_p8: case ANYOFPOSIXL_tb_pb: case ANYOFPOSIXL_tb_p8:
2138 case ANYOFL_t8_pb: case ANYOFL_t8_p8: case ANYOFL_tb_pb: case ANYOFL_tb_p8:
780fcc9f 2139 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 2140
d1c40ef5 2141 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
a0bd1a30
KW
2142 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2143 }
2144
780fcc9f 2145 /* FALLTHROUGH */
dd8dc88c
KW
2146 case ANYOFD_t8_pb: case ANYOFD_t8_p8: case ANYOFD_tb_pb: case ANYOFD_tb_p8:
2147 case ANYOF_t8_pb: case ANYOF_t8_p8: case ANYOF_tb_pb: case ANYOF_tb_p8:
73104a1b 2148 if (utf8_target) {
da10aa09 2149 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
3db24e1e 2150 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b 2151 }
daced5ae
KW
2152 else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2153 /* We know that s is in the bitmap range since the target isn't
2154 * UTF-8, so what happens for out-of-range values is not relevant,
2155 * so exclude that from the flags */
da10aa09 2156 REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
1451f692 2157 }
73104a1b 2158 else {
da10aa09 2159 REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
73104a1b
KW
2160 }
2161 break;
73104a1b 2162
dd8dc88c 2163 case ANYOFM_t8_pb: case ANYOFM_t8_p8: case ANYOFM_tb_pb: case ANYOFM_tb_p8: /* ARG() is the base byte; FLAGS() the mask byte */
0a5ed81e
KW
2164 /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
2165 * use 0 */
2813d4ad 2166 REXEC_FBC_FIND_NEXT_SCAN(0,
12475f8b
KW
2167 (char *) find_next_masked((U8 *) s, (U8 *) strend,
2168 (U8) ARG(c), FLAGS(c)));
2813d4ad
KW
2169 break;
2170
dd8dc88c 2171 case NANYOFM_t8_pb: case NANYOFM_t8_p8: case NANYOFM_tb_pb: case NANYOFM_tb_p8: /* UTF-8ness does matter because can match UTF-8 variants.
0a5ed81e
KW
2172 */
2173 REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
3db0bccc
KW
2174 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2175 (U8) ARG(c), FLAGS(c)));
2176 break;
2177
dd8dc88c 2178 case ANYOFH_t8_pb: case ANYOFH_t8_p8: case ANYOFH_tb_pb: case ANYOFH_tb_p8:
ff8b8b42 2179 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
6966a05b 2180 REXEC_FBC_CLASS_SCAN(TRUE,
29a889ef
KW
2181 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2182 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
6966a05b
KW
2183 }
2184 break;
2185
dd8dc88c 2186 case ANYOFHb_t8_pb: case ANYOFHb_t8_p8: case ANYOFHb_tb_pb: case ANYOFHb_tb_p8:
6966a05b
KW
2187 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
2188
2189 /* We know what the first byte of any matched string should be */
ff8b8b42
KW
2190 U8 first_byte = FLAGS(c);
2191
6966a05b 2192 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
c316b824 2193 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
ff8b8b42 2194 }
c316b824
KW
2195 break;
2196
dd8dc88c 2197 case ANYOFHr_t8_pb: case ANYOFHr_t8_p8: case ANYOFHr_tb_pb: case ANYOFHr_tb_p8:
3146c00a
KW
2198 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
2199 REXEC_FBC_CLASS_SCAN(TRUE,
6af2b2bc 2200 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
3146c00a
KW
2201 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2202 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2203 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2204 }
2205 break;
2206
dd8dc88c 2207 case ANYOFHs_t8_pb: case ANYOFHs_t8_p8: case ANYOFHs_tb_pb: case ANYOFHs_tb_p8:
34924db0
KW
2208 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
2209 REXEC_FBC_CLASS_SCAN(TRUE,
2210 ( strend -s >= FLAGS(c)
2211 && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
2212 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2213 }
2214 break;
2215
dd8dc88c 2216 case ANYOFR_t8_pb: case ANYOFR_t8_p8: case ANYOFR_tb_pb: case ANYOFR_tb_p8:
13fcf652
KW
2217 if (utf8_target) {
2218 REXEC_FBC_CLASS_SCAN(TRUE,
2219 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2220 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2221 (U8 *) strend,
2222 NULL),
2223 ANYOFRbase(c), ANYOFRdelta(c))));
2224 }
2225 else {
2226 REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
2227 ANYOFRbase(c), ANYOFRdelta(c)));
2228 }
2229 break;
2230
dd8dc88c 2231 case ANYOFRb_t8_pb: case ANYOFRb_t8_p8: case ANYOFRb_tb_pb: case ANYOFRb_tb_p8:
2d5613be
KW
2232 if (utf8_target) {
2233
2234 /* We know what the first byte of any matched string should be */
2235 U8 first_byte = FLAGS(c);
2236
2237 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2238 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2239 (U8 *) strend,
2240 NULL),
2241 ANYOFRbase(c), ANYOFRdelta(c)));
2242 }
2243 else {
2244 REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
2245 ANYOFRbase(c), ANYOFRdelta(c)));
2246 }
2247 break;
2248
dd8dc88c 2249 case EXACTFAA_NO_TRIE_t8_pb: case EXACTFAA_NO_TRIE_t8_p8: case EXACTFAA_NO_TRIE_tb_pb: case EXACTFAA_NO_TRIE_tb_p8: /* This node only generated for non-utf8 patterns */
098b07d5 2250 assert(! is_utf8_pat);
924ba076 2251 /* FALLTHROUGH */
dd8dc88c 2252 case EXACTFAA_t8_pb: case EXACTFAA_t8_p8: case EXACTFAA_tb_pb: case EXACTFAA_tb_p8:
aa419ff3
KW
2253 if (is_utf8_pat) {
2254 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2255 |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
2256 goto do_exactf_utf8;
2257 }
2258 else if (utf8_target) {
2259
2260 /* Here, and elsewhere in this file, the reason we can't consider a
2261 * non-UTF-8 pattern already folded in the presence of a UTF-8
2262 * target is because any MICRO SIGN in the pattern won't be folded.
2263 * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
2264 * can consider a non-UTF-8 pattern folded when matching a
2265 * non-UTF-8 target */
73104a1b
KW
2266 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2267 goto do_exactf_utf8;
2268 }
aa419ff3
KW
2269
2270 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2271 * which these functions don't handle anyway */
2272 fold_array = PL_fold_latin1;
2273 folder = foldEQ_latin1_s2_folded;
2274 goto do_exactf_non_utf8;
77a6d856 2275
dd8dc88c 2276 case EXACTF_t8_pb: case EXACTF_t8_p8: case EXACTF_tb_pb: case EXACTF_tb_p8: /* This node only generated for non-utf8 patterns */
2fdb7295 2277 assert(! is_utf8_pat);
73104a1b 2278 if (utf8_target) {
73104a1b
KW
2279 goto do_exactf_utf8;
2280 }
2281 fold_array = PL_fold;
2282 folder = foldEQ;
2283 goto do_exactf_non_utf8;
2284
dd8dc88c 2285 case EXACTFL_t8_pb: case EXACTFL_t8_p8: case EXACTFL_tb_pb: case EXACTFL_tb_p8:
780fcc9f 2286 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 2287 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 2288 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
2289 goto do_exactf_utf8;
2290 }
2291 fold_array = PL_fold_locale;
2292 folder = foldEQ_locale;
2293 goto do_exactf_non_utf8;
3c760661 2294
dd8dc88c 2295 case EXACTFUP_t8_pb: case EXACTFUP_t8_p8: case EXACTFUP_tb_pb: case EXACTFUP_tb_p8: /* Problematic even though pattern isn't UTF-8. Use
627a7895
KW
2296 full functionality normally not done except for
2297 UTF-8 */
a56633d1 2298 assert(! is_utf8_pat);
73104a1b 2299 goto do_exactf_utf8;
16d951b7 2300
dd8dc88c 2301 case EXACTFLU8_t8_pb: case EXACTFLU8_t8_p8: case EXACTFLU8_tb_pb: case EXACTFLU8_tb_p8:
a4525e78
KW
2302 if (! utf8_target) { /* All code points in this node require
2303 UTF-8 to express. */
2304 break;
2305 }
613abc6d
KW
2306 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2307 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
2308 goto do_exactf_utf8;
2309
dd8dc88c 2310 case EXACTFU_REQ8_t8_pb: case EXACTFU_REQ8_t8_p8: case EXACTFU_REQ8_tb_pb: case EXACTFU_REQ8_tb_p8:
a9f8c7ac
KW
2311 if (! utf8_target) {
2312 break;
2313 }
2314 assert(is_utf8_pat);
2315 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2316 goto do_exactf_utf8;
2317
dd8dc88c 2318 case EXACTFU_t8_pb: case EXACTFU_t8_p8: case EXACTFU_tb_pb: case EXACTFU_tb_p8:
984e6dd1 2319 if (is_utf8_pat || utf8_target) {
0fbec7cf 2320 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
73104a1b
KW
2321 goto do_exactf_utf8;
2322 }
fac1af77 2323
73104a1b
KW
2324 /* Any 'ss' in the pattern should have been replaced by regcomp,
2325 * so we don't have to worry here about this single special case
2326 * in the Latin1 range */
2327 fold_array = PL_fold_latin1;
0fbec7cf 2328 folder = foldEQ_latin1_s2_folded;
73104a1b 2329
924ba076 2330 /* FALLTHROUGH */
73104a1b 2331
c52b8b12 2332 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
2333 are no glitches with fold-length differences
2334 between the target string and pattern */
2335
2336 /* The idea in the non-utf8 EXACTF* cases is to first find the
2337 * first character of the EXACTF* node and then, if necessary,
2338 * case-insensitively compare the full text of the node. c1 is the
2339 * first character. c2 is its fold. This logic will not work for
2340 * Unicode semantics and the german sharp ss, which hence should
2341 * not be compiled into a node that gets here. */
ae06e581
KW
2342 pat_string = STRINGs(c);
2343 ln = STR_LENs(c); /* length to match in octets/bytes */
73104a1b
KW
2344
2345 /* We know that we have to match at least 'ln' bytes (which is the
2346 * same as characters, since not utf8). If we have to match 3
2347 * characters, and there are only 2 availabe, we know without
2348 * trying that it will fail; so don't start a match past the
2349 * required minimum number from the far end */
ea3daa5d 2350 e = HOP3c(strend, -((SSize_t)ln), s);
dda01918
HS
2351 if (e < s)
2352 break;
fac1af77 2353
73104a1b
KW
2354 c1 = *pat_string;
2355 c2 = fold_array[c1];
2356 if (c1 == c2) { /* If char and fold are the same */
c05cc3b6
KW
2357 while (s <= e) {
2358 s = (char *) memchr(s, c1, e + 1 - s);
2359 if (s == NULL) {
2360 break;
2361 }
2362
2363 /* Check that the rest of the node matches */
2364 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2365 && (reginfo->intuit || regtry(reginfo, &s)) )
2366 {
2367 goto got_it;
2368 }
2369 s++;
2370 }
73104a1b
KW
2371 }
2372 else {
c05cc3b6
KW
2373 U8 bits_differing = c1 ^ c2;
2374
2375 /* If the folds differ in one bit position only, we can mask to
2376 * match either of them, and can use this faster find method. Both
2377 * ASCII and EBCDIC tend to have their case folds differ in only
2378 * one position, so this is very likely */
2379 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2380 bits_differing = ~ bits_differing;
2381 while (s <= e) {
2b1f9c71 2382 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
c05cc3b6
KW
2383 (c1 & bits_differing), bits_differing);
2384 if (s > e) {
2385 break;
2386 }
2387
2388 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2389 && (reginfo->intuit || regtry(reginfo, &s)) )
2390 {
2391 goto got_it;
2392 }
2393 s++;
2394 }
2395 }
2396 else { /* Otherwise, stuck with looking byte-at-a-time. This
2397 should actually happen only in EXACTFL nodes */
2398 while (s <= e) {
2399 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2400 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2401 && (reginfo->intuit || regtry(reginfo, &s)) )
2402 {
2403 goto got_it;
2404 }
2405 s++;
2406 }
2407 }
73104a1b
KW
2408 }
2409 break;
fac1af77 2410
c52b8b12
KW
2411 do_exactf_utf8:
2412 {
73104a1b
KW
2413 unsigned expansion;
2414
2415 /* If one of the operands is in utf8, we can't use the simpler folding
2416 * above, due to the fact that many different characters can have the
2417 * same fold, or portion of a fold, or different- length fold */
ae06e581
KW
2418 pat_string = STRINGs(c);
2419 ln = STR_LENs(c); /* length to match in octets/bytes */
73104a1b 2420 pat_end = pat_string + ln;
984e6dd1 2421 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
2422 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2423 : ln;
2424
2425 /* We have 'lnc' characters to match in the pattern, but because of
2426 * multi-character folding, each character in the target can match
2427 * up to 3 characters (Unicode guarantees it will never exceed
2428 * this) if it is utf8-encoded; and up to 2 if not (based on the
2429 * fact that the Latin 1 folds are already determined, and the
2430 * only multi-char fold in that range is the sharp-s folding to
2431 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2432 * string character. Adjust lnc accordingly, rounding up, so that
2433 * if we need to match at least 4+1/3 chars, that really is 5. */
2434 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2435 lnc = (lnc + expansion - 1) / expansion;
2436
2437 /* As in the non-UTF8 case, if we have to match 3 characters, and
2438 * only 2 are left, it's guaranteed to fail, so don't start a
2439 * match that would require us to go beyond the end of the string
2440 */
ea3daa5d 2441 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 2442
73104a1b
KW
2443 /* XXX Note that we could recalculate e to stop the loop earlier,
2444 * as the worst case expansion above will rarely be met, and as we
2445 * go along we would usually find that e moves further to the left.
2446 * This would happen only after we reached the point in the loop
2447 * where if there were no expansion we should fail. Unclear if
2448 * worth the expense */
2449
2450 while (s <= e) {
2451 char *my_strend= (char *)strend;
2452 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 2453 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 2454 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
2455 {
2456 goto got_it;
2457 }
8c9c2723 2458 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
73104a1b
KW
2459 }
2460 break;
2461 }
236d82fd 2462
dd8dc88c 2463 case BOUNDL_t8_pb: case BOUNDL_t8_p8: case BOUNDL_tb_pb: case BOUNDL_tb_p8:
780fcc9f 2464 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2465 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2466 if (! IN_UTF8_CTYPE_LOCALE) {
2467 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2468 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2469 }
64935bc6
KW
2470 goto do_boundu;
2471 }
2472
7a207065 2473 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2474 break;
64935bc6 2475
dd8dc88c 2476 case NBOUNDL_t8_pb: case NBOUNDL_t8_p8: case NBOUNDL_tb_pb: case NBOUNDL_tb_p8:
780fcc9f 2477 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2478 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2479 if (! IN_UTF8_CTYPE_LOCALE) {
2480 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2481 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2482 }
64935bc6
KW
2483 goto do_nboundu;
2484 }
2485
7a207065 2486 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2487 break;
64935bc6 2488
dd8dc88c 2489 case BOUND_t8_pb: case BOUND_t8_p8: case BOUND_tb_pb: case BOUND_tb_p8: /* regcomp.c makes sure that this only has the traditional \b
64935bc6
KW
2490 meaning */
2491 assert(FLAGS(c) == TRADITIONAL_BOUND);
2492
7a207065 2493 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2494 break;
64935bc6 2495
dd8dc88c 2496 case BOUNDA_t8_pb: case BOUNDA_t8_p8: case BOUNDA_tb_pb: case BOUNDA_tb_p8: /* regcomp.c makes sure that this only has the traditional \b
64935bc6
KW
2497 meaning */
2498 assert(FLAGS(c) == TRADITIONAL_BOUND);
2499
44129e46 2500 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2501 break;
64935bc6 2502
dd8dc88c 2503 case NBOUND_t8_pb: case NBOUND_t8_p8: case NBOUND_tb_pb: case NBOUND_tb_p8: /* regcomp.c makes sure that this only has the traditional \b
64935bc6
KW
2504 meaning */
2505 assert(FLAGS(c) == TRADITIONAL_BOUND);
2506
7a207065 2507 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2508 break;
64935bc6 2509
dd8dc88c 2510 case NBOUNDA_t8_pb: case NBOUNDA_t8_p8: case NBOUNDA_tb_pb: case NBOUNDA_tb_p8: /* regcomp.c makes sure that this only has the traditional \b
64935bc6
KW
2511 meaning */
2512 assert(FLAGS(c) == TRADITIONAL_BOUND);
2513
44129e46 2514 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2515 break;
64935bc6 2516
dd8dc88c 2517 case NBOUNDU_t8_pb: case NBOUNDU_t8_p8: case NBOUNDU_tb_pb: case NBOUNDU_tb_p8:
64935bc6 2518 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
7a207065 2519 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2520 break;
2521 }
2522
2523 do_nboundu:
2524
2525 to_complement = 1;
6ebdcce0 2526 goto do_boundu;
64935bc6 2527
dd8dc88c 2528 case BOUNDU_t8_pb: case BOUNDU_t8_p8: case BOUNDU_tb_pb: case BOUNDU_tb_p8:
6ebdcce0
KW
2529 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2530 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2531 break;
2532 }
2533
64935bc6 2534 do_boundu:
1a651719
KW
2535 if (s == reginfo->strbeg) {
2536 if (reginfo->intuit || regtry(reginfo, &s))
2537 {
2538 goto got_it;
2539 }
a7a8bd1e 2540
1a651719
KW
2541 /* Didn't match. Try at the next position (if there is one) */
2542 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2543 if (UNLIKELY(s >= reginfo->strend)) {
6ebdcce0 2544 break;
1a651719
KW
2545 }
2546 }
64935bc6 2547
1a651719
KW
2548 switch((bound_type) FLAGS(c)) {
2549 case TRADITIONAL_BOUND: /* Should have already been handled */
2550 assert(0);
2551 break;
2552
2553 case GCB_BOUND:
2554 if (utf8_target) {
2555 GCB_enum before = getGCB_VAL_UTF8(
2556 reghop3((U8*)s, -1,
2557 (U8*)(reginfo->strbeg)),
2558 (U8*) reginfo->strend);
2559 while (s < strend) {
2560 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2561 (U8*) reginfo->strend);
2562 if ( (to_complement ^ isGCB(before,
2563 after,
2564 (U8*) reginfo->strbeg,
2565 (U8*) s,
2566 utf8_target))
2567 && (reginfo->intuit || regtry(reginfo, &s)))
2568 {
2569 goto got_it;
64935bc6 2570 }
1a651719
KW
2571 before = after;
2572 s += UTF8_SAFE_SKIP(s, reginfo->strend);
64935bc6 2573 }
1a651719
KW
2574 }
2575 else { /* Not utf8. Everything is a GCB except between CR and
2576 LF */
2577 while (s < strend) {
2578 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2579 || UCHARAT(s) != '\n'))
2580 && (reginfo->intuit || regtry(reginfo, &s)))
2581 {
2582 goto got_it;
64935bc6 2583 }
1a651719 2584 s++;
64935bc6 2585 }
1a651719 2586 }
64935bc6 2587
1a651719 2588 break;
ae3bb8ea 2589
1a651719
KW
2590 case LB_BOUND:
2591 if (utf8_target) {
2592 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2593 -1,
2594 (U8*)(reginfo->strbeg)),
2595 (U8*) reginfo->strend);
2596 while (s < strend) {
2597 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2598 if (to_complement ^ isLB(before,
2599 after,
2600 (U8*) reginfo->strbeg,
2601 (U8*) s,
2602 (U8*) reginfo->strend,
2603 utf8_target)
2604 && (reginfo->intuit || regtry(reginfo, &s)))
2605 {
2606 goto got_it;
6b659339 2607 }
1a651719
KW
2608 before = after;
2609 s += UTF8_SAFE_SKIP(s, reginfo->strend);
6b659339 2610 }
1a651719
KW
2611 }
2612 else { /* Not utf8. */
2613 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2614 while (s < strend) {
2615 LB_enum after = getLB_VAL_CP((U8) *s);
2616 if (to_complement ^ isLB(before,
2617 after,
2618 (U8*) reginfo->strbeg,
2619 (U8*) s,
2620 (U8*) reginfo->strend,
2621 utf8_target)
2622 && (reginfo->intuit || regtry(reginfo, &s)))
2623 {
2624 goto got_it;
6b659339 2625 }
1a651719
KW
2626 before = after;
2627 s++;
6b659339 2628 }
1a651719 2629 }
6b659339 2630
1a651719 2631 break;
6b659339 2632
1a651719
KW
2633 case SB_BOUND:
2634 if (utf8_target) {
2635 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2636 -1,
2637 (U8*)(reginfo->strbeg)),
2638 (U8*) reginfo->strend);
2639 while (s < strend) {
2640 SB_enum after = getSB_VAL_UTF8((U8*) s,
2641 (U8*) reginfo->strend);
2642 if ((to_complement ^ isSB(before,
2643 after,
2644 (U8*) reginfo->strbeg,
2645 (U8*) s,
2646 (U8*) reginfo->strend,
2647 utf8_target))
2648 && (reginfo->intuit || regtry(reginfo, &s)))
2649 {
2650 goto got_it;
06ae2722 2651 }
1a651719
KW
2652 before = after;
2653 s += UTF8_SAFE_SKIP(s, reginfo->strend);
06ae2722 2654 }
1a651719
KW
2655 }
2656 else { /* Not utf8. */
2657 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2658 while (s < strend) {
2659 SB_enum after = getSB_VAL_CP((U8) *s);
2660 if ((to_complement ^ isSB(before,
2661 after,
2662 (U8*) reginfo->strbeg,
2663 (U8*) s,
2664 (U8*) reginfo->strend,
2665 utf8_target))
2666 && (reginfo->intuit || regtry(reginfo, &s)))
2667 {
2668 goto got_it;
06ae2722 2669 }
1a651719
KW
2670 before = after;
2671 s++;
06ae2722 2672 }
1a651719 2673 }
06ae2722 2674
1a651719 2675 break;
06ae2722 2676
1a651719
KW
2677 case WB_BOUND:
2678 if (utf8_target) {
2679 /* We are at a boundary between char_sub_0 and char_sub_1.
2680 * We also keep track of the value for char_sub_-1 as we
2681 * loop through the line. Context may be needed to make a
2682 * determination, and if so, this can save having to
2683 * recalculate it */
2684 WB_enum previous = WB_UNKNOWN;
2685 WB_enum before = getWB_VAL_UTF8(
2686 reghop3((U8*)s,
2687 -1,
2688 (U8*)(reginfo->strbeg)),
2689 (U8*) reginfo->strend);
2690 while (s < strend) {
2691 WB_enum after = getWB_VAL_UTF8((U8*) s,
2692 (U8*) reginfo->strend);
2693 if ((to_complement ^ isWB(previous,
2694 before,
2695 after,
2696 (U8*) reginfo->strbeg,
2697 (U8*) s,
2698 (U8*) reginfo->strend,
2699 utf8_target))
2700 && (reginfo->intuit || regtry(reginfo, &s)))
2701 {
2702 goto got_it;
ae3bb8ea 2703 }
1a651719
KW
2704 previous = before;
2705 before = after;
2706 s += UTF8_SAFE_SKIP(s, reginfo->strend);
ae3bb8ea 2707 }
1a651719
KW
2708 }
2709 else { /* Not utf8. */
2710 WB_enum previous = WB_UNKNOWN;
2711 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2712 while (s < strend) {
2713 WB_enum after = getWB_VAL_CP((U8) *s);
2714 if ((to_complement ^ isWB(previous,
2715 before,
2716 after,
2717 (U8*) reginfo->strbeg,
2718 (U8*) s,
2719 (U8*) reginfo->strend,
2720 utf8_target))
2721 && (reginfo->intuit || regtry(reginfo, &s)))
2722 {
2723 goto got_it;
ae3bb8ea 2724 }
1a651719
KW
2725 previous = before;
2726 before = after;
2727 s++;
ae3bb8ea 2728 }
1a651719 2729 }
6ebdcce0
KW
2730 }
2731
2732 /* Here are at the final position in the target string, which is a
2733 * boundary by definition, so matches, depending on other constraints.
2734 * */
ae3bb8ea 2735
8c9c2723
KW
2736 if ( reginfo->intuit
2737 || (s <= reginfo->strend && regtry(reginfo, &s)))
2738 {
ae3bb8ea
KW
2739 goto got_it;
2740 }
73104a1b 2741 break;
64935bc6 2742
dd8dc88c 2743 case LNBREAK_t8_pb: case LNBREAK_t8_p8: case LNBREAK_tb_pb: case LNBREAK_tb_p8:
73104a1b
KW
2744 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2745 is_LNBREAK_latin1_safe(s, strend)
2746 );
2747 break;
3018b823
KW
2748
2749 /* The argument to all the POSIX node types is the class number to pass to
2750 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2751
dd8dc88c 2752 case NPOSIXL_t8_pb: case NPOSIXL_t8_p8: case NPOSIXL_tb_pb: case NPOSIXL_tb_p8:
3018b823
KW
2753 to_complement = 1;
2754 /* FALLTHROUGH */
2755
dd8dc88c 2756 case POSIXL_t8_pb: case POSIXL_t8_p8: case POSIXL_tb_pb: case POSIXL_tb_p8:
780fcc9f 2757 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a78c2fa6 2758 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
3018b823 2759 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2760 break;
3018b823 2761
dd8dc88c 2762 case NPOSIXD_t8_pb: case NPOSIXD_t8_p8: case NPOSIXD_tb_pb: case NPOSIXD_tb_p8:
3018b823
KW
2763 to_complement = 1;
2764 /* FALLTHROUGH */
2765
dd8dc88c 2766 case POSIXD_t8_pb: case POSIXD_t8_p8: case POSIXD_tb_pb: case POSIXD_tb_p8:
3018b823
KW
2767 if (utf8_target) {
2768 goto posix_utf8;
2769 }
2770 goto posixa;
2771
dd8dc88c 2772 case NPOSIXA_t8_pb: case NPOSIXA_t8_p8: case NPOSIXA_tb_pb: case NPOSIXA_tb_p8:
3018b823
KW
2773 if (utf8_target) {
2774 /* The complement of something that matches only ASCII matches all
837226c8 2775 * non-ASCII, plus everything in ASCII that isn't in the class. */
da10aa09
KW
2776 REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend)
2777 || ! _generic_isCC_A(*s, FLAGS(c)));
3018b823
KW
2778 break;
2779 }
2780
2781 to_complement = 1;
4a6c6db5 2782 goto posixa;
3018b823 2783
dd8dc88c 2784 case POSIXA_t8_pb: case POSIXA_t8_p8: case POSIXA_tb_pb: case POSIXA_tb_p8:
73104a1b 2785 /* Don't need to worry about utf8, as it can match only a single
4a6c6db5
KW
2786 * byte invariant character. But we do anyway for performance reasons,
2787 * as otherwise we would have to examine all the continuation
2788 * characters */
2789 if (utf8_target) {
da10aa09 2790 REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
4a6c6db5
KW
2791 break;
2792 }
2793
2794 posixa:
da10aa09 2795 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
3018b823 2796 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2797 break;
3018b823 2798
dd8dc88c 2799 case NPOSIXU_t8_pb: case NPOSIXU_t8_p8: case NPOSIXU_tb_pb: case NPOSIXU_tb_p8:
3018b823
KW
2800 to_complement = 1;
2801 /* FALLTHROUGH */
2802
dd8dc88c 2803 case POSIXU_t8_pb: case POSIXU_t8_p8: case POSIXU_tb_pb: case POSIXU_tb_p8:
3018b823 2804 if (! utf8_target) {
da10aa09
KW
2805 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2806 to_complement ^ cBOOL(_generic_isCC(*s,
3018b823
KW
2807 FLAGS(c))));
2808 }
2809 else {
2810
c52b8b12 2811 posix_utf8:
3018b823 2812 classnum = (_char_class_number) FLAGS(c);
8d692afb
KW
2813 switch (classnum) {
2814 default:
2815 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2816 to_complement ^ cBOOL(_invlist_contains_cp(
2817 PL_XPosix_ptrs[classnum],
2818 utf8_to_uvchr_buf((U8 *) s,
2819 (U8 *) strend,
2820 NULL))));
2821 break;
779cf272 2822 case _CC_ENUM_SPACE:
da10aa09 2823 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
7a207065 2824 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3018b823
KW
2825 break;
2826
2827 case _CC_ENUM_BLANK:
da10aa09 2828 REXEC_FBC_CLASS_SCAN(1,
7a207065 2829 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3018b823
KW
2830 break;
2831
2832 case _CC_ENUM_XDIGIT:
da10aa09 2833 REXEC_FBC_CLASS_SCAN(1,
7a207065 2834 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3018b823
KW
2835 break;
2836
2837 case _CC_ENUM_VERTSPACE:
da10aa09 2838 REXEC_FBC_CLASS_SCAN(1,
7a207065 2839 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3018b823
KW
2840 break;
2841
2842 case _CC_ENUM_CNTRL:
da10aa09 2843 REXEC_FBC_CLASS_SCAN(1,
7a207065 2844 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3018b823 2845 break;
3018b823
KW
2846 }
2847 }
2848 break;
2849
dd8dc88c
KW
2850 case AHOCORASICKC_t8_pb: case AHOCORASICKC_t8_p8: case AHOCORASICKC_tb_pb: case AHOCORASICKC_tb_p8:
2851 case AHOCORASICK_t8_pb: case AHOCORASICK_t8_p8: case AHOCORASICK_tb_pb: case AHOCORASICK_tb_p8:
73104a1b
KW
2852 {
2853 DECL_TRIE_TYPE(c);
2854 /* what trie are we using right now */
2855 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2856 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2857 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2858
2859 const char *last_start = strend - trie->minlen;
6148ee25 2860#ifdef DEBUGGING
73104a1b 2861 const char *real_start = s;
6148ee25 2862#endif
73104a1b
KW
2863 STRLEN maxlen = trie->maxlen;
2864 SV *sv_points;
2865 U8 **points; /* map of where we were in the input string
2866 when reading a given char. For ASCII this
2867 is unnecessary overhead as the relationship
2868 is always 1:1, but for Unicode, especially
2869 case folded Unicode this is not true. */
2870 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2871 U8 *bitmap=NULL;
2872
2873
271b36b1 2874 DECLARE_AND_GET_RE_DEBUG_FLAGS;
73104a1b
KW
2875
2876 /* We can't just allocate points here. We need to wrap it in
2877 * an SV so it gets freed properly if there is a croak while
2878 * running the match */
2879 ENTER;
2880 SAVETMPS;
2881 sv_points=newSV(maxlen * sizeof(U8 *));
2882 SvCUR_set(sv_points,
2883 maxlen * sizeof(U8 *));
2884 SvPOK_on(sv_points);
2885 sv_2mortal(sv_points);
2886 points=(U8**)SvPV_nolen(sv_points );
2887 if ( trie_type != trie_utf8_fold
2888 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2889 {
2890 if (trie->bitmap)
2891 bitmap=(U8*)trie->bitmap;
2892 else
2893 bitmap=(U8*)ANYOF_BITMAP(c);
2894 }
2895 /* this is the Aho-Corasick algorithm modified a touch
2896 to include special handling for long "unknown char" sequences.
2897 The basic idea being that we use AC as long as we are dealing
2898 with a possible matching char, when we encounter an unknown char
2899 (and we have not encountered an accepting state) we scan forward
2900 until we find a legal starting char.
2901 AC matching is basically that of trie matching, except that when
2902 we encounter a failing transition, we fall back to the current
2903 states "fail state", and try the current char again, a process
2904 we repeat until we reach the root state, state 1, or a legal
2905 transition. If we fail on the root state then we can either
2906 terminate if we have reached an accepting state previously, or
2907 restart the entire process from the beginning if we have not.
2908
2909 */
2910 while (s <= last_start) {
2911 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2912 U8 *uc = (U8*)s;
2913 U16 charid = 0;
2914 U32 base = 1;
2915 U32 state = 1;
2916 UV uvc = 0;
2917 STRLEN len = 0;
2918 STRLEN foldlen = 0;
2919 U8 *uscan = (U8*)NULL;
2920 U8 *leftmost = NULL;
2921#ifdef DEBUGGING
2922 U32 accepted_word= 0;
786e8c11 2923#endif
73104a1b
KW
2924 U32 pointpos = 0;
2925
2926 while ( state && uc <= (U8*)strend ) {
2927 int failed=0;
2928 U32 word = aho->states[ state ].wordnum;
2929
2930 if( state==1 ) {
2931 if ( bitmap ) {
2932 DEBUG_TRIE_EXECUTE_r(
2933 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2934 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2935 (char *)uc, utf8_target, 0 );
6ad9a8ab 2936 Perl_re_printf( aTHX_
73104a1b
KW
2937 " Scanning for legal start char...\n");
2938 }
2939 );
2940 if (utf8_target) {
2941 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2942 uc += UTF8SKIP(uc);
2943 }
2944 } else {
2945 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2946 uc++;
2947 }
786e8c11 2948 }
73104a1b 2949 s= (char *)uc;
07be1b83 2950 }
73104a1b
KW
2951 if (uc >(U8*)last_start) break;
2952 }
2953
2954 if ( word ) {
2955 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2956 if (!leftmost || lpos < leftmost) {
2957 DEBUG_r(accepted_word=word);
2958 leftmost= lpos;
7016d6eb 2959 }
73104a1b 2960 if (base==0) break;
7016d6eb 2961
73104a1b
KW
2962 }
2963 points[pointpos++ % maxlen]= uc;
2964 if (foldlen || uc < (U8*)strend) {
9ad8cac4
KW
2965 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2966 (U8 *) strend, uscan, len, uvc,
2967 charid, foldlen, foldbuf,
2968 uniflags);
73104a1b
KW
2969 DEBUG_TRIE_EXECUTE_r({
2970 dump_exec_pos( (char *)uc, c, strend,
cb41e5d6 2971 real_start, s, utf8_target, 0);
6ad9a8ab 2972 Perl_re_printf( aTHX_
147e3846 2973 " Charid:%3u CP:%4" UVxf " ",
73104a1b
KW
2974 charid, uvc);
2975 });
2976 }
2977 else {
2978 len = 0;
2979 charid = 0;
2980 }
07be1b83 2981
73104a1b
KW
2982
2983 do {
6148ee25 2984#ifdef DEBUGGING
73104a1b 2985 word = aho->states[ state ].wordnum;
6148ee25 2986#endif
73104a1b
KW
2987 base = aho->states[ state ].trans.base;
2988
2989 DEBUG_TRIE_EXECUTE_r({
2990 if (failed)
2991 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2992 s, utf8_target, 0 );
6ad9a8ab 2993 Perl_re_printf( aTHX_
147e3846 2994 "%sState: %4" UVxf ", word=%" UVxf,
73104a1b
KW
2995 failed ? " Fail transition to " : "",
2996 (UV)state, (UV)word);
2997 });
2998 if ( base ) {
2999 U32 tmp;
3000 I32 offset;
3001 if (charid &&
3002 ( ((offset = base + charid
3003 - 1 - trie->uniquecharcount)) >= 0)
3004 && ((U32)offset < trie->lasttrans)
3005 && trie->trans[offset].check == state
3006 && (tmp=trie->trans[offset].next))
3007 {
3008 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3009 Perl_re_printf( aTHX_ " - legal\n"));
73104a1b
KW
3010 state = tmp;
3011 break;
07be1b83
YO
3012 }
3013 else {
786e8c11 3014 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3015 Perl_re_printf( aTHX_ " - fail\n"));
786e8c11 3016 failed = 1;
73104a1b 3017 state = aho->fail[state];
07be1b83 3018 }
07be1b83 3019 }
73104a1b
KW
3020 else {
3021 /* we must be accepting here */
3022 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3023 Perl_re_printf( aTHX_ " - accepting\n"));
73104a1b
KW
3024 failed = 1;
3025 break;
786e8c11 3026 }
73104a1b
KW
3027 } while(state);
3028 uc += len;
3029 if (failed) {
3030 if (leftmost)
3031 break;
3032 if (!state) state = 1;
07be1b83 3033 }
73104a1b
KW
3034 }
3035 if ( aho->states[ state ].wordnum ) {
3036 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3037 if (!leftmost || lpos < leftmost) {
3038 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3039 leftmost = lpos;
07be1b83
YO
3040 }
3041 }
73104a1b
KW
3042 if (leftmost) {
3043 s = (char*)leftmost;
3044 DEBUG_TRIE_EXECUTE_r({
147e3846 3045 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
73104a1b
KW
3046 (UV)accepted_word, (IV)(s - real_start)
3047 );
3048 });
02d5137b 3049 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
3050 FREETMPS;
3051 LEAVE;
3052 goto got_it;
3053 }
8c9c2723
KW
3054 if (s < reginfo->strend) {
3055 s = HOPc(s,1);
3056 }
73104a1b 3057 DEBUG_TRIE_EXECUTE_r({
6ad9a8ab 3058 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
73104a1b
KW
3059 });
3060 } else {
3061 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 3062 Perl_re_printf( aTHX_ "No match.\n"));
73104a1b
KW
3063 break;
3064 }
3065 }
3066 FREETMPS;
3067 LEAVE;
3068 }
3069 break;
3070 default:
3071 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
3072 }
3073 return 0;
3074 got_it:
3075 return s;
6eb5f6b9
JH
3076}
3077
60165aa4
DM
3078/* set RX_SAVED_COPY, RX_SUBBEG etc.
3079 * flags have same meanings as with regexec_flags() */
3080
749f4950
DM
3081static void
3082S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
3083 char *strbeg,
3084 char *strend,
3085 SV *sv,
3086 U32 flags,
3087 bool utf8_target)
3088{
3089 struct regexp *const prog = ReANY(rx);
3090
60165aa4
DM
3091 if (flags & REXEC_COPY_STR) {
3092#ifdef PERL_ANY_COW
3093 if (SvCANCOW(sv)) {
eb8fc9fe 3094 DEBUG_C(Perl_re_printf( aTHX_
60165aa4 3095 "Copy on write: regexp capture, type %d\n",
eb8fc9fe 3096 (int) SvTYPE(sv)));
5411a0e5
DM
3097 /* Create a new COW SV to share the match string and store
3098 * in saved_copy, unless the current COW SV in saved_copy
3099 * is valid and suitable for our purpose */
3100 if (( prog->saved_copy
3101 && SvIsCOW(prog->saved_copy)
3102 && SvPOKp(prog->saved_copy)
3103 && SvIsCOW(sv)
3104 && SvPOKp(sv)
3105 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 3106 {
5411a0e5
DM
3107 /* just reuse saved_copy SV */
3108 if (RXp_MATCH_COPIED(prog)) {
3109 Safefree(prog->subbeg);
3110 RXp_MATCH_COPIED_off(prog);
3111 }
3112 }
3113 else {
3114 /* create new COW SV to share string */
196a02af 3115 RXp_MATCH_COPY_FREE(prog);
a76b0e90 3116 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 3117 }
5411a0e5
DM
3118 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3119 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
3120 prog->sublen = strend - strbeg;
3121 prog->suboffset = 0;
3122 prog->subcoffset = 0;
3123 } else
3124#endif
3125 {
99a90e59
FC
3126 SSize_t min = 0;
3127 SSize_t max = strend - strbeg;
ea3daa5d 3128 SSize_t sublen;
60165aa4
DM
3129
3130 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 3131 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3132 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3133 ) { /* don't copy $' part of string */
3134 U32 n = 0;
3135 max = -1;
3136 /* calculate the right-most part of the string covered
f67a5002 3137 * by a capture. Due to lookahead, this may be to
60165aa4
DM
3138 * the right of $&, so we have to scan all captures */
3139 while (n <= prog->lastparen) {
3140 if (prog->offs[n].end > max)
3141 max = prog->offs[n].end;
3142 n++;
3143 }
3144 if (max == -1)
3145 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3146 ? prog->offs[0].start
3147 : 0;
3148 assert(max >= 0 && max <= strend - strbeg);
3149 }
3150
3151 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 3152 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
3153 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3154 ) { /* don't copy $` part of string */
3155 U32 n = 0;
3156 min = max;
3157 /* calculate the left-most part of the string covered
f67a5002 3158 * by a capture. Due to lookbehind, this may be to
60165aa4
DM
3159 * the left of $&, so we have to scan all captures */
3160 while (min && n <= prog->lastparen) {
3161 if ( prog->offs[n].start != -1
3162 && prog->offs[n].start < min)
3163 {
3164 min = prog->offs[n].start;
3165 }
3166 n++;
3167 }
3168 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3169 && min > prog->offs[0].end
3170 )
3171 min = prog->offs[0].end;
3172
3173 }
3174
3175 assert(min >= 0 && min <= max && min <= strend - strbeg);
3176 sublen = max - min;
3177
196a02af 3178 if (RXp_MATCH_COPIED(prog)) {
60165aa4
DM
3179 if (sublen > prog->sublen)
3180 prog->subbeg =
3181 (char*)saferealloc(prog->subbeg, sublen+1);
3182 }
3183 else
3184 prog->subbeg = (char*)safemalloc(sublen+1);
3185 Copy(strbeg + min, prog->subbeg, sublen, char);
3186 prog->subbeg[sublen] = '\0';
3187 prog->suboffset = min;
3188 prog->sublen = sublen;
196a02af 3189 RXp_MATCH_COPIED_on(prog);
60165aa4
DM
3190 }
3191 prog->subcoffset = prog->suboffset;
3192 if (prog->suboffset && utf8_target) {
3193 /* Convert byte offset to chars.
3194 * XXX ideally should only compute this if @-/@+
3195 * has been seen, a la PL_sawampersand ??? */
3196
3197 /* If there's a direct correspondence between the
3198 * string which we're matching and the original SV,
3199 * then we can use the utf8 len cache associated with
3200 * the SV. In particular, it means that under //g,
3201 * sv_pos_b2u() will use the previously cached
3202 * position to speed up working out the new length of
3203 * subcoffset, rather than counting from the start of
3204 * the string each time. This stops
3205 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3206 * from going quadratic */
3207 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
3208 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3209 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
3210 else
3211 prog->subcoffset = utf8_length((U8*)strbeg,
3212 (U8*)(strbeg+prog->suboffset));
3213 }
3214 }
3215 else {
196a02af 3216 RXp_MATCH_COPY_FREE(prog);
60165aa4
DM
3217 prog->subbeg = strbeg;
3218 prog->suboffset = 0;
3219 prog->subcoffset = 0;
3220 prog->sublen = strend - strbeg;
3221 }
3222}
3223
3224
3225
fae667d5 3226
6eb5f6b9
JH
3227/*
3228 - regexec_flags - match a regexp against a string
3229 */
3230I32
5aaab254 3231Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 3232 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
3233/* stringarg: the point in the string at which to begin matching */
3234/* strend: pointer to null at end of string */
3235/* strbeg: real beginning of string */
3236/* minend: end of match must be >= minend bytes after stringarg. */
3237/* sv: SV being matched: only used for utf8 flag, pos() etc; string
3238 * itself is accessed via the pointers above */
3239/* data: May be used for some additional optimizations.
d058ec57 3240 Currently unused. */
a340edde 3241/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 3242
6eb5f6b9 3243{
8d919b0a 3244 struct regexp *const prog = ReANY(rx);
5aaab254 3245 char *s;
eb578fdb 3246 regnode *c;
03c83e26 3247 char *startpos;
ea3daa5d
FC
3248 SSize_t minlen; /* must match at least this many chars */
3249 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 3250 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 3251 I32 multiline;
f8fc2ecf 3252 RXi_GET_DECL(prog,progi);
02d5137b
DM
3253 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3254 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 3255 regexp_paren_pair *swap = NULL;
006f26b2 3256 I32 oldsave;
271b36b1 3257 DECLARE_AND_GET_RE_DEBUG_FLAGS;
a3621e74 3258
7918f24d 3259 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 3260 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
3261
3262 /* Be paranoid... */
3dc78631 3263 if (prog == NULL) {
6eb5f6b9 3264 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
3265 }
3266
6c3fea77 3267 DEBUG_EXECUTE_r(
03c83e26 3268 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
3269 "Matching");
3270 );
8adc0f72 3271
b342a604
DM
3272 startpos = stringarg;
3273
4cf1a867
DM
3274 /* set these early as they may be used by the HOP macros below */
3275 reginfo->strbeg = strbeg;
3276 reginfo->strend = strend;
3277 reginfo->is_utf8_target = cBOOL(utf8_target);
3278
58430ea8 3279 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
3280 MAGIC *mg;
3281
fef7148b
DM
3282 /* set reginfo->ganch, the position where \G can match */
3283
3284 reginfo->ganch =
3285 (flags & REXEC_IGNOREPOS)
3286 ? stringarg /* use start pos rather than pos() */
3dc78631 3287 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
3288 /* Defined pos(): */
3289 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
3290 : strbeg; /* pos() not defined; use start of string */
3291
6ad9a8ab 3292 DEBUG_GPOS_r(Perl_re_printf( aTHX_
147e3846 3293 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 3294
03c83e26
DM
3295 /* in the presence of \G, we may need to start looking earlier in
3296 * the string than the suggested start point of stringarg:
0b2c2a84 3297 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
3298 * offset, such as
3299 * /..\G/: gofs = 2
3300 * /ab|c\G/: gofs = 1
3301 * or if the minimum offset isn't known, then we have to go back
3302 * to the start of the string, e.g. /w+\G/
3303 */
2bfbe302 3304
8e1490ee 3305 if (prog->intflags & PREGf_ANCH_GPOS) {
4cf1a867
DM
3306 if (prog->gofs) {
3307 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3308 if (!startpos ||
3309 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3310 {
6d4548a7 3311 DEBUG_GPOS_r(Perl_re_printf( aTHX_
4cf1a867
DM
3312 "fail: ganch-gofs before earliest possible start\n"));
3313 return 0;
3314 }
2bfbe302 3315 }
4cf1a867
DM
3316 else
3317 startpos = reginfo->ganch;
2bfbe302
DM
3318 }
3319 else if (prog->gofs) {
4cf1a867
DM
3320 startpos = HOPBACKc(startpos, prog->gofs);
3321 if (!startpos)
b342a604 3322 startpos = strbeg;
03c83e26 3323 }
58430ea8 3324 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 3325 startpos = strbeg;
03c83e26
DM
3326 }
3327
3328 minlen = prog->minlen;
b342a604 3329 if ((startpos + minlen) > strend || startpos < strbeg) {
6a86f90c
KW
3330 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3331 "Regex match can't succeed, so not even tried\n"));
03c83e26
DM
3332 return 0;
3333 }
3334
63a3746a
DM
3335 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3336 * which will call destuctors to reset PL_regmatch_state, free higher
3337 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3338 * regmatch_info_aux_eval */
3339
3340 oldsave = PL_savestack_ix;
3341
dfa77d06
DM
3342 s = startpos;
3343
e322109a 3344 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
3345 && !(flags & REXEC_CHECKED))
3346 {
dfa77d06 3347 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 3348 flags, NULL);
dfa77d06 3349 if (!s)
7fadf4a7
DM
3350 return 0;
3351
e322109a 3352 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
3353 /* we can match based purely on the result of INTUIT.
3354 * Set up captures etc just for $& and $-[0]
3355 * (an intuit-only match wont have $1,$2,..) */
3356 assert(!prog->nparens);
d5e7783a
DM
3357
3358 /* s/// doesn't like it if $& is earlier than where we asked it to
3359 * start searching (which can happen on something like /.\G/) */
3360 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3361 && (s < stringarg))
3362 {
3363 /* this should only be possible under \G */
58430ea8 3364 assert(prog->intflags & PREGf_GPOS_SEEN);
6ad9a8ab 3365 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d5e7783a
DM
3366 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3367 goto phooey;
3368 }
3369
7fadf4a7
DM
3370 /* match via INTUIT shouldn't have any captures.
3371 * Let @-, @+, $^N know */
3372 prog->lastparen = prog->lastcloseparen = 0;
196a02af 3373 RXp_MATCH_UTF8_set(prog, utf8_target);
3ff69bd6
DM
3374 prog->offs[0].start = s - strbeg;
3375 prog->offs[0].end = utf8_target
8875b6de 3376 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3ff69bd6 3377 : s - strbeg + prog->minlenret;
7fadf4a7 3378 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3379 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
3380 strbeg, strend,
3381 sv, flags, utf8_target);
3382
7fadf4a7
DM
3383 return 1;
3384 }
3385 }
3386
6c3fea77 3387 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 3388
dfa77d06 3389 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
6ad9a8ab 3390 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a72c7584
JH
3391 "String too short [regexec_flags]...\n"));
3392 goto phooey;
1aa99e6b 3393 }
1de06328 3394
6eb5f6b9 3395 /* Check validity of program. */
f8fc2ecf 3396 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
3397 Perl_croak(aTHX_ "corrupted regexp program");
3398 }
3399
196a02af
DM
3400 RXp_MATCH_TAINTED_off(prog);
3401 RXp_MATCH_UTF8_set(prog, utf8_target);
1738e041 3402
6c3fea77
DM
3403 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3404 reginfo->intuit = 0;
02d5137b
DM
3405 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3406 reginfo->warned = FALSE;
02d5137b 3407 reginfo->sv = sv;
1cb48e53 3408 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
6eb5f6b9 3409 /* see how far we have to get to not match where we matched before */
fe3974be 3410 reginfo->till = stringarg + minend;
6eb5f6b9 3411
60779a30 3412 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
3413 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3414 S_cleanup_regmatch_info_aux has executed (registered by
3415 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3416 magic belonging to this SV.
3417 Not newSVsv, either, as it does not COW.
3418 */
3419 reginfo->sv = newSV(0);
4cba5ac0 3420 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
3421 SAVEFREESV(reginfo->sv);
3422 }
3423
331b2dcc
DM
3424 /* reserve next 2 or 3 slots in PL_regmatch_state:
3425 * slot N+0: may currently be in use: skip it
3426 * slot N+1: use for regmatch_info_aux struct
3427 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3428 * slot N+3: ready for use by regmatch()
3429 */
bf2039a9 3430
331b2dcc
DM
3431 {
3432 regmatch_state *old_regmatch_state;
3433 regmatch_slab *old_regmatch_slab;
3434 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3435
3436 /* on first ever match, allocate first slab */
3437 if (!PL_regmatch_slab) {
3438 Newx(PL_regmatch_slab, 1, regmatch_slab);
3439 PL_regmatch_slab->prev = NULL;
3440 PL_regmatch_slab->next = NULL;
3441 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3442 }
bf2039a9 3443
331b2dcc
DM
3444 old_regmatch_state = PL_regmatch_state;
3445 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 3446
331b2dcc
DM
3447 for (i=0; i <= max; i++) {
3448 if (i == 1)
3449 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3450 else if (i ==2)
3451 reginfo->info_aux_eval =
3452 reginfo->info_aux->info_aux_eval =
3453 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3454
331b2dcc
DM
3455