This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regmatch(): add asserts for legal values of nextchr
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
42const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
44
a687059c 45/*
e50aee73 46 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
47 *
48 * Copyright (c) 1986 by University of Toronto.
49 * Written by Henry Spencer. Not derived from licensed software.
50 *
51 * Permission is granted to anyone to use this software for any
52 * purpose on any computer system, and to redistribute it freely,
53 * subject to the following restrictions:
54 *
55 * 1. The author is not responsible for the consequences of use of
56 * this software, no matter how awful, even if they arise
57 * from defects in it.
58 *
59 * 2. The origin of this software must not be misrepresented, either
60 * by explicit claim or by omission.
61 *
62 * 3. Altered versions must be plainly marked as such, and must not
63 * be misrepresented as being the original software.
64 *
65 **** Alterations to Henry's code are...
66 ****
4bb101f2 67 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
68 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
69 **** by Larry Wall and others
a687059c 70 ****
9ef589d8
LW
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
a687059c
LW
73 *
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
77 */
78#include "EXTERN.h"
864dbfa3 79#define PERL_IN_REGEXEC_C
a687059c 80#include "perl.h"
0f5d15d6 81
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
81e983c1 88#include "inline_invlist.c"
1b0f46bf 89#include "unicode_constants.h"
81e983c1 90
ef07e810 91#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 92#define RF_warned 2 /* warned about big count? */
faec1544 93
ab3bbdeb 94#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 95
f2ed9b32 96#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
e0193e47 104/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
af364d03
KW
107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 115#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad
YO
117#define HOPc(pos,off) \
118 (char *)(PL_reg_match_utf8 \
52657f30 119 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
120 : (U8*)(pos + off))
121#define HOPBACKc(pos, off) \
07be1b83
YO
122 (char*)(PL_reg_match_utf8\
123 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
124 : (pos - off >= PL_bostr) \
8e11feef 125 ? (U8*)pos - off \
3dab1dad 126 : NULL)
efb30f32 127
e7409c1b 128#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 129#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 130
7016d6eb
DM
131
132#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
133#define NEXTCHR_IS_EOS (nextchr < 0)
134
135#define SET_nextchr \
136 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
137
138#define SET_locinput(p) \
139 locinput = (p); \
140 SET_nextchr
141
142
20d0b1e9 143/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 144#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 145 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 146 bool ok; \
9c4fdda1 147 ENTER; save_re_context(); \
cf54f63a
JL
148 ok=CAT2(is_utf8_,class)((const U8*)str); \
149 PERL_UNUSED_VAR(ok); \
150 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
151/* Doesn't do an assert to verify that is correct */
152#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 153 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 154 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
155 ENTER; save_re_context(); \
156 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
157 LEAVE; } } STMT_END
37e2e78e 158
1a4fad37
AL
159#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
160#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
161#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 162
37e2e78e 163#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
164 /* No asserts are done for some of these, in case called on a */ \
165 /* Unicode version in which they map to nothing */ \
27d4fc33 166 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 167 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 168
1dcf4a1b 169#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 170
ee9a90b8
KW
171/* The actual code for CCC_TRY, which uses several variables from the routine
172 * it's callable from. It is designed to be the bulk of a case statement.
173 * FUNC is the macro or function to call on non-utf8 targets that indicate if
174 * nextchr matches the class.
175 * UTF8_TEST is the whole test string to use for utf8 targets
176 * LOAD is what to use to test, and if not present to load in the swash for the
177 * class
178 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
179 * UTF8_TEST test.
180 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
181 * utf8 and a variant, load the swash if necessary and test using the utf8
182 * test. Advance to the next character if test is ok, otherwise fail; If not
183 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
184 * fails, or advance to the next character */
185
186#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
7016d6eb 187 if (NEXTCHR_IS_EOS) { \
ee9a90b8
KW
188 sayNO; \
189 } \
190 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
191 LOAD_UTF8_CHARCLASS(CLASS, STR); \
192 if (POS_OR_NEG (UTF8_TEST)) { \
193 sayNO; \
194 } \
ee9a90b8 195 } \
28b98f76
DM
196 else if (POS_OR_NEG (FUNC(nextchr))) { \
197 sayNO; \
ee9a90b8 198 } \
28b98f76 199 goto increment_locinput;
980866de 200
ee9a90b8
KW
201/* Handle the non-locale cases for a character class and its complement. It
202 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
203 * This is because that code fails when the test succeeds, so we want to have
204 * the test fail so that the code succeeds. The swash is stored in a
205 * predictable PL_ place */
cfaf538b
KW
206#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
207 CLASS, STR) \
ee9a90b8
KW
208 case NAME: \
209 _CCC_TRY_CODE( !, FUNC, \
210 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
211 (U8*)locinput, TRUE)), \
212 CLASS, STR) \
213 case NNAME: \
1dcf4a1b 214 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
215 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
216 (U8*)locinput, TRUE)), \
217 CLASS, STR) \
218
219/* Generate the case statements for both locale and non-locale character
220 * classes in regmatch for classes that don't have special unicode semantics.
221 * Locales don't use an immediate swash, but an intermediary special locale
222 * function that is called on the pointer to the current place in the input
223 * string. That function will resolve to needing the same swash. One might
224 * think that because we don't know what the locale will match, we shouldn't
225 * check with the swash loading function that it loaded properly; ie, that we
226 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
227 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
228 * irrelevant here */
229#define CCC_TRY(NAME, NNAME, FUNC, \
230 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 231 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
232 CLASS, STR) \
233 case NAMEL: \
234 PL_reg_flags |= RF_tainted; \
235 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
236 case NNAMEL: \
237 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
238 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
239 CLASS, STR) \
cfaf538b 240 case NAMEA: \
7016d6eb 241 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
cfaf538b
KW
242 sayNO; \
243 } \
244 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
3640db6b 245 locinput++; \
cfaf538b
KW
246 break; \
247 case NNAMEA: \
7016d6eb 248 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
cfaf538b
KW
249 sayNO; \
250 } \
28b98f76 251 goto increment_locinput; \
ee9a90b8
KW
252 /* Generate the non-locale cases */ \
253 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
254
255/* This is like CCC_TRY, but has an extra set of parameters for generating case
256 * statements to handle separate Unicode semantics nodes */
257#define CCC_TRY_U(NAME, NNAME, FUNC, \
258 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
259 NAMEU, NNAMEU, FUNCU, \
cfaf538b 260 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 261 CLASS, STR) \
cfaf538b
KW
262 CCC_TRY(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEA, NNAMEA, FUNCA, \
265 CLASS, STR) \
ee9a90b8 266 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 267
3dab1dad
YO
268/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
269
5f80c4cf 270/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
271/* it would be nice to rework regcomp.sym to generate this stuff. sigh
272 *
273 * NOTE that *nothing* that affects backtracking should be in here, specifically
274 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
275 * node that is in between two EXACT like nodes when ascertaining what the required
276 * "follow" character is. This should probably be moved to regex compile time
277 * although it may be done at run time beause of the REF possibility - more
278 * investigation required. -- demerphq
279*/
3e901dc0
YO
280#define JUMPABLE(rn) ( \
281 OP(rn) == OPEN || \
282 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
283 OP(rn) == EVAL || \
cca55fe3
JP
284 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
285 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 286 OP(rn) == KEEPS || \
3dab1dad 287 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 288)
ee9b8eae 289#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 290
ee9b8eae
YO
291#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
292
293#if 0
294/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
295 we don't need this definition. */
296#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 297#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
298#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
299
300#else
301/* ... so we use this as its faster. */
302#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 303#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
304#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
305#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
306
307#endif
e2d8ce26 308
a84d97b6
HS
309/*
310 Search for mandatory following text node; for lookahead, the text must
311 follow but for lookbehind (rn->flags != 0) we skip to the next step.
312*/
cca55fe3 313#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
314 while (JUMPABLE(rn)) { \
315 const OPCODE type = OP(rn); \
316 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 317 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 318 else if (type == PLUS) \
cca55fe3 319 rn = NEXTOPER(rn); \
3dab1dad 320 else if (type == IFMATCH) \
a84d97b6 321 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 322 else rn += NEXT_OFF(rn); \
3dab1dad 323 } \
5f80c4cf 324} STMT_END
74750237 325
c476f425 326
acfe0abc 327static void restore_pos(pTHX_ void *arg);
51371543 328
87c0511b 329#define REGCP_PAREN_ELEMS 3
f067efbf 330#define REGCP_OTHER_ELEMS 3
e0fa7e2b 331#define REGCP_FRAME_ELEMS 1
620d5b66
NC
332/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
333 * are needed for the regexp context stack bookkeeping. */
334
76e3520e 335STATIC CHECKPOINT
b93070ed 336S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 337{
97aff369 338 dVAR;
a3b680e6 339 const int retval = PL_savestack_ix;
a3b680e6 340 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
341 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
342 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 343 I32 p;
40a82448 344 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 345
b93070ed
DM
346 PERL_ARGS_ASSERT_REGCPPUSH;
347
e49a9654 348 if (paren_elems_to_push < 0)
5637ef5b
NC
349 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
350 paren_elems_to_push);
e49a9654 351
e0fa7e2b
NC
352 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
353 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
354 " out of range (%lu-%ld)",
355 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 356
620d5b66 357 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 358
495f47a5
DM
359 DEBUG_BUFFERS_r(
360 if ((int)PL_regsize > (int)parenfloor)
361 PerlIO_printf(Perl_debug_log,
362 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
363 PTR2UV(rex),
364 PTR2UV(rex->offs)
365 );
366 );
87c0511b 367 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 368/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
369 SSPUSHINT(rex->offs[p].end);
370 SSPUSHINT(rex->offs[p].start);
1ca2007e 371 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 372 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
373 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
374 (UV)p,
375 (IV)rex->offs[p].start,
376 (IV)rex->offs[p].start_tmp,
377 (IV)rex->offs[p].end
40a82448 378 ));
a0d0e21e 379 }
b1ce53c5 380/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 381 SSPUSHINT(PL_regsize);
b93070ed
DM
382 SSPUSHINT(rex->lastparen);
383 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 384 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 385
a0d0e21e
LW
386 return retval;
387}
388
c277df42 389/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
390#define REGCP_SET(cp) \
391 DEBUG_STATE_r( \
ab3bbdeb 392 PerlIO_printf(Perl_debug_log, \
e4f74956 393 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
394 (IV)PL_savestack_ix)); \
395 cp = PL_savestack_ix
c3464db5 396
ab3bbdeb 397#define REGCP_UNWIND(cp) \
e4f74956 398 DEBUG_STATE_r( \
ab3bbdeb 399 if (cp != PL_savestack_ix) \
e4f74956
YO
400 PerlIO_printf(Perl_debug_log, \
401 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
402 (IV)(cp), (IV)PL_savestack_ix)); \
403 regcpblow(cp)
c277df42 404
a8d1f4b4
DM
405#define UNWIND_PAREN(lp, lcp) \
406 for (n = rex->lastparen; n > lp; n--) \
407 rex->offs[n].end = -1; \
408 rex->lastparen = n; \
409 rex->lastcloseparen = lcp;
410
411
f067efbf 412STATIC void
b93070ed 413S_regcppop(pTHX_ regexp *rex)
a0d0e21e 414{
97aff369 415 dVAR;
e0fa7e2b 416 UV i;
87c0511b 417 U32 paren;
a3621e74
YO
418 GET_RE_DEBUG_FLAGS_DECL;
419
7918f24d
NC
420 PERL_ARGS_ASSERT_REGCPPOP;
421
b1ce53c5 422 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 423 i = SSPOPUV;
e0fa7e2b
NC
424 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
425 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
426 rex->lastcloseparen = SSPOPINT;
427 rex->lastparen = SSPOPINT;
3280af22 428 PL_regsize = SSPOPINT;
b1ce53c5 429
620d5b66 430 i -= REGCP_OTHER_ELEMS;
b1ce53c5 431 /* Now restore the parentheses context. */
495f47a5
DM
432 DEBUG_BUFFERS_r(
433 if (i || rex->lastparen + 1 <= rex->nparens)
434 PerlIO_printf(Perl_debug_log,
435 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
436 PTR2UV(rex),
437 PTR2UV(rex->offs)
438 );
439 );
87c0511b 440 paren = PL_regsize;
620d5b66 441 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 442 I32 tmps;
1ca2007e 443 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 444 rex->offs[paren].start = SSPOPINT;
cf93c79d 445 tmps = SSPOPINT;
b93070ed
DM
446 if (paren <= rex->lastparen)
447 rex->offs[paren].end = tmps;
495f47a5
DM
448 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
449 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
450 (UV)paren,
451 (IV)rex->offs[paren].start,
452 (IV)rex->offs[paren].start_tmp,
453 (IV)rex->offs[paren].end,
454 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 455 );
87c0511b 456 paren--;
a0d0e21e 457 }
daf18116 458#if 1
dafc8851
JH
459 /* It would seem that the similar code in regtry()
460 * already takes care of this, and in fact it is in
461 * a better location to since this code can #if 0-ed out
462 * but the code in regtry() is needed or otherwise tests
463 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
464 * (as of patchlevel 7877) will fail. Then again,
465 * this code seems to be necessary or otherwise
225593e1
DM
466 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
467 * --jhi updated by dapm */
b93070ed 468 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 469 if (i > PL_regsize)
b93070ed
DM
470 rex->offs[i].start = -1;
471 rex->offs[i].end = -1;
495f47a5
DM
472 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
473 " \\%"UVuf": %s ..-1 undeffing\n",
474 (UV)i,
475 (i > PL_regsize) ? "-1" : " "
476 ));
a0d0e21e 477 }
dafc8851 478#endif
a0d0e21e
LW
479}
480
74088413
DM
481/* restore the parens and associated vars at savestack position ix,
482 * but without popping the stack */
483
484STATIC void
485S_regcp_restore(pTHX_ regexp *rex, I32 ix)
486{
487 I32 tmpix = PL_savestack_ix;
488 PL_savestack_ix = ix;
489 regcppop(rex);
490 PL_savestack_ix = tmpix;
491}
492
02db2b7b 493#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 494
a687059c 495/*
e50aee73 496 * pregexec and friends
a687059c
LW
497 */
498
76234dfb 499#ifndef PERL_IN_XSUB_RE
a687059c 500/*
c277df42 501 - pregexec - match a regexp against a string
a687059c 502 */
c277df42 503I32
49d7dfbc 504Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 505 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
506/* stringarg: the point in the string at which to begin matching */
507/* strend: pointer to null at end of string */
508/* strbeg: real beginning of string */
509/* minend: end of match must be >= minend bytes after stringarg. */
510/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
511 * itself is accessed via the pointers above */
512/* nosave: For optimizations. */
c277df42 513{
7918f24d
NC
514 PERL_ARGS_ASSERT_PREGEXEC;
515
c277df42 516 return
9041c2e3 517 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
518 nosave ? 0 : REXEC_COPY_STR);
519}
76234dfb 520#endif
22e551b9 521
9041c2e3 522/*
cad2e5aa
JH
523 * Need to implement the following flags for reg_anch:
524 *
525 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
526 * USE_INTUIT_ML
527 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
528 * INTUIT_AUTORITATIVE_ML
529 * INTUIT_ONCE_NOML - Intuit can match in one location only.
530 * INTUIT_ONCE_ML
531 *
532 * Another flag for this function: SECOND_TIME (so that float substrs
533 * with giant delta may be not rechecked).
534 */
535
536/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
537
3f7c398e 538/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
539 Otherwise, only SvCUR(sv) is used to get strbeg. */
540
541/* XXXX We assume that strpos is strbeg unless sv. */
542
6eb5f6b9
JH
543/* XXXX Some places assume that there is a fixed substring.
544 An update may be needed if optimizer marks as "INTUITable"
545 RExen without fixed substrings. Similarly, it is assumed that
546 lengths of all the strings are no more than minlen, thus they
547 cannot come from lookahead.
40d049e4
YO
548 (Or minlen should take into account lookahead.)
549 NOTE: Some of this comment is not correct. minlen does now take account
550 of lookahead/behind. Further research is required. -- demerphq
551
552*/
6eb5f6b9 553
2c2d71f5
JH
554/* A failure to find a constant substring means that there is no need to make
555 an expensive call to REx engine, thus we celebrate a failure. Similarly,
556 finding a substring too deep into the string means that less calls to
30944b6d
IZ
557 regtry() should be needed.
558
559 REx compiler's optimizer found 4 possible hints:
560 a) Anchored substring;
561 b) Fixed substring;
562 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 563 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 564 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
565 string which does not contradict any of them.
566 */
2c2d71f5 567
6eb5f6b9
JH
568/* Most of decisions we do here should have been done at compile time.
569 The nodes of the REx which we used for the search should have been
570 deleted from the finite automaton. */
571
cad2e5aa 572char *
288b8c02 573Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 574 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 575{
97aff369 576 dVAR;
288b8c02 577 struct regexp *const prog = (struct regexp *)SvANY(rx);
eb578fdb 578 I32 start_shift = 0;
cad2e5aa 579 /* Should be nonnegative! */
eb578fdb
KW
580 I32 end_shift = 0;
581 char *s;
582 SV *check;
a1933d95 583 char *strbeg;
cad2e5aa 584 char *t;
f2ed9b32 585 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 586 I32 ml_anch;
eb578fdb 587 char *other_last = NULL; /* other substr checked before this */
bd61b366 588 char *check_at = NULL; /* check substr found at this pos */
d8080198 589 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 590 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 591 RXi_GET_DECL(prog,progi);
30944b6d 592#ifdef DEBUGGING
890ce7af 593 const char * const i_strpos = strpos;
30944b6d 594#endif
a3621e74
YO
595 GET_RE_DEBUG_FLAGS_DECL;
596
7918f24d 597 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
598 PERL_UNUSED_ARG(flags);
599 PERL_UNUSED_ARG(data);
7918f24d 600
f2ed9b32 601 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 602
3c8556c3 603 if (RX_UTF8(rx)) {
b8d68ded
JH
604 PL_reg_flags |= RF_utf8;
605 }
ab3bbdeb 606 DEBUG_EXECUTE_r(
f2ed9b32 607 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
608 sv ? "Guessing start of match in sv for"
609 : "Guessing start of match in string for");
2a782b5b 610 );
cad2e5aa 611
c344f387
JH
612 /* CHR_DIST() would be more correct here but it makes things slow. */
613 if (prog->minlen > strend - strpos) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 615 "String too short... [re_intuit_start]\n"));
cad2e5aa 616 goto fail;
2c2d71f5 617 }
1de06328 618
7016d6eb
DM
619 /* XXX we need to pass strbeg as a separate arg: the following is
620 * guesswork and can be wrong... */
621 if (sv && SvPOK(sv)) {
622 char * p = SvPVX(sv);
623 STRLEN cur = SvCUR(sv);
624 if (p <= strpos && strpos < p + cur) {
625 strbeg = p;
626 assert(p <= strend && strend <= p + cur);
627 }
628 else
629 strbeg = strend - cur;
630 }
631 else
632 strbeg = strpos;
633
1aa99e6b 634 PL_regeol = strend;
f2ed9b32 635 if (utf8_target) {
33b8afdf
JH
636 if (!prog->check_utf8 && prog->check_substr)
637 to_utf8_substr(prog);
638 check = prog->check_utf8;
639 } else {
7e0d5ad7
KW
640 if (!prog->check_substr && prog->check_utf8) {
641 if (! to_byte_substr(prog)) {
642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
643 non_utf8_target_but_utf8_required));
644 goto fail;
645 }
646 }
33b8afdf
JH
647 check = prog->check_substr;
648 }
bbe252da
YO
649 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
650 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
651 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 652 && !multiline ) ); /* Check after \n? */
cad2e5aa 653
7e25d62c 654 if (!ml_anch) {
bbe252da
YO
655 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
656 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 657 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
658 && sv && !SvROK(sv)
659 && (strpos != strbeg)) {
a3621e74 660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
661 goto fail;
662 }
663 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 664 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 665 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
666 I32 slen;
667
1aa99e6b 668 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 669
653099ff
GS
670 if (SvTAIL(check)) {
671 slen = SvCUR(check); /* >= 1 */
cad2e5aa 672
9041c2e3 673 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 674 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 675 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 676 goto fail_finish;
cad2e5aa
JH
677 }
678 /* Now should match s[0..slen-2] */
679 slen--;
3f7c398e 680 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 681 || (slen > 1
3f7c398e 682 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 683 report_neq:
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
685 goto fail_finish;
686 }
cad2e5aa 687 }
3f7c398e 688 else if (*SvPVX_const(check) != *s
653099ff 689 || ((slen = SvCUR(check)) > 1
3f7c398e 690 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 691 goto report_neq;
c315bfe8 692 check_at = s;
2c2d71f5 693 goto success_at_start;
7e25d62c 694 }
cad2e5aa 695 }
2c2d71f5 696 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 697 s = strpos;
2c2d71f5 698 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
699 end_shift = prog->check_end_shift;
700
2c2d71f5 701 if (!ml_anch) {
a3b680e6 702 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 703 - (SvTAIL(check) != 0);
a3b680e6 704 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
705
706 if (end_shift < eshift)
707 end_shift = eshift;
708 }
cad2e5aa 709 }
2c2d71f5 710 else { /* Can match at random position */
cad2e5aa
JH
711 ml_anch = 0;
712 s = strpos;
1de06328
YO
713 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
714 end_shift = prog->check_end_shift;
715
716 /* end shift should be non negative here */
cad2e5aa
JH
717 }
718
bcdf7404 719#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 720 if (end_shift < 0)
1de06328 721 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 722 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
723#endif
724
2c2d71f5
JH
725 restart:
726 /* Find a possible match in the region s..strend by looking for
727 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
728
729 {
730 I32 srch_start_shift = start_shift;
731 I32 srch_end_shift = end_shift;
c33e64f0
FC
732 U8* start_point;
733 U8* end_point;
1de06328
YO
734 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
735 srch_end_shift -= ((strbeg - s) - srch_start_shift);
736 srch_start_shift = strbeg - s;
737 }
6bda09f9 738 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
739 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
740 (IV)prog->check_offset_min,
741 (IV)srch_start_shift,
742 (IV)srch_end_shift,
743 (IV)prog->check_end_shift);
744 });
745
bbe252da 746 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
747 start_point= (U8*)(s + srch_start_shift);
748 end_point= (U8*)(strend - srch_end_shift);
749 } else {
750 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
751 end_point= HOP3(strend, -srch_end_shift, strbeg);
752 }
6bda09f9 753 DEBUG_OPTIMISE_MORE_r({
56570a2c 754 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 755 (int)(end_point - start_point),
fc8cd66c 756 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
757 start_point);
758 });
759
760 s = fbm_instr( start_point, end_point,
7fba1cd6 761 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 762 }
cad2e5aa
JH
763 /* Update the count-of-usability, remove useless subpatterns,
764 unshift s. */
2c2d71f5 765
ab3bbdeb 766 DEBUG_EXECUTE_r({
f2ed9b32 767 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
768 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
769 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 770 (s ? "Found" : "Did not find"),
f2ed9b32 771 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
772 ? "anchored" : "floating"),
773 quoted,
774 RE_SV_TAIL(check),
775 (s ? " at offset " : "...\n") );
776 });
2c2d71f5
JH
777
778 if (!s)
779 goto fail_finish;
2c2d71f5 780 /* Finish the diagnostic message */
a3621e74 781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 782
1de06328
YO
783 /* XXX dmq: first branch is for positive lookbehind...
784 Our check string is offset from the beginning of the pattern.
785 So we need to do any stclass tests offset forward from that
786 point. I think. :-(
787 */
788
789
790
791 check_at=s;
792
793
2c2d71f5
JH
794 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
795 Start with the other substr.
796 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 797 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
798 *always* match. Probably should be marked during compile...
799 Probably it is right to do no SCREAM here...
800 */
801
f2ed9b32 802 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
803 : (prog->float_substr && prog->anchored_substr))
804 {
30944b6d 805 /* Take into account the "other" substring. */
2c2d71f5
JH
806 /* XXXX May be hopelessly wrong for UTF... */
807 if (!other_last)
6eb5f6b9 808 other_last = strpos;
f2ed9b32 809 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
810 do_other_anchored:
811 {
890ce7af
AL
812 char * const last = HOP3c(s, -start_shift, strbeg);
813 char *last1, *last2;
be8e71aa 814 char * const saved_s = s;
33b8afdf 815 SV* must;
2c2d71f5 816
2c2d71f5
JH
817 t = s - prog->check_offset_max;
818 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 819 && (!utf8_target
0ce71af7 820 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 821 && t > strpos)))
6f207bd3 822 NOOP;
2c2d71f5
JH
823 else
824 t = strpos;
1aa99e6b 825 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
826 if (t < other_last) /* These positions already checked */
827 t = other_last;
1aa99e6b 828 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
829 if (last < last1)
830 last1 = last;
1de06328
YO
831 /* XXXX It is not documented what units *_offsets are in.
832 We assume bytes, but this is clearly wrong.
833 Meaning this code needs to be carefully reviewed for errors.
834 dmq.
835 */
836
2c2d71f5 837 /* On end-of-str: see comment below. */
f2ed9b32 838 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
839 if (must == &PL_sv_undef) {
840 s = (char*)NULL;
1de06328 841 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
842 }
843 else
844 s = fbm_instr(
845 (unsigned char*)t,
846 HOP3(HOP3(last1, prog->anchored_offset, strend)
847 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
848 must,
7fba1cd6 849 multiline ? FBMrf_MULTILINE : 0
33b8afdf 850 );
ab3bbdeb 851 DEBUG_EXECUTE_r({
f2ed9b32 852 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
853 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
854 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 855 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
856 quoted, RE_SV_TAIL(must));
857 });
858
859
2c2d71f5
JH
860 if (!s) {
861 if (last1 >= last2) {
a3621e74 862 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
863 ", giving up...\n"));
864 goto fail_finish;
865 }
a3621e74 866 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 867 ", trying floating at offset %ld...\n",
be8e71aa 868 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
869 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
870 s = HOP3c(last, 1, strend);
2c2d71f5
JH
871 goto restart;
872 }
873 else {
a3621e74 874 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 875 (long)(s - i_strpos)));
1aa99e6b
IH
876 t = HOP3c(s, -prog->anchored_offset, strbeg);
877 other_last = HOP3c(s, 1, strend);
be8e71aa 878 s = saved_s;
2c2d71f5
JH
879 if (t == strpos)
880 goto try_at_start;
2c2d71f5
JH
881 goto try_at_offset;
882 }
30944b6d 883 }
2c2d71f5
JH
884 }
885 else { /* Take into account the floating substring. */
33b8afdf 886 char *last, *last1;
be8e71aa 887 char * const saved_s = s;
33b8afdf
JH
888 SV* must;
889
890 t = HOP3c(s, -start_shift, strbeg);
891 last1 = last =
892 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
893 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
894 last = HOP3c(t, prog->float_max_offset, strend);
895 s = HOP3c(t, prog->float_min_offset, strend);
896 if (s < other_last)
897 s = other_last;
2c2d71f5 898 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 899 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
900 /* fbm_instr() takes into account exact value of end-of-str
901 if the check is SvTAIL(ed). Since false positives are OK,
902 and end-of-str is not later than strend we are OK. */
903 if (must == &PL_sv_undef) {
904 s = (char*)NULL;
1de06328 905 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
906 }
907 else
2c2d71f5 908 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
909 (unsigned char*)last + SvCUR(must)
910 - (SvTAIL(must)!=0),
7fba1cd6 911 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 912 DEBUG_EXECUTE_r({
f2ed9b32 913 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
914 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
915 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 916 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
917 quoted, RE_SV_TAIL(must));
918 });
33b8afdf
JH
919 if (!s) {
920 if (last1 == last) {
a3621e74 921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
922 ", giving up...\n"));
923 goto fail_finish;
2c2d71f5 924 }
a3621e74 925 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 926 ", trying anchored starting at offset %ld...\n",
be8e71aa 927 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
928 other_last = last;
929 s = HOP3c(t, 1, strend);
930 goto restart;
931 }
932 else {
a3621e74 933 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
934 (long)(s - i_strpos)));
935 other_last = s; /* Fix this later. --Hugo */
be8e71aa 936 s = saved_s;
33b8afdf
JH
937 if (t == strpos)
938 goto try_at_start;
939 goto try_at_offset;
940 }
2c2d71f5 941 }
cad2e5aa 942 }
2c2d71f5 943
1de06328 944
9ef43ace 945 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 946
6bda09f9 947 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
948 PerlIO_printf(Perl_debug_log,
949 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
950 (IV)prog->check_offset_min,
951 (IV)prog->check_offset_max,
952 (IV)(s-strpos),
953 (IV)(t-strpos),
954 (IV)(t-s),
955 (IV)(strend-strpos)
956 )
957 );
958
2c2d71f5 959 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 960 && (!utf8_target
9ef43ace 961 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
962 && t > strpos)))
963 {
2c2d71f5
JH
964 /* Fixed substring is found far enough so that the match
965 cannot start at strpos. */
966 try_at_offset:
cad2e5aa 967 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
968 /* Eventually fbm_*() should handle this, but often
969 anchored_offset is not 0, so this check will not be wasted. */
970 /* XXXX In the code below we prefer to look for "^" even in
971 presence of anchored substrings. And we search even
972 beyond the found float position. These pessimizations
973 are historical artefacts only. */
974 find_anchor:
2c2d71f5 975 while (t < strend - prog->minlen) {
cad2e5aa 976 if (*t == '\n') {
4ee3650e 977 if (t < check_at - prog->check_offset_min) {
f2ed9b32 978 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
979 /* Since we moved from the found position,
980 we definitely contradict the found anchored
30944b6d
IZ
981 substr. Due to the above check we do not
982 contradict "check" substr.
983 Thus we can arrive here only if check substr
984 is float. Redo checking for "other"=="fixed".
985 */
9041c2e3 986 strpos = t + 1;
a3621e74 987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 988 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
989 goto do_other_anchored;
990 }
4ee3650e
GS
991 /* We don't contradict the found floating substring. */
992 /* XXXX Why not check for STCLASS? */
cad2e5aa 993 s = t + 1;
a3621e74 994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 995 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
996 goto set_useful;
997 }
4ee3650e
GS
998 /* Position contradicts check-string */
999 /* XXXX probably better to look for check-string
1000 than for "\n", so one should lower the limit for t? */
a3621e74 1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1002 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1003 other_last = strpos = s = t + 1;
cad2e5aa
JH
1004 goto restart;
1005 }
1006 t++;
1007 }
a3621e74 1008 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1009 PL_colors[0], PL_colors[1]));
2c2d71f5 1010 goto fail_finish;
cad2e5aa 1011 }
f5952150 1012 else {
a3621e74 1013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1014 PL_colors[0], PL_colors[1]));
f5952150 1015 }
cad2e5aa
JH
1016 s = t;
1017 set_useful:
f2ed9b32 1018 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1019 }
1020 else {
f5952150 1021 /* The found string does not prohibit matching at strpos,
2c2d71f5 1022 - no optimization of calling REx engine can be performed,
f5952150
GS
1023 unless it was an MBOL and we are not after MBOL,
1024 or a future STCLASS check will fail this. */
2c2d71f5
JH
1025 try_at_start:
1026 /* Even in this situation we may use MBOL flag if strpos is offset
1027 wrt the start of the string. */
05b4157f 1028 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1029 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1030 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1031 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1032 {
cad2e5aa
JH
1033 t = strpos;
1034 goto find_anchor;
1035 }
a3621e74 1036 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1037 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1038 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1039 );
2c2d71f5 1040 success_at_start:
bbe252da 1041 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1042 && (utf8_target ? (
33b8afdf
JH
1043 prog->check_utf8 /* Could be deleted already */
1044 && --BmUSEFUL(prog->check_utf8) < 0
1045 && (prog->check_utf8 == prog->float_utf8)
1046 ) : (
1047 prog->check_substr /* Could be deleted already */
1048 && --BmUSEFUL(prog->check_substr) < 0
1049 && (prog->check_substr == prog->float_substr)
1050 )))
66e933ab 1051 {
cad2e5aa 1052 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1053 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1054 /* XXX Does the destruction order has to change with utf8_target? */
1055 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1056 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1057 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1058 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1059 check = NULL; /* abort */
cad2e5aa 1060 s = strpos;
486ec47a 1061 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1062 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1063 if (prog->intflags & PREGf_IMPLICIT)
1064 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1065 /* XXXX This is a remnant of the old implementation. It
1066 looks wasteful, since now INTUIT can use many
6eb5f6b9 1067 other heuristics. */
bbe252da 1068 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1069 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1070 }
1071 else
1072 s = strpos;
1073 }
1074
6eb5f6b9
JH
1075 /* Last resort... */
1076 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1077 /* trie stclasses are too expensive to use here, we are better off to
1078 leave it to regmatch itself */
f8fc2ecf 1079 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1080 /* minlen == 0 is possible if regstclass is \b or \B,
1081 and the fixed substr is ''$.
1082 Since minlen is already taken into account, s+1 is before strend;
1083 accidentally, minlen >= 1 guaranties no false positives at s + 1
1084 even for \b or \B. But (minlen? 1 : 0) below assumes that
1085 regstclass does not come from lookahead... */
1086 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1087 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1088 const U8* const str = (U8*)STRING(progi->regstclass);
1089 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1090 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1091 : 1);
1de06328
YO
1092 char * endpos;
1093 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1094 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1095 else if (prog->float_substr || prog->float_utf8)
1096 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1097 else
1098 endpos= strend;
1099
d8080198
YO
1100 if (checked_upto < s)
1101 checked_upto = s;
1102 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1103 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1104
6eb5f6b9 1105 t = s;
d8080198
YO
1106 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1107 if (s) {
1108 checked_upto = s;
1109 } else {
6eb5f6b9 1110#ifdef DEBUGGING
cbbf8932 1111 const char *what = NULL;
6eb5f6b9
JH
1112#endif
1113 if (endpos == strend) {
a3621e74 1114 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1115 "Could not match STCLASS...\n") );
1116 goto fail;
1117 }
a3621e74 1118 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1119 "This position contradicts STCLASS...\n") );
bbe252da 1120 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1121 goto fail;
d8080198
YO
1122 checked_upto = HOPBACKc(endpos, start_shift);
1123 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1124 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1125 /* Contradict one of substrings */
33b8afdf 1126 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1127 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1128 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1129 hop_and_restart:
1aa99e6b 1130 s = HOP3c(t, 1, strend);
66e933ab
GS
1131 if (s + start_shift + end_shift > strend) {
1132 /* XXXX Should be taken into account earlier? */
a3621e74 1133 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1134 "Could not match STCLASS...\n") );
1135 goto fail;
1136 }
5e39e1e5
HS
1137 if (!check)
1138 goto giveup;
a3621e74 1139 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1140 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1141 what, (long)(s + start_shift - i_strpos)) );
1142 goto restart;
1143 }
66e933ab 1144 /* Have both, check_string is floating */
6eb5f6b9
JH
1145 if (t + start_shift >= check_at) /* Contradicts floating=check */
1146 goto retry_floating_check;
1147 /* Recheck anchored substring, but not floating... */
9041c2e3 1148 s = check_at;
5e39e1e5
HS
1149 if (!check)
1150 goto giveup;
a3621e74 1151 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1152 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1153 (long)(other_last - i_strpos)) );
1154 goto do_other_anchored;
1155 }
60e71179
GS
1156 /* Another way we could have checked stclass at the
1157 current position only: */
1158 if (ml_anch) {
1159 s = t = t + 1;
5e39e1e5
HS
1160 if (!check)
1161 goto giveup;
a3621e74 1162 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1163 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1164 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1165 goto try_at_offset;
66e933ab 1166 }
f2ed9b32 1167 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1168 goto fail;
486ec47a 1169 /* Check is floating substring. */
6eb5f6b9
JH
1170 retry_floating_check:
1171 t = check_at - start_shift;
a3621e74 1172 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1173 goto hop_and_restart;
1174 }
b7953727 1175 if (t != s) {
a3621e74 1176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1177 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1178 (long)(t - i_strpos), (long)(s - i_strpos))
1179 );
1180 }
1181 else {
a3621e74 1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1183 "Does not contradict STCLASS...\n");
1184 );
1185 }
6eb5f6b9 1186 }
5e39e1e5 1187 giveup:
a3621e74 1188 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1189 PL_colors[4], (check ? "Guessed" : "Giving up"),
1190 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1191 return s;
2c2d71f5
JH
1192
1193 fail_finish: /* Substring not found */
33b8afdf 1194 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1195 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1196 fail:
a3621e74 1197 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1198 PL_colors[4], PL_colors[5]));
bd61b366 1199 return NULL;
cad2e5aa 1200}
9661b544 1201
a0a388a1
YO
1202#define DECL_TRIE_TYPE(scan) \
1203 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1204 trie_type = ((scan->flags == EXACT) \
1205 ? (utf8_target ? trie_utf8 : trie_plain) \
1206 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1207
1208#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1209uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1210 STRLEN skiplen; \
1211 switch (trie_type) { \
1212 case trie_utf8_fold: \
1213 if ( foldlen>0 ) { \
1214 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1215 foldlen -= len; \
1216 uscan += len; \
1217 len=0; \
1218 } else { \
1219 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1220 len = UTF8SKIP(uc); \
1221 skiplen = UNISKIP( uvc ); \
1222 foldlen -= skiplen; \
1223 uscan = foldbuf + skiplen; \
1224 } \
1225 break; \
1226 case trie_latin_utf8_fold: \
1227 if ( foldlen>0 ) { \
1228 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1229 foldlen -= len; \
1230 uscan += len; \
1231 len=0; \
1232 } else { \
1233 len = 1; \
1234 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1235 skiplen = UNISKIP( uvc ); \
1236 foldlen -= skiplen; \
1237 uscan = foldbuf + skiplen; \
1238 } \
1239 break; \
1240 case trie_utf8: \
1241 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1242 break; \
1243 case trie_plain: \
1244 uvc = (UV)*uc; \
1245 len = 1; \
1246 } \
1247 if (uvc < 256) { \
1248 charid = trie->charmap[ uvc ]; \
1249 } \
1250 else { \
1251 charid = 0; \
1252 if (widecharmap) { \
1253 SV** const svpp = hv_fetch(widecharmap, \
1254 (char*)&uvc, sizeof(UV), 0); \
1255 if (svpp) \
1256 charid = (U16)SvIV(*svpp); \
1257 } \
1258 } \
4cadc6a9
YO
1259} STMT_END
1260
4cadc6a9
YO
1261#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1262STMT_START { \
1263 while (s <= e) { \
1264 if ( (CoNd) \
fac1af77 1265 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1266 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1267 goto got_it; \
1268 s++; \
1269 } \
1270} STMT_END
1271
1272#define REXEC_FBC_UTF8_SCAN(CoDe) \
1273STMT_START { \
7016d6eb 1274 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
4cadc6a9
YO
1275 CoDe \
1276 s += uskip; \
1277 } \
1278} STMT_END
1279
1280#define REXEC_FBC_SCAN(CoDe) \
1281STMT_START { \
1282 while (s < strend) { \
1283 CoDe \
1284 s++; \
1285 } \
1286} STMT_END
1287
1288#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1289REXEC_FBC_UTF8_SCAN( \
1290 if (CoNd) { \
24b23f37 1291 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1292 goto got_it; \
1293 else \
1294 tmp = doevery; \
1295 } \
1296 else \
1297 tmp = 1; \
1298)
1299
1300#define REXEC_FBC_CLASS_SCAN(CoNd) \
1301REXEC_FBC_SCAN( \
1302 if (CoNd) { \
24b23f37 1303 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1304 goto got_it; \
1305 else \
1306 tmp = doevery; \
1307 } \
1308 else \
1309 tmp = 1; \
1310)
1311
1312#define REXEC_FBC_TRYIT \
24b23f37 1313if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1314 goto got_it
1315
e1d1eefb 1316#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1317 if (utf8_target) { \
e1d1eefb
YO
1318 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1319 } \
1320 else { \
1321 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1322 }
e1d1eefb 1323
4cadc6a9 1324#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1325 if (utf8_target) { \
4cadc6a9
YO
1326 UtFpReLoAd; \
1327 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1328 } \
1329 else { \
1330 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1331 }
4cadc6a9
YO
1332
1333#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1334 PL_reg_flags |= RF_tainted; \
f2ed9b32 1335 if (utf8_target) { \
4cadc6a9
YO
1336 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1337 } \
1338 else { \
1339 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1340 }
4cadc6a9 1341
786e8c11
YO
1342#define DUMP_EXEC_POS(li,s,doutf8) \
1343 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1344
cfaf538b
KW
1345
1346#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1347 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1348 tmp = TEST_NON_UTF8(tmp); \
1349 REXEC_FBC_UTF8_SCAN( \
1350 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1351 tmp = !tmp; \
1352 IF_SUCCESS; \
1353 } \
1354 else { \
1355 IF_FAIL; \
1356 } \
1357 ); \
1358
1359#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1360 if (s == PL_bostr) { \
1361 tmp = '\n'; \
1362 } \
1363 else { \
1364 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1365 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1366 } \
1367 tmp = TeSt1_UtF8; \
1368 LOAD_UTF8_CHARCLASS_ALNUM(); \
1369 REXEC_FBC_UTF8_SCAN( \
1370 if (tmp == ! (TeSt2_UtF8)) { \
1371 tmp = !tmp; \
1372 IF_SUCCESS; \
1373 } \
1374 else { \
1375 IF_FAIL; \
1376 } \
1377 ); \
1378
63ac0dad
KW
1379/* The only difference between the BOUND and NBOUND cases is that
1380 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1381 * NBOUND. This is accomplished by passing it in either the if or else clause,
1382 * with the other one being empty */
1383#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1384 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1385
1386#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1387 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1388
1389#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1390 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1391
1392#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1393 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1394
63ac0dad
KW
1395
1396/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1397 * be passed in completely with the variable name being tested, which isn't
1398 * such a clean interface, but this is easier to read than it was before. We
1399 * are looking for the boundary (or non-boundary between a word and non-word
1400 * character. The utf8 and non-utf8 cases have the same logic, but the details
1401 * must be different. Find the "wordness" of the character just prior to this
1402 * one, and compare it with the wordness of this one. If they differ, we have
1403 * a boundary. At the beginning of the string, pretend that the previous
1404 * character was a new-line */
cfaf538b 1405#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1406 if (utf8_target) { \
cfaf538b 1407 UTF8_CODE \
63ac0dad
KW
1408 } \
1409 else { /* Not utf8 */ \
1410 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1411 tmp = TEST_NON_UTF8(tmp); \
1412 REXEC_FBC_SCAN( \
1413 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1414 tmp = !tmp; \
1415 IF_SUCCESS; \
1416 } \
1417 else { \
1418 IF_FAIL; \
1419 } \
1420 ); \
1421 } \
1422 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1423 goto got_it;
1424
786e8c11
YO
1425/* We know what class REx starts with. Try to find this position... */
1426/* if reginfo is NULL, its a dryrun */
1427/* annoyingly all the vars in this routine have different names from their counterparts
1428 in regmatch. /grrr */
1429
3c3eec57 1430STATIC char *
07be1b83 1431S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1432 const char *strend, regmatch_info *reginfo)
a687059c 1433{
27da23d5 1434 dVAR;
bbe252da 1435 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1436 char *pat_string; /* The pattern's exactish string */
1437 char *pat_end; /* ptr to end char of pat_string */
1438 re_fold_t folder; /* Function for computing non-utf8 folds */
1439 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1440 STRLEN ln;
5dab1207 1441 STRLEN lnc;
eb578fdb 1442 STRLEN uskip;
fac1af77
KW
1443 U8 c1;
1444 U8 c2;
6eb5f6b9 1445 char *e;
eb578fdb
KW
1446 I32 tmp = 1; /* Scratch variable? */
1447 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1448 UV utf8_fold_flags = 0;
f8fc2ecf 1449 RXi_GET_DECL(prog,progi);
7918f24d
NC
1450
1451 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1452
6eb5f6b9
JH
1453 /* We know what class it must start with. */
1454 switch (OP(c)) {
6eb5f6b9 1455 case ANYOF:
e0193e47 1456 if (utf8_target) {
b1e3e569
KW
1457 STRLEN inclasslen = strend - s;
1458 REXEC_FBC_UTF8_CLASS_SCAN(
1459 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1460 }
1461 else {
6ef69d56 1462 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1463 }
6eb5f6b9 1464 break;
f33976b4 1465 case CANY:
4cadc6a9 1466 REXEC_FBC_SCAN(
24b23f37 1467 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1468 goto got_it;
1469 else
1470 tmp = doevery;
4cadc6a9 1471 );
f33976b4 1472 break;
2f7f8cb1
KW
1473
1474 case EXACTFA:
1475 if (UTF_PATTERN || utf8_target) {
1476 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1477 goto do_exactf_utf8;
1478 }
1479 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1480 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1481 goto do_exactf_non_utf8; /* isn't dealt with by these */
1482
6eb5f6b9 1483 case EXACTF:
62bf7766 1484 if (utf8_target) {
77a6d856
KW
1485
1486 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1487 utf8_fold_flags = 0;
fac1af77
KW
1488 goto do_exactf_utf8;
1489 }
1490 fold_array = PL_fold;
1491 folder = foldEQ;
1492 goto do_exactf_non_utf8;
1493
1494 case EXACTFL:
1495 if (UTF_PATTERN || utf8_target) {
17580e7a 1496 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1497 goto do_exactf_utf8;
1498 }
1499 fold_array = PL_fold_locale;
1500 folder = foldEQ_locale;
16d951b7
KW
1501 goto do_exactf_non_utf8;
1502
3c760661
KW
1503 case EXACTFU_SS:
1504 if (UTF_PATTERN) {
1505 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1506 }
1507 goto do_exactf_utf8;
1508
fab2782b 1509 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1510 case EXACTFU:
1511 if (UTF_PATTERN || utf8_target) {
77a6d856 1512 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1513 goto do_exactf_utf8;
1514 }
1515
1516 /* Any 'ss' in the pattern should have been replaced by regcomp,
1517 * so we don't have to worry here about this single special case
1518 * in the Latin1 range */
1519 fold_array = PL_fold_latin1;
1520 folder = foldEQ_latin1;
fac1af77
KW
1521
1522 /* FALL THROUGH */
1523
62bf7766
KW
1524 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1525 are no glitches with fold-length differences
1526 between the target string and pattern */
fac1af77
KW
1527
1528 /* The idea in the non-utf8 EXACTF* cases is to first find the
1529 * first character of the EXACTF* node and then, if necessary,
1530 * case-insensitively compare the full text of the node. c1 is the
1531 * first character. c2 is its fold. This logic will not work for
1532 * Unicode semantics and the german sharp ss, which hence should
1533 * not be compiled into a node that gets here. */
1534 pat_string = STRING(c);
1535 ln = STR_LEN(c); /* length to match in octets/bytes */
1536
8a90a8fe
KW
1537 /* We know that we have to match at least 'ln' bytes (which is the
1538 * same as characters, since not utf8). If we have to match 3
1539 * characters, and there are only 2 availabe, we know without
1540 * trying that it will fail; so don't start a match past the
1541 * required minimum number from the far end */
fac1af77
KW
1542 e = HOP3c(strend, -((I32)ln), s);
1543
1544 if (!reginfo && e < s) {
1545 e = s; /* Due to minlen logic of intuit() */
1546 }
1547
1548 c1 = *pat_string;
1549 c2 = fold_array[c1];
1550 if (c1 == c2) { /* If char and fold are the same */
1551 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1552 }
1553 else {
1554 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1555 }
1556 break;
1557
1558 do_exactf_utf8:
e067297c
KW
1559 {
1560 unsigned expansion;
1561
fac1af77
KW
1562
1563 /* If one of the operands is in utf8, we can't use the simpler
1564 * folding above, due to the fact that many different characters
1565 * can have the same fold, or portion of a fold, or different-
1566 * length fold */
1567 pat_string = STRING(c);
1568 ln = STR_LEN(c); /* length to match in octets/bytes */
1569 pat_end = pat_string + ln;
1570 lnc = (UTF_PATTERN) /* length to match in characters */
1571 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1572 : ln;
1573
e067297c
KW
1574 /* We have 'lnc' characters to match in the pattern, but because of
1575 * multi-character folding, each character in the target can match
1576 * up to 3 characters (Unicode guarantees it will never exceed
1577 * this) if it is utf8-encoded; and up to 2 if not (based on the
1578 * fact that the Latin 1 folds are already determined, and the
1579 * only multi-char fold in that range is the sharp-s folding to
1580 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1581 * string character. Adjust lnc accordingly, rounding up, so that
1582 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1583 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1584 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1585
1586 /* As in the non-UTF8 case, if we have to match 3 characters, and
1587 * only 2 are left, it's guaranteed to fail, so don't start a
1588 * match that would require us to go beyond the end of the string
1589 */
1590 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1591
1592 if (!reginfo && e < s) {
1593 e = s; /* Due to minlen logic of intuit() */
1594 }
1595
b33105db
KW
1596 /* XXX Note that we could recalculate e to stop the loop earlier,
1597 * as the worst case expansion above will rarely be met, and as we
1598 * go along we would usually find that e moves further to the left.
1599 * This would happen only after we reached the point in the loop
1600 * where if there were no expansion we should fail. Unclear if
1601 * worth the expense */
e067297c 1602
fac1af77
KW
1603 while (s <= e) {
1604 char *my_strend= (char *)strend;
1605 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1606 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1607 && (!reginfo || regtry(reginfo, &s)) )
1608 {
1609 goto got_it;
1610 }
bbdd8bad 1611 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1612 }
1613 break;
e067297c 1614 }
bbce6d69 1615 case BOUNDL:
3280af22 1616 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1617 FBC_BOUND(isALNUM_LC,
1618 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1619 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1620 break;
bbce6d69 1621 case NBOUNDL:
3280af22 1622 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1623 FBC_NBOUND(isALNUM_LC,
1624 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isALNUM_LC_utf8((U8*)s));
1626 break;
1627 case BOUND:
1628 FBC_BOUND(isWORDCHAR,
1629 isALNUM_uni(tmp),
1630 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1631 break;
cfaf538b
KW
1632 case BOUNDA:
1633 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1634 isWORDCHAR_A(tmp),
1635 isWORDCHAR_A((U8*)s));
1636 break;
a0d0e21e 1637 case NBOUND:
63ac0dad
KW
1638 FBC_NBOUND(isWORDCHAR,
1639 isALNUM_uni(tmp),
1640 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1641 break;
cfaf538b
KW
1642 case NBOUNDA:
1643 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1644 isWORDCHAR_A(tmp),
1645 isWORDCHAR_A((U8*)s));
1646 break;
63ac0dad
KW
1647 case BOUNDU:
1648 FBC_BOUND(isWORDCHAR_L1,
1649 isALNUM_uni(tmp),
1650 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1651 break;
1652 case NBOUNDU:
1653 FBC_NBOUND(isWORDCHAR_L1,
1654 isALNUM_uni(tmp),
1655 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1656 break;
bbce6d69 1657 case ALNUML:
4cadc6a9
YO
1658 REXEC_FBC_CSCAN_TAINT(
1659 isALNUM_LC_utf8((U8*)s),
1660 isALNUM_LC(*s)
1661 );
6895a8aa 1662 break;
980866de
KW
1663 case ALNUMU:
1664 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1665 LOAD_UTF8_CHARCLASS_ALNUM(),
1666 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1667 isWORDCHAR_L1((U8) *s)
1668 );
6895a8aa 1669 break;
980866de
KW
1670 case ALNUM:
1671 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1672 LOAD_UTF8_CHARCLASS_ALNUM(),
1673 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1674 isWORDCHAR((U8) *s)
1675 );
6895a8aa 1676 break;
cfaf538b 1677 case ALNUMA:
8e9da4d4
KW
1678 /* Don't need to worry about utf8, as it can match only a single
1679 * byte invariant character */
cfaf538b 1680 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1681 break;
980866de
KW
1682 case NALNUMU:
1683 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1684 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1685 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1686 ! isWORDCHAR_L1((U8) *s)
1687 );
6895a8aa 1688 break;
a0d0e21e 1689 case NALNUM:
4cadc6a9 1690 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1691 LOAD_UTF8_CHARCLASS_ALNUM(),
1692 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1693 ! isALNUM(*s)
4cadc6a9 1694 );
6895a8aa 1695 break;
cfaf538b 1696 case NALNUMA:
8e9da4d4
KW
1697 REXEC_FBC_CSCAN(
1698 !isWORDCHAR_A(*s),
1699 !isWORDCHAR_A(*s)
1700 );
1701 break;
bbce6d69 1702 case NALNUML:
4cadc6a9
YO
1703 REXEC_FBC_CSCAN_TAINT(
1704 !isALNUM_LC_utf8((U8*)s),
1705 !isALNUM_LC(*s)
1706 );
6895a8aa 1707 break;
980866de
KW
1708 case SPACEU:
1709 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1710 LOAD_UTF8_CHARCLASS_SPACE(),
1711 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1712 isSPACE_L1((U8) *s)
1713 );
6895a8aa 1714 break;
a0d0e21e 1715 case SPACE:
4cadc6a9 1716 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1717 LOAD_UTF8_CHARCLASS_SPACE(),
1718 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1719 isSPACE((U8) *s)
4cadc6a9 1720 );
6895a8aa 1721 break;
cfaf538b 1722 case SPACEA:
8e9da4d4
KW
1723 /* Don't need to worry about utf8, as it can match only a single
1724 * byte invariant character */
cfaf538b 1725 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1726 break;
bbce6d69 1727 case SPACEL:
4cadc6a9 1728 REXEC_FBC_CSCAN_TAINT(
6bbba904 1729 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1730 isSPACE_LC(*s)
1731 );
6895a8aa 1732 break;
980866de
KW
1733 case NSPACEU:
1734 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1735 LOAD_UTF8_CHARCLASS_SPACE(),
1736 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1737 ! isSPACE_L1((U8) *s)
1738 );
6895a8aa 1739 break;
a0d0e21e 1740 case NSPACE:
4cadc6a9 1741 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1742 LOAD_UTF8_CHARCLASS_SPACE(),
1743 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1744 ! isSPACE((U8) *s)
4cadc6a9 1745 );
6895a8aa 1746 break;
cfaf538b 1747 case NSPACEA:
8e9da4d4
KW
1748 REXEC_FBC_CSCAN(
1749 !isSPACE_A(*s),
1750 !isSPACE_A(*s)
1751 );
1752 break;
bbce6d69 1753 case NSPACEL:
4cadc6a9 1754 REXEC_FBC_CSCAN_TAINT(
6bbba904 1755 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1756 !isSPACE_LC(*s)
1757 );
6895a8aa 1758 break;
a0d0e21e 1759 case DIGIT:
4cadc6a9 1760 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1761 LOAD_UTF8_CHARCLASS_DIGIT(),
1762 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1763 isDIGIT(*s)
1764 );
6895a8aa 1765 break;
cfaf538b 1766 case DIGITA:
8e9da4d4
KW
1767 /* Don't need to worry about utf8, as it can match only a single
1768 * byte invariant character */
cfaf538b 1769 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1770 break;
b8c5462f 1771 case DIGITL:
4cadc6a9
YO
1772 REXEC_FBC_CSCAN_TAINT(
1773 isDIGIT_LC_utf8((U8*)s),
1774 isDIGIT_LC(*s)
1775 );
6895a8aa 1776 break;
a0d0e21e 1777 case NDIGIT:
4cadc6a9 1778 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1779 LOAD_UTF8_CHARCLASS_DIGIT(),
1780 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1781 !isDIGIT(*s)
1782 );
6895a8aa 1783 break;
cfaf538b 1784 case NDIGITA:
8e9da4d4
KW
1785 REXEC_FBC_CSCAN(
1786 !isDIGIT_A(*s),
1787 !isDIGIT_A(*s)
1788 );
1789 break;
b8c5462f 1790 case NDIGITL:
4cadc6a9
YO
1791 REXEC_FBC_CSCAN_TAINT(
1792 !isDIGIT_LC_utf8((U8*)s),
1793 !isDIGIT_LC(*s)
1794 );
6895a8aa 1795 break;
e1d1eefb
YO
1796 case LNBREAK:
1797 REXEC_FBC_CSCAN(
7016d6eb
DM
1798 is_LNBREAK_utf8_safe(s, strend),
1799 is_LNBREAK_latin1_safe(s, strend)
e1d1eefb 1800 );
6895a8aa 1801 break;
e1d1eefb
YO
1802 case VERTWS:
1803 REXEC_FBC_CSCAN(
7016d6eb
DM
1804 is_VERTWS_utf8_safe(s, strend),
1805 is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1806 );
6895a8aa 1807 break;
e1d1eefb
YO
1808 case NVERTWS:
1809 REXEC_FBC_CSCAN(
7016d6eb
DM
1810 !is_VERTWS_utf8_safe(s, strend),
1811 !is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1812 );
6895a8aa 1813 break;
e1d1eefb
YO
1814 case HORIZWS:
1815 REXEC_FBC_CSCAN(
7016d6eb
DM
1816 is_HORIZWS_utf8_safe(s, strend),
1817 is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1818 );
6895a8aa 1819 break;
e1d1eefb
YO
1820 case NHORIZWS:
1821 REXEC_FBC_CSCAN(
7016d6eb
DM
1822 !is_HORIZWS_utf8_safe(s, strend),
1823 !is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1824 );
6895a8aa 1825 break;
0658cdde
KW
1826 case POSIXA:
1827 /* Don't need to worry about utf8, as it can match only a single
1828 * byte invariant character. The flag in this node type is the
1829 * class number to pass to _generic_isCC() to build a mask for
1830 * searching in PL_charclass[] */
1831 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1832 break;
1833 case NPOSIXA:
1834 REXEC_FBC_CSCAN(
1835 !_generic_isCC_A(*s, FLAGS(c)),
1836 !_generic_isCC_A(*s, FLAGS(c))
1837 );
1838 break;
1839
1de06328
YO
1840 case AHOCORASICKC:
1841 case AHOCORASICK:
07be1b83 1842 {
a0a388a1 1843 DECL_TRIE_TYPE(c);
07be1b83
YO
1844 /* what trie are we using right now */
1845 reg_ac_data *aho
f8fc2ecf 1846 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1847 reg_trie_data *trie
1848 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1849 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1850
1851 const char *last_start = strend - trie->minlen;
6148ee25 1852#ifdef DEBUGGING
07be1b83 1853 const char *real_start = s;
6148ee25 1854#endif
07be1b83 1855 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1856 SV *sv_points;
1857 U8 **points; /* map of where we were in the input string
786e8c11 1858 when reading a given char. For ASCII this
be8e71aa 1859 is unnecessary overhead as the relationship
38a44b82
NC
1860 is always 1:1, but for Unicode, especially
1861 case folded Unicode this is not true. */
f9e705e8 1862 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1863 U8 *bitmap=NULL;
1864
07be1b83
YO
1865
1866 GET_RE_DEBUG_FLAGS_DECL;
1867
be8e71aa
YO
1868 /* We can't just allocate points here. We need to wrap it in
1869 * an SV so it gets freed properly if there is a croak while
1870 * running the match */
1871 ENTER;
1872 SAVETMPS;
1873 sv_points=newSV(maxlen * sizeof(U8 *));
1874 SvCUR_set(sv_points,
1875 maxlen * sizeof(U8 *));
1876 SvPOK_on(sv_points);
1877 sv_2mortal(sv_points);
1878 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1879 if ( trie_type != trie_utf8_fold
1880 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1881 {
786e8c11
YO
1882 if (trie->bitmap)
1883 bitmap=(U8*)trie->bitmap;
1884 else
1885 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1886 }
786e8c11
YO
1887 /* this is the Aho-Corasick algorithm modified a touch
1888 to include special handling for long "unknown char"
1889 sequences. The basic idea being that we use AC as long
1890 as we are dealing with a possible matching char, when
1891 we encounter an unknown char (and we have not encountered
1892 an accepting state) we scan forward until we find a legal
1893 starting char.
1894 AC matching is basically that of trie matching, except
1895 that when we encounter a failing transition, we fall back
1896 to the current states "fail state", and try the current char
1897 again, a process we repeat until we reach the root state,
1898 state 1, or a legal transition. If we fail on the root state
1899 then we can either terminate if we have reached an accepting
1900 state previously, or restart the entire process from the beginning
1901 if we have not.
1902
1903 */
07be1b83
YO
1904 while (s <= last_start) {
1905 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1906 U8 *uc = (U8*)s;
1907 U16 charid = 0;
1908 U32 base = 1;
1909 U32 state = 1;
1910 UV uvc = 0;
1911 STRLEN len = 0;
1912 STRLEN foldlen = 0;
1913 U8 *uscan = (U8*)NULL;
1914 U8 *leftmost = NULL;
786e8c11
YO
1915#ifdef DEBUGGING
1916 U32 accepted_word= 0;
1917#endif
07be1b83
YO
1918 U32 pointpos = 0;
1919
1920 while ( state && uc <= (U8*)strend ) {
1921 int failed=0;
786e8c11
YO
1922 U32 word = aho->states[ state ].wordnum;
1923
1de06328
YO
1924 if( state==1 ) {
1925 if ( bitmap ) {
1926 DEBUG_TRIE_EXECUTE_r(
1927 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1928 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1929 (char *)uc, utf8_target );
1de06328
YO
1930 PerlIO_printf( Perl_debug_log,
1931 " Scanning for legal start char...\n");
1932 }
d085b490
YO
1933 );
1934 if (utf8_target) {
1935 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1936 uc += UTF8SKIP(uc);
1937 }
1938 } else {
1939 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1940 uc++;
1941 }
1942 }
1de06328 1943 s= (char *)uc;
786e8c11 1944 }
786e8c11
YO
1945 if (uc >(U8*)last_start) break;
1946 }
1947
1948 if ( word ) {
2e64971a 1949 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1950 if (!leftmost || lpos < leftmost) {
1951 DEBUG_r(accepted_word=word);
07be1b83 1952 leftmost= lpos;
786e8c11 1953 }
07be1b83 1954 if (base==0) break;
786e8c11 1955
07be1b83
YO
1956 }
1957 points[pointpos++ % maxlen]= uc;
7016d6eb
DM
1958 if (foldlen || uc < (U8*)strend) {
1959 REXEC_TRIE_READ_CHAR(trie_type, trie,
1960 widecharmap, uc,
55eed653
NC
1961 uscan, len, uvc, charid, foldlen,
1962 foldbuf, uniflags);
7016d6eb
DM
1963 DEBUG_TRIE_EXECUTE_r({
1964 dump_exec_pos( (char *)uc, c, strend,
1965 real_start, s, utf8_target);
1966 PerlIO_printf(Perl_debug_log,
1967 " Charid:%3u CP:%4"UVxf" ",
1968 charid, uvc);
1969 });
1970 }
1971 else {
1972 len = 0;
1973 charid = 0;
1974 }
1975
07be1b83
YO
1976
1977 do {
6148ee25 1978#ifdef DEBUGGING
786e8c11 1979 word = aho->states[ state ].wordnum;
6148ee25 1980#endif
07be1b83
YO
1981 base = aho->states[ state ].trans.base;
1982
786e8c11
YO
1983 DEBUG_TRIE_EXECUTE_r({
1984 if (failed)
1985 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1986 s, utf8_target );
07be1b83 1987 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1988 "%sState: %4"UVxf", word=%"UVxf,
1989 failed ? " Fail transition to " : "",
1990 (UV)state, (UV)word);
1991 });
07be1b83
YO
1992 if ( base ) {
1993 U32 tmp;
6dd2be57 1994 I32 offset;
07be1b83 1995 if (charid &&
6dd2be57
DM
1996 ( ((offset = base + charid
1997 - 1 - trie->uniquecharcount)) >= 0)
1998 && ((U32)offset < trie->lasttrans)
1999 && trie->trans[offset].check == state
2000 && (tmp=trie->trans[offset].next))
07be1b83 2001 {
786e8c11
YO
2002 DEBUG_TRIE_EXECUTE_r(
2003 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
2004 state = tmp;
2005 break;
2006 }
2007 else {
786e8c11
YO
2008 DEBUG_TRIE_EXECUTE_r(
2009 PerlIO_printf( Perl_debug_log," - fail\n"));
2010 failed = 1;
2011 state = aho->fail[state];
07be1b83
YO
2012 }
2013 }
2014 else {
2015 /* we must be accepting here */
786e8c11
YO
2016 DEBUG_TRIE_EXECUTE_r(
2017 PerlIO_printf( Perl_debug_log," - accepting\n"));
2018 failed = 1;
07be1b83
YO
2019 break;
2020 }
2021 } while(state);
786e8c11 2022 uc += len;
07be1b83
YO
2023 if (failed) {
2024 if (leftmost)
2025 break;
786e8c11 2026 if (!state) state = 1;
07be1b83
YO
2027 }
2028 }
2029 if ( aho->states[ state ].wordnum ) {
2e64971a 2030 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2031 if (!leftmost || lpos < leftmost) {
2032 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2033 leftmost = lpos;
786e8c11 2034 }
07be1b83 2035 }
07be1b83
YO
2036 if (leftmost) {
2037 s = (char*)leftmost;
786e8c11
YO
2038 DEBUG_TRIE_EXECUTE_r({
2039 PerlIO_printf(
70685ca0
JH
2040 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2041 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2042 );
2043 });
24b23f37 2044 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2045 FREETMPS;
2046 LEAVE;
07be1b83 2047 goto got_it;
be8e71aa 2048 }
07be1b83 2049 s = HOPc(s,1);
786e8c11
YO
2050 DEBUG_TRIE_EXECUTE_r({
2051 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2052 });
07be1b83 2053 } else {
786e8c11
YO
2054 DEBUG_TRIE_EXECUTE_r(
2055 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2056 break;
2057 }
2058 }
be8e71aa
YO
2059 FREETMPS;
2060 LEAVE;
07be1b83
YO
2061 }
2062 break;
b3c9acc1 2063 default:
3c3eec57
GS
2064 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2065 break;
d6a28714 2066 }
6eb5f6b9
JH
2067 return 0;
2068 got_it:
2069 return s;
2070}
2071
fae667d5 2072
6eb5f6b9
JH
2073/*
2074 - regexec_flags - match a regexp against a string
2075 */
2076I32
288b8c02 2077Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9 2078 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2079/* stringarg: the point in the string at which to begin matching */
2080/* strend: pointer to null at end of string */
2081/* strbeg: real beginning of string */
2082/* minend: end of match must be >= minend bytes after stringarg. */
2083/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2084 * itself is accessed via the pointers above */
2085/* data: May be used for some additional optimizations.
2086 Currently its only used, with a U32 cast, for transmitting
2087 the ganch offset when doing a /g match. This will change */
2088/* nosave: For optimizations. */
2089
6eb5f6b9 2090{
97aff369 2091 dVAR;
288b8c02 2092 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2093 /*register*/ char *s;
eb578fdb 2094 regnode *c;
24b23f37 2095 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2096 I32 minlen; /* must match at least this many chars */
2097 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2098 I32 end_shift = 0; /* Same for the end. */ /* CC */
2099 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2100 char *scream_olds = NULL;
f2ed9b32 2101 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2102 I32 multiline;
f8fc2ecf 2103 RXi_GET_DECL(prog,progi);
3b0527fe 2104 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2105 regexp_paren_pair *swap = NULL;
a3621e74
YO
2106 GET_RE_DEBUG_FLAGS_DECL;
2107
7918f24d 2108 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2109 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2110
2111 /* Be paranoid... */
2112 if (prog == NULL || startpos == NULL) {
2113 Perl_croak(aTHX_ "NULL regexp parameter");
2114 return 0;
2115 }
2116
bbe252da 2117 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2118 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2119
f2ed9b32 2120 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2121 DEBUG_EXECUTE_r(
f2ed9b32 2122 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2123 "Matching");
2124 );
bac06658 2125
6eb5f6b9 2126 minlen = prog->minlen;
1de06328
YO
2127
2128 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2129 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2130 "String too short [regexec_flags]...\n"));
2131 goto phooey;
1aa99e6b 2132 }
6eb5f6b9 2133
1de06328 2134
6eb5f6b9 2135 /* Check validity of program. */
f8fc2ecf 2136 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2137 Perl_croak(aTHX_ "corrupted regexp program");
2138 }
2139
2140 PL_reg_flags = 0;
ed301438 2141 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2142 PL_reg_maxiter = 0;
2143
3c8556c3 2144 if (RX_UTF8(rx))
6eb5f6b9
JH
2145 PL_reg_flags |= RF_utf8;
2146
2147 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2148 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2149 PL_bostr = strbeg;
3b0527fe 2150 reginfo.sv = sv;
6eb5f6b9
JH
2151
2152 /* Mark end of line for $ (and such) */
2153 PL_regeol = strend;
2154
2155 /* see how far we have to get to not match where we matched before */
3b0527fe 2156 reginfo.till = startpos+minend;
6eb5f6b9 2157
6eb5f6b9
JH
2158 /* If there is a "must appear" string, look for it. */
2159 s = startpos;
2160
bbe252da 2161 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2162 MAGIC *mg;
2c296965 2163 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2164 reginfo.ganch = startpos + prog->gofs;
2c296965 2165 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2166 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2167 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2168 && SvMAGIC(sv)
14befaf4
DM
2169 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2170 && mg->mg_len >= 0) {
3b0527fe 2171 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2172 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2173 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2174
bbe252da 2175 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2176 if (s > reginfo.ganch)
6eb5f6b9 2177 goto phooey;
58e23c8d 2178 s = reginfo.ganch - prog->gofs;
2c296965 2179 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2180 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2181 if (s < strbeg)
2182 goto phooey;
6eb5f6b9
JH
2183 }
2184 }
58e23c8d 2185 else if (data) {
70685ca0 2186 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2187 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2188 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2189
2190 } else { /* pos() not defined */
3b0527fe 2191 reginfo.ganch = strbeg;
2c296965
YO
2192 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2193 "GPOS: reginfo.ganch = strbeg\n"));
2194 }
6eb5f6b9 2195 }
288b8c02 2196 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2197 /* We have to be careful. If the previous successful match
2198 was from this regex we don't want a subsequent partially
2199 successful match to clobber the old results.
2200 So when we detect this possibility we add a swap buffer
2201 to the re, and switch the buffer each match. If we fail
2202 we switch it back, otherwise we leave it swapped.
2203 */
2204 swap = prog->offs;
2205 /* do we need a save destructor here for eval dies? */
2206 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2207 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2208 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2209 PTR2UV(prog),
2210 PTR2UV(swap),
2211 PTR2UV(prog->offs)
2212 ));
c74340f9 2213 }
a0714e2c 2214 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2215 re_scream_pos_data d;
2216
2217 d.scream_olds = &scream_olds;
2218 d.scream_pos = &scream_pos;
288b8c02 2219 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2220 if (!s) {
a3621e74 2221 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2222 goto phooey; /* not present */
3fa9c3d7 2223 }
6eb5f6b9
JH
2224 }
2225
1de06328 2226
6eb5f6b9
JH
2227
2228 /* Simplest case: anchored match need be tried only once. */
2229 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2230 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2231 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2232 goto got_it;
bbe252da
YO
2233 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2234 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2235 {
2236 char *end;
2237
2238 if (minlen)
2239 dontbother = minlen - 1;
1aa99e6b 2240 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2241 /* for multiline we only have to try after newlines */
33b8afdf 2242 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2243 /* because of the goto we can not easily reuse the macros for bifurcating the
2244 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2245 if (utf8_target) {
2246 if (s == startpos)
2247 goto after_try_utf8;
2248 while (1) {
2249 if (regtry(&reginfo, &s)) {
2250 goto got_it;
2251 }
2252 after_try_utf8:
2253 if (s > end) {
2254 goto phooey;
2255 }
2256 if (prog->extflags & RXf_USE_INTUIT) {
2257 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2258 if (!s) {
2259 goto phooey;
2260 }
2261 }
2262 else {
2263 s += UTF8SKIP(s);
2264 }
2265 }
2266 } /* end search for check string in unicode */
2267 else {
2268 if (s == startpos) {
2269 goto after_try_latin;
2270 }
2271 while (1) {
2272 if (regtry(&reginfo, &s)) {
2273 goto got_it;
2274 }
2275 after_try_latin:
2276 if (s > end) {
2277 goto phooey;
2278 }
2279 if (prog->extflags & RXf_USE_INTUIT) {
2280 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2281 if (!s) {
2282 goto phooey;
2283 }
2284 }
2285 else {
2286 s++;
2287 }
2288 }
2289 } /* end search for check string in latin*/
2290 } /* end search for check string */
2291 else { /* search for newline */
2292 if (s > startpos) {
2293 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2294 s--;
92f3d482 2295 }
21eede78
YO
2296 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2297 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2298 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2299 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2300 goto got_it;
2301 }
92f3d482
YO
2302 }
2303 } /* end search for newline */
2304 } /* end anchored/multiline check string search */
6eb5f6b9 2305 goto phooey;
bbe252da 2306 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2307 {
486ec47a 2308 /* the warning about reginfo.ganch being used without initialization
bbe252da 2309 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2310 and we only enter this block when the same bit is set. */
58e23c8d 2311 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2312
2313 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2314 goto got_it;
2315 goto phooey;
2316 }
2317
2318 /* Messy cases: unanchored match. */
bbe252da 2319 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2320 /* we have /x+whatever/ */
f2ed9b32 2321 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2322 char ch;
bf93d4cc
GS
2323#ifdef DEBUGGING
2324 int did_match = 0;
2325#endif
f2ed9b32 2326 if (utf8_target) {
7e0d5ad7
KW
2327 if (! prog->anchored_utf8) {
2328 to_utf8_substr(prog);
2329 }
2330 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2331 REXEC_FBC_SCAN(
6eb5f6b9 2332 if (*s == ch) {
a3621e74 2333 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2334 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2335 s += UTF8SKIP(s);
2336 while (s < strend && *s == ch)
2337 s += UTF8SKIP(s);
2338 }
4cadc6a9 2339 );
7e0d5ad7 2340
6eb5f6b9
JH
2341 }
2342 else {
7e0d5ad7
KW
2343 if (! prog->anchored_substr) {
2344 if (! to_byte_substr(prog)) {
2345 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2346 non_utf8_target_but_utf8_required));
2347 goto phooey;
2348 }
2349 }
2350 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2351 REXEC_FBC_SCAN(
6eb5f6b9 2352 if (*s == ch) {
a3621e74 2353 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2354 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2355 s++;
2356 while (s < strend && *s == ch)
2357 s++;
2358 }
4cadc6a9 2359 );
6eb5f6b9 2360 }
a3621e74 2361 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2362 PerlIO_printf(Perl_debug_log,
b7953727
JH
2363 "Did not find anchored character...\n")
2364 );
6eb5f6b9 2365 }
a0714e2c
SS
2366 else if (prog->anchored_substr != NULL
2367 || prog->anchored_utf8 != NULL
2368 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2369 && prog->float_max_offset < strend - s)) {
2370 SV *must;
2371 I32 back_max;
2372 I32 back_min;
2373 char *last;
6eb5f6b9 2374 char *last1; /* Last position checked before */
bf93d4cc
GS
2375#ifdef DEBUGGING
2376 int did_match = 0;
2377#endif
33b8afdf 2378 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2379 if (utf8_target) {
2380 if (! prog->anchored_utf8) {
2381 to_utf8_substr(prog);
2382 }
2383 must = prog->anchored_utf8;
2384 }
2385 else {
2386 if (! prog->anchored_substr) {
2387 if (! to_byte_substr(prog)) {
2388 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2389 non_utf8_target_but_utf8_required));
2390 goto phooey;
2391 }
2392 }
2393 must = prog->anchored_substr;
2394 }
33b8afdf
JH
2395 back_max = back_min = prog->anchored_offset;
2396 } else {
7e0d5ad7
KW
2397 if (utf8_target) {
2398 if (! prog->float_utf8) {
2399 to_utf8_substr(prog);
2400 }
2401 must = prog->float_utf8;
2402 }
2403 else {
2404 if (! prog->float_substr) {
2405 if (! to_byte_substr(prog)) {
2406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2407 non_utf8_target_but_utf8_required));
2408 goto phooey;
2409 }
2410 }
2411 must = prog->float_substr;
2412 }
33b8afdf
JH
2413 back_max = prog->float_max_offset;
2414 back_min = prog->float_min_offset;
2415 }
1de06328 2416
1de06328
YO
2417 if (back_min<0) {
2418 last = strend;
2419 } else {
2420 last = HOP3c(strend, /* Cannot start after this */
2421 -(I32)(CHR_SVLEN(must)
2422 - (SvTAIL(must) != 0) + back_min), strbeg);
2423 }
6eb5f6b9
JH
2424 if (s > PL_bostr)
2425 last1 = HOPc(s, -1);
2426 else
2427 last1 = s - 1; /* bogus */
2428
a0288114 2429 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2430 check_substr==must. */
2431 scream_pos = -1;
2432 dontbother = end_shift;
2433 strend = HOPc(strend, -dontbother);
2434 while ( (s <= last) &&
c33e64f0 2435 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2436 (unsigned char*)strend, must,
c33e64f0 2437 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2438 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2439 if (HOPc(s, -back_max) > last1) {
2440 last1 = HOPc(s, -back_min);
2441 s = HOPc(s, -back_max);
2442 }
2443 else {
52657f30 2444 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2445
2446 last1 = HOPc(s, -back_min);
52657f30 2447 s = t;
6eb5f6b9 2448 }
f2ed9b32 2449 if (utf8_target) {
6eb5f6b9 2450 while (s <= last1) {
24b23f37 2451 if (regtry(&reginfo, &s))
6eb5f6b9 2452 goto got_it;
7016d6eb
DM
2453 if (s >= last1) {
2454 s++; /* to break out of outer loop */
2455 break;
2456 }
2457 s += UTF8SKIP(s);
6eb5f6b9
JH
2458 }
2459 }
2460 else {
2461 while (s <= last1) {
24b23f37 2462 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2463 goto got_it;
2464 s++;
2465 }
2466 }
2467 }
ab3bbdeb 2468 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2469 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2470 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2471 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2472 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2473 ? "anchored" : "floating"),
ab3bbdeb
YO
2474 quoted, RE_SV_TAIL(must));
2475 });
6eb5f6b9
JH
2476 goto phooey;
2477 }
f8fc2ecf 2478 else if ( (c = progi->regstclass) ) {
f14c76ed 2479 if (minlen) {
f8fc2ecf 2480 const OPCODE op = OP(progi->regstclass);
66e933ab 2481 /* don't bother with what can't match */
786e8c11 2482 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2483 strend = HOPc(strend, -(minlen - 1));
2484 }
a3621e74 2485 DEBUG_EXECUTE_r({
be8e71aa 2486 SV * const prop = sv_newmortal();
32fc9b6a 2487 regprop(prog, prop, c);
0df25f3d 2488 {
f2ed9b32 2489 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2490 s,strend-s,60);
0df25f3d 2491 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2492 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2493 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2494 quoted, (int)(strend - s));
0df25f3d 2495 }
ffc61ed2 2496 });
3b0527fe 2497 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2498 goto got_it;
07be1b83 2499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2500 }
2501 else {
2502 dontbother = 0;
a0714e2c 2503 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2504 /* Trim the end. */
6af40bd7 2505 char *last= NULL;
33b8afdf 2506 SV* float_real;
c33e64f0
FC
2507 STRLEN len;
2508 const char *little;
33b8afdf 2509
7e0d5ad7
KW
2510 if (utf8_target) {
2511 if (! prog->float_utf8) {
2512 to_utf8_substr(prog);
2513 }
2514 float_real = prog->float_utf8;
2515 }
2516 else {
2517 if (! prog->float_substr) {
2518 if (! to_byte_substr(prog)) {
2519 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2520 non_utf8_target_but_utf8_required));
2521 goto phooey;
2522 }
2523 }
2524 float_real = prog->float_substr;
2525 }
d6a28714 2526
c33e64f0
FC
2527 little = SvPV_const(float_real, len);
2528 if (SvTAIL(float_real)) {
7f18ad16
KW
2529 /* This means that float_real contains an artificial \n on
2530 * the end due to the presence of something like this:
2531 * /foo$/ where we can match both "foo" and "foo\n" at the
2532 * end of the string. So we have to compare the end of the
2533 * string first against the float_real without the \n and
2534 * then against the full float_real with the string. We
2535 * have to watch out for cases where the string might be
2536 * smaller than the float_real or the float_real without
2537 * the \n. */
1a13b075
YO
2538 char *checkpos= strend - len;
2539 DEBUG_OPTIMISE_r(
2540 PerlIO_printf(Perl_debug_log,
2541 "%sChecking for float_real.%s\n",
2542 PL_colors[4], PL_colors[5]));
2543 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2544 /* can't match, even if we remove the trailing \n
2545 * string is too short to match */
1a13b075
YO
2546 DEBUG_EXECUTE_r(
2547 PerlIO_printf(Perl_debug_log,
2548 "%sString shorter than required trailing substring, cannot match.%s\n",
2549 PL_colors[4], PL_colors[5]));
2550 goto phooey;
2551 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2552 /* can match, the end of the string matches without the
2553 * "\n" */
1a13b075
YO
2554 last = checkpos + 1;
2555 } else if (checkpos < strbeg) {
7f18ad16
KW
2556 /* cant match, string is too short when the "\n" is
2557 * included */
1a13b075
YO
2558 DEBUG_EXECUTE_r(
2559 PerlIO_printf(Perl_debug_log,
2560 "%sString does not contain required trailing substring, cannot match.%s\n",
2561 PL_colors[4], PL_colors[5]));
2562 goto phooey;
2563 } else if (!multiline) {
7f18ad16
KW
2564 /* non multiline match, so compare with the "\n" at the
2565 * end of the string */
1a13b075
YO
2566 if (memEQ(checkpos, little, len)) {
2567 last= checkpos;
2568 } else {
2569 DEBUG_EXECUTE_r(
2570 PerlIO_printf(Perl_debug_log,
2571 "%sString does not contain required trailing substring, cannot match.%s\n",
2572 PL_colors[4], PL_colors[5]));
2573 goto phooey;
2574 }
2575 } else {
7f18ad16
KW
2576 /* multiline match, so we have to search for a place
2577 * where the full string is located */
d6a28714 2578 goto find_last;
1a13b075 2579 }
c33e64f0 2580 } else {
d6a28714 2581 find_last:
9041c2e3 2582 if (len)
d6a28714 2583 last = rninstr(s, strend, little, little + len);
b8c5462f 2584 else
a0288114 2585 last = strend; /* matching "$" */
b8c5462f 2586 }
6af40bd7 2587 if (!last) {
7f18ad16
KW
2588 /* at one point this block contained a comment which was
2589 * probably incorrect, which said that this was a "should not
2590 * happen" case. Even if it was true when it was written I am
2591 * pretty sure it is not anymore, so I have removed the comment
2592 * and replaced it with this one. Yves */
6bda09f9
YO
2593 DEBUG_EXECUTE_r(
2594 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2595 "String does not contain required substring, cannot match.\n"
2596 ));
2597 goto phooey;
bf93d4cc 2598 }
d6a28714
JH
2599 dontbother = strend - last + prog->float_min_offset;
2600 }
2601 if (minlen && (dontbother < minlen))
2602 dontbother = minlen - 1;
2603 strend -= dontbother; /* this one's always in bytes! */
2604 /* We don't know much -- general case. */
f2ed9b32 2605 if (utf8_target) {
d6a28714 2606 for (;;) {
24b23f37 2607 if (regtry(&reginfo, &s))
d6a28714
JH
2608 goto got_it;
2609 if (s >= strend)
2610 break;
b8c5462f 2611 s += UTF8SKIP(s);
d6a28714
JH
2612 };
2613 }
2614 else {
2615 do {
24b23f37 2616 if (regtry(&reginfo, &s))
d6a28714
JH
2617 goto got_it;
2618 } while (s++ < strend);
2619 }
2620 }
2621
2622 /* Failure. */
2623 goto phooey;
2624
2625got_it:
495f47a5
DM
2626 DEBUG_BUFFERS_r(
2627 if (swap)
2628 PerlIO_printf(Perl_debug_log,
2629 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2630 PTR2UV(prog),
2631 PTR2UV(swap)
2632 );
2633 );
e9105d30 2634 Safefree(swap);
288b8c02 2635 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2636
ed301438 2637 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2638 restore_pos(aTHX_ prog);
5daac39c
NC
2639 if (RXp_PAREN_NAMES(prog))
2640 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2641
2642 /* make sure $`, $&, $', and $digit will work later */
2643 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2644 if (flags & REXEC_COPY_STR) {
f8c7b90f 2645#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2646 if ((SvIsCOW(sv)
2647 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2648 if (DEBUG_C_TEST) {
2649 PerlIO_printf(Perl_debug_log,
2650 "Copy on write: regexp capture, type %d\n",
2651 (int) SvTYPE(sv));
2652 }
77f8f7c1 2653 RX_MATCH_COPY_FREE(rx);
ed252734 2654 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2655 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2656 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2657 prog->sublen = PL_regeol - strbeg;
2658 prog->suboffset = 0;
2659 prog->subcoffset = 0;
ed252734
NC
2660 } else
2661#endif
2662 {
6502e081
DM
2663 I32 min = 0;
2664 I32 max = PL_regeol - strbeg;
2665 I32 sublen;
2666
2667 if ( (flags & REXEC_COPY_SKIP_POST)
2668 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2669 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2670 ) { /* don't copy $' part of string */
3de645a8 2671 U32 n = 0;
6502e081
DM
2672 max = -1;
2673 /* calculate the right-most part of the string covered
2674 * by a capture. Due to look-ahead, this may be to
2675 * the right of $&, so we have to scan all captures */
2676 while (n <= prog->lastparen) {
2677 if (prog->offs[n].end > max)
2678 max = prog->offs[n].end;
2679 n++;
2680 }
2681 if (max == -1)
2682 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2683 ? prog->offs[0].start
2684 : 0;
2685 assert(max >= 0 && max <= PL_regeol - strbeg);
2686 }
2687
2688 if ( (flags & REXEC_COPY_SKIP_PRE)
2689 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2690 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2691 ) { /* don't copy $` part of string */
3de645a8 2692 U32 n = 0;
6502e081
DM
2693 min = max;
2694 /* calculate the left-most part of the string covered
2695 * by a capture. Due to look-behind, this may be to
2696 * the left of $&, so we have to scan all captures */
2697 while (min && n <= prog->lastparen) {
2698 if ( prog->offs[n].start != -1
2699 && prog->offs[n].start < min)
2700 {
2701 min = prog->offs[n].start;
2702 }
2703 n++;
2704 }
2705 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2706 && min > prog->offs[0].end
2707 )
2708 min = prog->offs[0].end;
2709
2710 }
2711
2712 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2713 sublen = max - min;
2714
2715 if (RX_MATCH_COPIED(rx)) {
2716 if (sublen > prog->sublen)
2717 prog->subbeg =
2718 (char*)saferealloc(prog->subbeg, sublen+1);
2719 }
2720 else
2721 prog->subbeg = (char*)safemalloc(sublen+1);
2722 Copy(strbeg + min, prog->subbeg, sublen, char);
2723 prog->subbeg[sublen] = '\0';
2724 prog->suboffset = min;
2725 prog->sublen = sublen;
77f8f7c1 2726 RX_MATCH_COPIED_on(rx);
6502e081 2727 }
6502e081
DM
2728 prog->subcoffset = prog->suboffset;
2729 if (prog->suboffset && utf8_target) {
2730 /* Convert byte offset to chars.
2731 * XXX ideally should only compute this if @-/@+
2732 * has been seen, a la PL_sawampersand ??? */
2733
2734 /* If there's a direct correspondence between the
2735 * string which we're matching and the original SV,
2736 * then we can use the utf8 len cache associated with
2737 * the SV. In particular, it means that under //g,
2738 * sv_pos_b2u() will use the previously cached
2739 * position to speed up working out the new length of
2740 * subcoffset, rather than counting from the start of
2741 * the string each time. This stops
2742 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2743 * from going quadratic */
2744 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2745 sv_pos_b2u(sv, &(prog->subcoffset));
2746 else
2747 prog->subcoffset = utf8_length((U8*)strbeg,
2748 (U8*)(strbeg+prog->suboffset));
2749 }
d6a28714
JH
2750 }
2751 else {
6502e081 2752 RX_MATCH_COPY_FREE(rx);
d6a28714 2753 prog->subbeg = strbeg;
6502e081
DM
2754 prog->suboffset = 0;
2755 prog->subcoffset = 0;
d6a28714
JH
2756 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2757 }
2758 }
9041c2e3 2759
d6a28714
JH
2760 return 1;
2761
2762phooey:
a3621e74 2763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2764 PL_colors[4], PL_colors[5]));
ed301438 2765 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2766 restore_pos(aTHX_ prog);
e9105d30 2767 if (swap) {
c74340f9 2768 /* we failed :-( roll it back */
495f47a5
DM
2769 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2770 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2771 PTR2UV(prog),
2772 PTR2UV(prog->offs),
2773 PTR2UV(swap)
2774 ));
e9105d30
GG
2775 Safefree(prog->offs);
2776 prog->offs = swap;
2777 }
d6a28714
JH
2778 return 0;
2779}
2780
6bda09f9 2781
ec43f78b
DM
2782/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2783 * Do inc before dec, in case old and new rex are the same */
2784#define SET_reg_curpm(Re2) \
2785 if (PL_reg_state.re_state_eval_setup_done) { \
2786 (void)ReREFCNT_inc(Re2); \
2787 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2788 PM_SETRE((PL_reg_curpm), (Re2)); \
2789 }
2790
2791
d6a28714
JH
2792/*
2793 - regtry - try match at specific point
2794 */
2795STATIC I32 /* 0 failure, 1 success */
f73aaa43 2796S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2797{
97aff369 2798 dVAR;
d6a28714 2799 CHECKPOINT lastcp;
288b8c02
NC
2800 REGEXP *const rx = reginfo->prog;
2801 regexp *const prog = (struct regexp *)SvANY(rx);
f73aaa43 2802 I32 result;
f8fc2ecf 2803 RXi_GET_DECL(prog,progi);
a3621e74 2804 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2805
2806 PERL_ARGS_ASSERT_REGTRY;
2807
24b23f37 2808 reginfo->cutpoint=NULL;
d6a28714 2809
ed301438
DM
2810 if ((prog->extflags & RXf_EVAL_SEEN)
2811 && !PL_reg_state.re_state_eval_setup_done)
2812 {
d6a28714
JH
2813 MAGIC *mg;
2814
ed301438 2815 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2816 if (reginfo->sv) {
d6a28714 2817 /* Make $_ available to executed code. */
3b0527fe 2818 if (reginfo->sv != DEFSV) {
59f00321 2819 SAVE_DEFSV;
414bf5ae 2820 DEFSV_set(reginfo->sv);
b8c5462f 2821 }
d6a28714 2822
3b0527fe
DM
2823 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2824 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2825 /* prepare for quick setting of pos */
d300d9fa 2826#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2827 if (SvIsCOW(reginfo->sv))
2828 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2829#endif
3dab1dad 2830 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2831 &PL_vtbl_mglob, NULL, 0);
d6a28714 2832 mg->mg_len = -1;
b8c5462f 2833 }
d6a28714
JH
2834 PL_reg_magic = mg;
2835 PL_reg_oldpos = mg->mg_len;
4f639d21 2836 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2837 }
09687e5a 2838 if (!PL_reg_curpm) {
a02a5408 2839 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2840#ifdef USE_ITHREADS
2841 {
14a49a24 2842 SV* const repointer = &PL_sv_undef;
92313705
NC
2843 /* this regexp is also owned by the new PL_reg_curpm, which
2844 will try to free it. */
d2ece331 2845 av_push(PL_regex_padav, repointer);
09687e5a
AB
2846 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2847 PL_regex_pad = AvARRAY(PL_regex_padav);
2848 }
2849#endif
2850 }
ec43f78b 2851 SET_reg_curpm(rx);
d6a28714
JH
2852 PL_reg_oldcurpm = PL_curpm;
2853 PL_curpm = PL_reg_curpm;
07bc277f 2854 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2855 /* Here is a serious problem: we cannot rewrite subbeg,
2856 since it may be needed if this match fails. Thus
2857 $` inside (?{}) could fail... */
2858 PL_reg_oldsaved = prog->subbeg;
2859 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2860 PL_reg_oldsavedoffset = prog->suboffset;
2861 PL_reg_oldsavedcoffset = prog->suboffset;
f8c7b90f 2862#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2863 PL_nrs = prog->saved_copy;
2864#endif
07bc277f 2865 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2866 }
2867 else
bd61b366 2868 PL_reg_oldsaved = NULL;
d6a28714 2869 prog->subbeg = PL_bostr;
6502e081
DM
2870 prog->suboffset = 0;
2871 prog->subcoffset = 0;
d6a28714
JH
2872 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2873 }
97ca13b7 2874#ifdef DEBUGGING
f73aaa43 2875 PL_reg_starttry = *startposp;
97ca13b7 2876#endif
f73aaa43 2877 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2878 prog->lastparen = 0;
03994de8 2879 prog->lastcloseparen = 0;
d6a28714 2880 PL_regsize = 0;
d6a28714
JH
2881
2882 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2883 to do this again and again, prog->lastparen should take care of
3dd2943c 2884 this! --ilya*/
dafc8851
JH
2885
2886 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2887 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2888 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2889 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2890 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2891 * Meanwhile, this code *is* needed for the
daf18116
JH
2892 * above-mentioned test suite tests to succeed. The common theme
2893 * on those tests seems to be returning null fields from matches.
225593e1 2894 * --jhi updated by dapm */
dafc8851 2895#if 1
d6a28714 2896 if (prog->nparens) {
b93070ed 2897 regexp_paren_pair *pp = prog->offs;
eb578fdb 2898 I32 i;
b93070ed 2899 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2900 ++pp;
2901 pp->start = -1;
2902 pp->end = -1;
d6a28714
JH
2903 }
2904 }
dafc8851 2905#endif
02db2b7b 2906 REGCP_SET(lastcp);
f73aaa43
DM
2907 result = regmatch(reginfo, *startposp, progi->program + 1);
2908 if (result != -1) {
2909 prog->offs[0].end = result;
d6a28714
JH
2910 return 1;
2911 }
24b23f37 2912 if (reginfo->cutpoint)
f73aaa43 2913 *startposp= reginfo->cutpoint;
02db2b7b 2914 REGCP_UNWIND(lastcp);
d6a28714
JH
2915 return 0;
2916}
2917
02db2b7b 2918
8ba1375e
MJD
2919#define sayYES goto yes
2920#define sayNO goto no
262b90c4 2921#define sayNO_SILENT goto no_silent
8ba1375e 2922
f9f4320a
YO
2923/* we dont use STMT_START/END here because it leads to
2924 "unreachable code" warnings, which are bogus, but distracting. */
2925#define CACHEsayNO \
c476f425
DM
2926 if (ST.cache_mask) \
2927 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2928 sayNO
3298f257 2929
a3621e74 2930/* this is used to determine how far from the left messages like
265c4333
YO
2931 'failed...' are printed. It should be set such that messages
2932 are inline with the regop output that created them.
a3621e74 2933*/
265c4333 2934#define REPORT_CODE_OFF 32
a3621e74
YO
2935
2936
40a82448
DM
2937#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2938#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2939#define CHRTEST_NOT_A_CP_1 -999
2940#define CHRTEST_NOT_A_CP_2 -998
9e137952 2941
86545054
DM
2942#define SLAB_FIRST(s) (&(s)->states[0])
2943#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2944
5d9a96ca
DM
2945/* grab a new slab and return the first slot in it */
2946
2947STATIC regmatch_state *
2948S_push_slab(pTHX)
2949{
a35a87e7 2950#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2951 dMY_CXT;
2952#endif
5d9a96ca
DM
2953 regmatch_slab *s = PL_regmatch_slab->next;
2954 if (!s) {
2955 Newx(s, 1, regmatch_slab);
2956 s->prev = PL_regmatch_slab;
2957 s->next = NULL;
2958 PL_regmatch_slab->next = s;
2959 }
2960 PL_regmatch_slab = s;
86545054 2961 return SLAB_FIRST(s);
5d9a96ca 2962}
5b47454d 2963
95b24440 2964
40a82448
DM
2965/* push a new state then goto it */
2966
4d5016e5
DM
2967#define PUSH_STATE_GOTO(state, node, input) \
2968 pushinput = input; \
40a82448
DM
2969 scan = node; \
2970 st->resume_state = state; \
2971 goto push_state;
2972
2973/* push a new state with success backtracking, then goto it */
2974
4d5016e5
DM
2975#define PUSH_YES_STATE_GOTO(state, node, input) \
2976 pushinput = input; \
40a82448
DM
2977 scan = node; \
2978 st->resume_state = state; \
2979 goto push_yes_state;
2980
aa283a38 2981
aa283a38 2982
4d5016e5 2983
d6a28714 2984/*
95b24440 2985
bf1f174e
DM
2986regmatch() - main matching routine
2987
2988This is basically one big switch statement in a loop. We execute an op,
2989set 'next' to point the next op, and continue. If we come to a point which
2990we may need to backtrack to on failure such as (A|B|C), we push a
2991backtrack state onto the backtrack stack. On failure, we pop the top
2992state, and re-enter the loop at the state indicated. If there are no more
2993states to pop, we return failure.
2994
2995Sometimes we also need to backtrack on success; for example /A+/, where
2996after successfully matching one A, we need to go back and try to
2997match another one; similarly for lookahead assertions: if the assertion
2998completes successfully, we backtrack to the state just before the assertion
2999and then carry on. In these cases, the pushed state is marked as
3000'backtrack on success too'. This marking is in fact done by a chain of
3001pointers, each pointing to the previous 'yes' state. On success, we pop to
3002the nearest yes state, discarding any intermediate failure-only states.
3003Sometimes a yes state is pushed just to force some cleanup code to be
3004called at the end of a successful match or submatch; e.g. (??{$re}) uses
3005it to free the inner regex.
3006
3007Note that failure backtracking rewinds the cursor position, while
3008success backtracking leaves it alone.
3009
3010A pattern is complete when the END op is executed, while a subpattern
3011such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3012ops trigger the "pop to last yes state if any, otherwise return true"
3013behaviour.
3014
3015A common convention in this function is to use A and B to refer to the two
3016subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3017the subpattern to be matched possibly multiple times, while B is the entire
3018rest of the pattern. Variable and state names reflect this convention.
3019
3020The states in the main switch are the union of ops and failure/success of
3021substates associated with with that op. For example, IFMATCH is the op
3022that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3023'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3024successfully matched A and IFMATCH_A_fail is a state saying that we have
3025just failed to match A. Resume states always come in pairs. The backtrack
3026state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3027at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3028on success or failure.
3029
3030The struct that holds a backtracking state is actually a big union, with
3031one variant for each major type of op. The variable st points to the
3032top-most backtrack struct. To make the code clearer, within each
3033block of code we #define ST to alias the relevant union.
3034
3035Here's a concrete example of a (vastly oversimplified) IFMATCH
3036implementation:
3037
3038 switch (state) {
3039 ....
3040
3041#define ST st->u.ifmatch
3042
3043 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3044 ST.foo = ...; // some state we wish to save
95b24440 3045 ...
bf1f174e
DM
3046 // push a yes backtrack state with a resume value of
3047 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3048 // first node of A:
4d5016e5 3049 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3050 // NOTREACHED
3051
3052 case IFMATCH_A: // we have successfully executed A; now continue with B
3053 next = B;
3054 bar = ST.foo; // do something with the preserved value
3055 break;
3056
3057 case IFMATCH_A_fail: // A failed, so the assertion failed
3058 ...; // do some housekeeping, then ...
3059 sayNO; // propagate the failure
3060
3061#undef ST
95b24440 3062
bf1f174e
DM
3063 ...
3064 }
95b24440 3065
bf1f174e
DM
3066For any old-timers reading this who are familiar with the old recursive
3067approach, the code above is equivalent to:
95b24440 3068
bf1f174e
DM
3069 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3070 {
3071 int foo = ...
95b24440 3072 ...
bf1f174e
DM
3073 if (regmatch(A)) {
3074 next = B;
3075 bar = foo;
3076 break;
95b24440 3077 }
bf1f174e
DM
3078 ...; // do some housekeeping, then ...
3079 sayNO; // propagate the failure
95b24440 3080 }
bf1f174e
DM
3081
3082The topmost backtrack state, pointed to by st, is usually free. If you
3083want to claim it, populate any ST.foo fields in it with values you wish to
3084save, then do one of
3085
4d5016e5
DM
3086 PUSH_STATE_GOTO(resume_state, node, newinput);
3087 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3088
3089which sets that backtrack state's resume value to 'resume_state', pushes a
3090new free entry to the top of the backtrack stack, then goes to 'node'.
3091On backtracking, the free slot is popped, and the saved state becomes the
3092new free state. An ST.foo field in this new top state can be temporarily
3093accessed to retrieve values, but once the main loop is re-entered, it
3094becomes available for reuse.
3095
3096Note that the depth of the backtrack stack constantly increases during the
3097left-to-right execution of the pattern, rather than going up and down with
3098the pattern nesting. For example the stack is at its maximum at Z at the
3099end of the pattern, rather than at X in the following:
3100
3101 /(((X)+)+)+....(Y)+....Z/
3102
3103The only exceptions to this are lookahead/behind assertions and the cut,
3104(?>A), which pop all the backtrack states associated with A before
3105continuing.
3106
486ec47a 3107Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3108PL_regmatch_state and st always point to the currently active state,
3109and PL_regmatch_slab points to the slab currently containing
3110PL_regmatch_state. The first time regmatch() is called, the first slab is
3111allocated, and is never freed until interpreter destruction. When the slab
3112is full, a new one is allocated and chained to the end. At exit from
3113regmatch(), slabs allocated since entry are freed.
3114
3115*/
95b24440 3116
40a82448 3117
5bc10b2c 3118#define DEBUG_STATE_pp(pp) \
265c4333 3119 DEBUG_STATE_r({ \
f2ed9b32 3120 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3121 PerlIO_printf(Perl_debug_log, \
5d458dd8 3122 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3123 depth*2, "", \
13d6edb4 3124 PL_reg_name[st->resume_state], \
5d458dd8
YO
3125 ((st==yes_state||st==mark_state) ? "[" : ""), \
3126 ((st==yes_state) ? "Y" : ""), \
3127 ((st==mark_state) ? "M" : ""), \
3128 ((st==yes_state||st==mark_state) ? "]" : "") \
3129 ); \
265c4333 3130 });
5bc10b2c 3131
40a82448 3132
3dab1dad 3133#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3134
3df15adc 3135#ifdef DEBUGGING
5bc10b2c 3136
ab3bbdeb 3137STATIC void
f2ed9b32 3138S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3139 const char *start, const char *end, const char *blurb)
3140{
efd26800 3141 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3142
3143 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3144
ab3bbdeb
YO
3145 if (!PL_colorset)
3146 reginitcolors();
3147 {
3148 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3149 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3150
f2ed9b32 3151 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3152 start, end - start, 60);
3153
3154 PerlIO_printf(Perl_debug_log,
3155 "%s%s REx%s %s against %s\n",
3156 PL_colors[4], blurb, PL_colors[5], s0, s1);
3157
f2ed9b32 3158 if (utf8_target||utf8_pat)
1de06328
YO
3159 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3160 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3161 utf8_pat && utf8_target ? " and " : "",
3162 utf8_target ? "string" : ""
ab3bbdeb
YO
3163 );
3164 }
3165}
3df15adc
YO
3166
3167STATIC void
786e8c11
YO
3168S_dump_exec_pos(pTHX_ const char *locinput,
3169 const regnode *scan,
3170 const char *loc_regeol,
3171 const char *loc_bostr,
3172 const char *loc_reg_starttry,
f2ed9b32 3173 const bool utf8_target)
07be1b83 3174{
786e8c11 3175 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3176 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3177 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3178 /* The part of the string before starttry has one color
3179 (pref0_len chars), between starttry and current
3180 position another one (pref_len - pref0_len chars),
3181 after the current position the third one.
3182 We assume that pref0_len <= pref_len, otherwise we
3183 decrease pref0_len. */
786e8c11
YO
3184 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3185 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3186 int pref0_len;
3187
7918f24d
NC
3188 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3189
f2ed9b32 3190 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3191 pref_len++;
786e8c11
YO
3192 pref0_len = pref_len - (locinput - loc_reg_starttry);
3193 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3194 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3195 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3196 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3197 l--;
3198 if (pref0_len < 0)
3199 pref0_len = 0;
3200 if (pref0_len > pref_len)
3201 pref0_len = pref_len;
3202 {
f2ed9b32 3203 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3204
ab3bbdeb 3205 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3206 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3207
ab3bbdeb 3208 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3209 (locinput - pref_len + pref0_len),
1de06328 3210 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3211
ab3bbdeb 3212 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3213 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3214
1de06328 3215 const STRLEN tlen=len0+len1+len2;
3df15adc 3216 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3217 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3218 (IV)(locinput - loc_bostr),
07be1b83 3219 len0, s0,
07be1b83 3220 len1, s1,
07be1b83 3221 (docolor ? "" : "> <"),
07be1b83 3222 len2, s2,
f9f4320a 3223 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3224 "");
3225 }
3226}
3df15adc 3227
07be1b83
YO
3228#endif
3229
0a4db386
YO
3230/* reg_check_named_buff_matched()
3231 * Checks to see if a named buffer has matched. The data array of
3232 * buffer numbers corresponding to the buffer is expected to reside
3233 * in the regexp->data->data array in the slot stored in the ARG() of
3234 * node involved. Note that this routine doesn't actually care about the
3235 * name, that information is not preserved from compilation to execution.
3236 * Returns the index of the leftmost defined buffer with the given name
3237 * or 0 if non of the buffers matched.
3238 */
3239STATIC I32
7918f24d
NC
3240S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3241{
0a4db386 3242 I32 n;
f8fc2ecf 3243 RXi_GET_DECL(rex,rexi);
ad64d0ec 3244 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3245 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3246
3247 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3248
0a4db386 3249 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3250 if ((I32)rex->lastparen >= nums[n] &&
3251 rex->offs[nums[n]].end != -1)
0a4db386
YO
3252 {
3253 return nums[n];
3254 }
3255 }
3256 return 0;
3257}
3258
2f554ef7
DM
3259
3260/* free all slabs above current one - called during LEAVE_SCOPE */
3261
3262STATIC void
3263S_clear_backtrack_stack(pTHX_ void *p)
3264{
3265 regmatch_slab *s = PL_regmatch_slab->next;
3266 PERL_UNUSED_ARG(p);
3267
3268 if (!s)
3269 return;
3270 PL_regmatch_slab->next = NULL;
3271 while (s) {
3272 regmatch_slab * const osl = s;
3273 s = s->next;
3274 Safefree(osl);
3275 }
3276}
c74f6de9 3277static bool
79a2a0e8 3278S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
c74f6de9 3279{
79a2a0e8
KW
3280 /* This function determines if there are one or two characters that match
3281 * the first character of the passed-in EXACTish node <text_node>, and if
3282 * so, returns them in the passed-in pointers.
c74f6de9 3283 *
79a2a0e8
KW
3284 * If it determines that no possible character in the target string can
3285 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3286 * the first character in <text_node> requires UTF-8 to represent, and the
3287 * target string isn't in UTF-8.)
c74f6de9 3288 *
79a2a0e8
KW
3289 * If there are more than two characters that could match the beginning of
3290 * <text_node>, or if more context is required to determine a match or not,
3291 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3292 *
3293 * The motiviation behind this function is to allow the caller to set up
3294 * tight loops for matching. If <text_node> is of type EXACT, there is
3295 * only one possible character that can match its first character, and so
3296 * the situation is quite simple. But things get much more complicated if
3297 * folding is involved. It may be that the first character of an EXACTFish
3298 * node doesn't participate in any possible fold, e.g., punctuation, so it
3299 * can be matched only by itself. The vast majority of characters that are
3300 * in folds match just two things, their lower and upper-case equivalents.
3301 * But not all are like that; some have multiple possible matches, or match
3302 * sequences of more than one character. This function sorts all that out.
3303 *
3304 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3305 * loop of trying to match A*, we know we can't exit where the thing
3306 * following it isn't a B. And something can't be a B unless it is the
3307 * beginning of B. By putting a quick test for that beginning in a tight
3308 * loop, we can rule out things that can't possibly be B without having to
3309 * break out of the loop, thus avoiding work. Similarly, if A is a single
3310 * character, we can make a tight loop matching A*, using the outputs of
3311 * this function.
3312 *
3313 * If the target string to match isn't in UTF-8, and there aren't
3314 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3315 * the one or two possible octets (which are characters in this situation)
3316 * that can match. In all cases, if there is only one character that can
3317 * match, *<c1p> and *<c2p> will be identical.
3318 *
3319 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3320 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3321 * can match the beginning of <text_node>. They should be declared with at
3322 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3323 * undefined what these contain.) If one or both of the buffers are
3324 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3325 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3326 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3327 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3328 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3329
3330 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8
KW
3331
3332 UV c1, c2;
3333 bool use_chrtest_void = FALSE;
3334
3335 /* Used when we have both utf8 input and utf8 output, to avoid converting
3336 * to/from code points */
3337 bool utf8_has_been_setup = FALSE;
3338
c74f6de9
KW
3339 dVAR;
3340
b4291290 3341 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3342
79a2a0e8
KW
3343 if (OP(text_node) == EXACT) {
3344
3345 /* In an exact node, only one thing can be matched, that first
3346 * character. If both the pat and the target are UTF-8, we can just
3347 * copy the input to the output, avoiding finding the code point of
3348 * that character */
3349 if (! UTF_PATTERN) {
3350 c2 = c1 = *pat;
3351 }
3352 else if (utf8_target) {
3353 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3354 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3355 utf8_has_been_setup = TRUE;
3356 }
3357 else {
3358 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3359 }
79a2a0e8
KW
3360 }
3361 else /* an EXACTFish node */
3362 if ((UTF_PATTERN
3363 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3364 pat + STR_LEN(text_node)))
3365 || (! UTF_PATTERN
3366 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3367 pat + STR_LEN(text_node))))
3368 {
3369 /* Multi-character folds require more context to sort out. Also
3370 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3371 * handled outside this routine */
3372 use_chrtest_void = TRUE;
3373 }
3374 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3375 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3376 if (c1 > 256) {
3377 /* Load the folds hash, if not already done */
3378 SV** listp;
3379 if (! PL_utf8_foldclosures) {
3380 if (! PL_utf8_tofold) {
3381 U8 dummy[UTF8_MAXBYTES+1];
3382
3383 /* Force loading this by folding an above-Latin1 char */
3384 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3385 assert(PL_utf8_tofold); /* Verify that worked */
3386 }
3387 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3388 }
3389
3390 /* The fold closures data structure is a hash with the keys being
3391 * the UTF-8 of every character that is folded to, like 'k', and
3392 * the values each an array of all code points that fold to its
3393 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3394 * not included */
3395 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3396 (char *) pat,
3397 UTF8SKIP(pat),
3398 FALSE))))
3399 {
3400 /* Not found in the hash, therefore there are no folds
3401 * containing it, so there is only a single character that
3402 * could match */
3403 c2 = c1;
3404 }
3405 else { /* Does participate in folds */
3406 AV* list = (AV*) *listp;
3407 if (av_len(list) != 1) {
3408
3409 /* If there aren't exactly two folds to this, it is outside
3410 * the scope of this function */
3411 use_chrtest_void = TRUE;
3412 }
3413 else { /* There are two. Get them */
3414 SV** c_p = av_fetch(list, 0, FALSE);
3415 if (c_p == NULL) {
3416 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3417 }
3418 c1 = SvUV(*c_p);
3419
3420 c_p = av_fetch(list, 1, FALSE);
3421 if (c_p == NULL) {
3422 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3423 }
3424 c2 = SvUV(*c_p);
3425
3426 /* Folds that cross the 255/256 boundary are forbidden if
3427 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3428 * pattern character is above 256, and its only other match
3429 * is below 256, the only legal match will be to itself.
3430 * We have thrown away the original, so have to compute
3431 * which is the one above 255 */
3432 if ((c1 < 256) != (c2 < 256)) {
3433 if (OP(text_node) == EXACTFL
3434 || (OP(text_node) == EXACTFA
3435 && (isASCII(c1) || isASCII(c2))))
3436 {
3437 if (c1 < 256) {
3438 c1 = c2;
3439 }
3440 else {
3441 c2 = c1;
3442 }
3443 }
3444 }
3445 }
3446 }
3447 }
3448 else /* Here, c1 is < 255 */
3449 if (utf8_target
3450 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3451 && OP(text_node) != EXACTFL
3452 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3453 {
3454 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3455 * folds to this character in the pattern. All such cases except
3456 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3457 * involved in their folds, so are outside the scope of this
3458 * function */
3459 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3460 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3461 }
3462 else {
3463 use_chrtest_void = TRUE;
3464 }
c74f6de9
KW
3465 }
3466 else { /* Here nothing above Latin1 can fold to the pattern character */