This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge branch 'handy' into blead
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
ef07e810 96#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 97#define RF_warned 2 /* warned about big count? */
faec1544 98
ab3bbdeb 99#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 100
f2ed9b32 101#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 102
c74f6de9
KW
103#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
104
a687059c
LW
105#ifndef STATIC
106#define STATIC static
107#endif
108
e0193e47 109/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
635cd5d4 112#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 113 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 114
c277df42
IZ
115/*
116 * Forwards.
117 */
118
f2ed9b32 119#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 120#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 121
3dab1dad
YO
122#define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
52657f30 124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
125 : (U8*)(pos + off))
126#define HOPBACKc(pos, off) \
07be1b83
YO
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
8e11feef 130 ? (U8*)pos - off \
3dab1dad 131 : NULL)
efb30f32 132
e7409c1b 133#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 134#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 135
7016d6eb
DM
136
137#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138#define NEXTCHR_IS_EOS (nextchr < 0)
139
140#define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
142
143#define SET_locinput(p) \
144 locinput = (p); \
145 SET_nextchr
146
147
20d0b1e9 148/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 149#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 150 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 151 bool ok; \
9c4fdda1 152 ENTER; save_re_context(); \
cf54f63a
JL
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
156/* Doesn't do an assert to verify that is correct */
157#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 158 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 159 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
160 ENTER; save_re_context(); \
161 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
c1976674 162 PERL_UNUSED_VAR(throw_away); \
9c4fdda1 163 LEAVE; } } STMT_END
37e2e78e 164
1a4fad37
AL
165#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
166#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
51371543 167
37e2e78e 168#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
169 /* No asserts are done for some of these, in case called on a */ \
170 /* Unicode version in which they map to nothing */ \
9566f14a 171 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 172 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 173
1dcf4a1b 174#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 175
ee9a90b8
KW
176/* The actual code for CCC_TRY, which uses several variables from the routine
177 * it's callable from. It is designed to be the bulk of a case statement.
178 * FUNC is the macro or function to call on non-utf8 targets that indicate if
179 * nextchr matches the class.
180 * UTF8_TEST is the whole test string to use for utf8 targets
181 * LOAD is what to use to test, and if not present to load in the swash for the
182 * class
183 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
184 * UTF8_TEST test.
185 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
186 * utf8 and a variant, load the swash if necessary and test using the utf8
187 * test. Advance to the next character if test is ok, otherwise fail; If not
188 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
189 * fails, or advance to the next character */
190
191#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
9566f14a 192 if (NEXTCHR_IS_EOS) { \
ee9a90b8
KW
193 sayNO; \
194 } \
195 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
196 LOAD_UTF8_CHARCLASS(CLASS, STR); \
197 if (POS_OR_NEG (UTF8_TEST)) { \
198 sayNO; \
199 } \
ee9a90b8 200 } \
28b98f76
DM
201 else if (POS_OR_NEG (FUNC(nextchr))) { \
202 sayNO; \
ee9a90b8 203 } \
28b98f76 204 goto increment_locinput;
980866de 205
ee9a90b8
KW
206/* Handle the non-locale cases for a character class and its complement. It
207 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
208 * This is because that code fails when the test succeeds, so we want to have
209 * the test fail so that the code succeeds. The swash is stored in a
210 * predictable PL_ place */
cfaf538b
KW
211#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
212 CLASS, STR) \
ee9a90b8
KW
213 case NAME: \
214 _CCC_TRY_CODE( !, FUNC, \
215 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
216 (U8*)locinput, TRUE)), \
217 CLASS, STR) \
218 case NNAME: \
1dcf4a1b 219 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
220 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
221 (U8*)locinput, TRUE)), \
9566f14a 222 CLASS, STR)
ee9a90b8
KW
223/* Generate the case statements for both locale and non-locale character
224 * classes in regmatch for classes that don't have special unicode semantics.
225 * Locales don't use an immediate swash, but an intermediary special locale
226 * function that is called on the pointer to the current place in the input
227 * string. That function will resolve to needing the same swash. One might
228 * think that because we don't know what the locale will match, we shouldn't
229 * check with the swash loading function that it loaded properly; ie, that we
230 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
231 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
232 * irrelevant here */
233#define CCC_TRY(NAME, NNAME, FUNC, \
234 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 235 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
236 CLASS, STR) \
237 case NAMEL: \
238 PL_reg_flags |= RF_tainted; \
239 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
240 case NNAMEL: \
241 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
242 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
243 CLASS, STR) \
cfaf538b 244 case NAMEA: \
7016d6eb 245 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
cfaf538b
KW
246 sayNO; \
247 } \
248 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
3640db6b 249 locinput++; \
cfaf538b
KW
250 break; \
251 case NNAMEA: \
7016d6eb 252 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
cfaf538b
KW
253 sayNO; \
254 } \
28b98f76 255 goto increment_locinput; \
ee9a90b8
KW
256 /* Generate the non-locale cases */ \
257 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
258
259/* This is like CCC_TRY, but has an extra set of parameters for generating case
260 * statements to handle separate Unicode semantics nodes */
261#define CCC_TRY_U(NAME, NNAME, FUNC, \
262 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
263 NAMEU, NNAMEU, FUNCU, \
cfaf538b 264 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 265 CLASS, STR) \
cfaf538b
KW
266 CCC_TRY(NAME, NNAME, FUNC, \
267 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
268 NAMEA, NNAMEA, FUNCA, \
269 CLASS, STR) \
ee9a90b8 270 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 271
3dab1dad
YO
272/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
273
5f80c4cf 274/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
275/* it would be nice to rework regcomp.sym to generate this stuff. sigh
276 *
277 * NOTE that *nothing* that affects backtracking should be in here, specifically
278 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
279 * node that is in between two EXACT like nodes when ascertaining what the required
280 * "follow" character is. This should probably be moved to regex compile time
281 * although it may be done at run time beause of the REF possibility - more
282 * investigation required. -- demerphq
283*/
3e901dc0
YO
284#define JUMPABLE(rn) ( \
285 OP(rn) == OPEN || \
286 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
287 OP(rn) == EVAL || \
cca55fe3
JP
288 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
289 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 290 OP(rn) == KEEPS || \
3dab1dad 291 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 292)
ee9b8eae 293#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 294
ee9b8eae
YO
295#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
296
297#if 0
298/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
299 we don't need this definition. */
300#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 301#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
302#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
303
304#else
305/* ... so we use this as its faster. */
306#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 307#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
308#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
309#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
310
311#endif
e2d8ce26 312
a84d97b6
HS
313/*
314 Search for mandatory following text node; for lookahead, the text must
315 follow but for lookbehind (rn->flags != 0) we skip to the next step.
316*/
cca55fe3 317#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
318 while (JUMPABLE(rn)) { \
319 const OPCODE type = OP(rn); \
320 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 321 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 322 else if (type == PLUS) \
cca55fe3 323 rn = NEXTOPER(rn); \
3dab1dad 324 else if (type == IFMATCH) \
a84d97b6 325 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 326 else rn += NEXT_OFF(rn); \
3dab1dad 327 } \
5f80c4cf 328} STMT_END
74750237 329
c476f425 330
acfe0abc 331static void restore_pos(pTHX_ void *arg);
51371543 332
87c0511b 333#define REGCP_PAREN_ELEMS 3
f067efbf 334#define REGCP_OTHER_ELEMS 3
e0fa7e2b 335#define REGCP_FRAME_ELEMS 1
620d5b66
NC
336/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
337 * are needed for the regexp context stack bookkeeping. */
338
76e3520e 339STATIC CHECKPOINT
b93070ed 340S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 341{
97aff369 342 dVAR;
a3b680e6 343 const int retval = PL_savestack_ix;
a3b680e6 344 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
345 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
346 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 347 I32 p;
40a82448 348 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 349
b93070ed
DM
350 PERL_ARGS_ASSERT_REGCPPUSH;
351
e49a9654 352 if (paren_elems_to_push < 0)
5637ef5b
NC
353 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
354 paren_elems_to_push);
e49a9654 355
e0fa7e2b
NC
356 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
357 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
358 " out of range (%lu-%ld)",
359 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 360
620d5b66 361 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 362
495f47a5
DM
363 DEBUG_BUFFERS_r(
364 if ((int)PL_regsize > (int)parenfloor)
365 PerlIO_printf(Perl_debug_log,
366 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
367 PTR2UV(rex),
368 PTR2UV(rex->offs)
369 );
370 );
87c0511b 371 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 372/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
373 SSPUSHINT(rex->offs[p].end);
374 SSPUSHINT(rex->offs[p].start);
1ca2007e 375 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 376 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
377 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
378 (UV)p,
379 (IV)rex->offs[p].start,
380 (IV)rex->offs[p].start_tmp,
381 (IV)rex->offs[p].end
40a82448 382 ));
a0d0e21e 383 }
b1ce53c5 384/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 385 SSPUSHINT(PL_regsize);
b93070ed
DM
386 SSPUSHINT(rex->lastparen);
387 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 388 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 389
a0d0e21e
LW
390 return retval;
391}
392
c277df42 393/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
394#define REGCP_SET(cp) \
395 DEBUG_STATE_r( \
ab3bbdeb 396 PerlIO_printf(Perl_debug_log, \
e4f74956 397 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
398 (IV)PL_savestack_ix)); \
399 cp = PL_savestack_ix
c3464db5 400
ab3bbdeb 401#define REGCP_UNWIND(cp) \
e4f74956 402 DEBUG_STATE_r( \
ab3bbdeb 403 if (cp != PL_savestack_ix) \
e4f74956
YO
404 PerlIO_printf(Perl_debug_log, \
405 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
406 (IV)(cp), (IV)PL_savestack_ix)); \
407 regcpblow(cp)
c277df42 408
a8d1f4b4
DM
409#define UNWIND_PAREN(lp, lcp) \
410 for (n = rex->lastparen; n > lp; n--) \
411 rex->offs[n].end = -1; \
412 rex->lastparen = n; \
413 rex->lastcloseparen = lcp;
414
415
f067efbf 416STATIC void
b93070ed 417S_regcppop(pTHX_ regexp *rex)
a0d0e21e 418{
97aff369 419 dVAR;
e0fa7e2b 420 UV i;
87c0511b 421 U32 paren;
a3621e74
YO
422 GET_RE_DEBUG_FLAGS_DECL;
423
7918f24d
NC
424 PERL_ARGS_ASSERT_REGCPPOP;
425
b1ce53c5 426 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 427 i = SSPOPUV;
e0fa7e2b
NC
428 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
429 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
430 rex->lastcloseparen = SSPOPINT;
431 rex->lastparen = SSPOPINT;
3280af22 432 PL_regsize = SSPOPINT;
b1ce53c5 433
620d5b66 434 i -= REGCP_OTHER_ELEMS;
b1ce53c5 435 /* Now restore the parentheses context. */
495f47a5
DM
436 DEBUG_BUFFERS_r(
437 if (i || rex->lastparen + 1 <= rex->nparens)
438 PerlIO_printf(Perl_debug_log,
439 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
440 PTR2UV(rex),
441 PTR2UV(rex->offs)
442 );
443 );
87c0511b 444 paren = PL_regsize;
620d5b66 445 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 446 I32 tmps;
1ca2007e 447 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 448 rex->offs[paren].start = SSPOPINT;
cf93c79d 449 tmps = SSPOPINT;
b93070ed
DM
450 if (paren <= rex->lastparen)
451 rex->offs[paren].end = tmps;
495f47a5
DM
452 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
453 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
454 (UV)paren,
455 (IV)rex->offs[paren].start,
456 (IV)rex->offs[paren].start_tmp,
457 (IV)rex->offs[paren].end,
458 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 459 );
87c0511b 460 paren--;
a0d0e21e 461 }
daf18116 462#if 1
dafc8851
JH
463 /* It would seem that the similar code in regtry()
464 * already takes care of this, and in fact it is in
465 * a better location to since this code can #if 0-ed out
466 * but the code in regtry() is needed or otherwise tests
467 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
468 * (as of patchlevel 7877) will fail. Then again,
469 * this code seems to be necessary or otherwise
225593e1
DM
470 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
471 * --jhi updated by dapm */
b93070ed 472 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 473 if (i > PL_regsize)
b93070ed
DM
474 rex->offs[i].start = -1;
475 rex->offs[i].end = -1;
495f47a5
DM
476 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
477 " \\%"UVuf": %s ..-1 undeffing\n",
478 (UV)i,
479 (i > PL_regsize) ? "-1" : " "
480 ));
a0d0e21e 481 }
dafc8851 482#endif
a0d0e21e
LW
483}
484
74088413
DM
485/* restore the parens and associated vars at savestack position ix,
486 * but without popping the stack */
487
488STATIC void
489S_regcp_restore(pTHX_ regexp *rex, I32 ix)
490{
491 I32 tmpix = PL_savestack_ix;
492 PL_savestack_ix = ix;
493 regcppop(rex);
494 PL_savestack_ix = tmpix;
495}
496
02db2b7b 497#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 498
a687059c 499/*
e50aee73 500 * pregexec and friends
a687059c
LW
501 */
502
76234dfb 503#ifndef PERL_IN_XSUB_RE
a687059c 504/*
c277df42 505 - pregexec - match a regexp against a string
a687059c 506 */
c277df42 507I32
5aaab254 508Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 509 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
510/* stringarg: the point in the string at which to begin matching */
511/* strend: pointer to null at end of string */
512/* strbeg: real beginning of string */
513/* minend: end of match must be >= minend bytes after stringarg. */
514/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
515 * itself is accessed via the pointers above */
516/* nosave: For optimizations. */
c277df42 517{
7918f24d
NC
518 PERL_ARGS_ASSERT_PREGEXEC;
519
c277df42 520 return
9041c2e3 521 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
522 nosave ? 0 : REXEC_COPY_STR);
523}
76234dfb 524#endif
22e551b9 525
9041c2e3 526/*
cad2e5aa
JH
527 * Need to implement the following flags for reg_anch:
528 *
529 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
530 * USE_INTUIT_ML
531 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
532 * INTUIT_AUTORITATIVE_ML
533 * INTUIT_ONCE_NOML - Intuit can match in one location only.
534 * INTUIT_ONCE_ML
535 *
536 * Another flag for this function: SECOND_TIME (so that float substrs
537 * with giant delta may be not rechecked).
538 */
539
540/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
541
3f7c398e 542/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
543 Otherwise, only SvCUR(sv) is used to get strbeg. */
544
545/* XXXX We assume that strpos is strbeg unless sv. */
546
6eb5f6b9
JH
547/* XXXX Some places assume that there is a fixed substring.
548 An update may be needed if optimizer marks as "INTUITable"
549 RExen without fixed substrings. Similarly, it is assumed that
550 lengths of all the strings are no more than minlen, thus they
551 cannot come from lookahead.
40d049e4
YO
552 (Or minlen should take into account lookahead.)
553 NOTE: Some of this comment is not correct. minlen does now take account
554 of lookahead/behind. Further research is required. -- demerphq
555
556*/
6eb5f6b9 557
2c2d71f5
JH
558/* A failure to find a constant substring means that there is no need to make
559 an expensive call to REx engine, thus we celebrate a failure. Similarly,
560 finding a substring too deep into the string means that less calls to
30944b6d
IZ
561 regtry() should be needed.
562
563 REx compiler's optimizer found 4 possible hints:
564 a) Anchored substring;
565 b) Fixed substring;
566 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 567 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 568 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
569 string which does not contradict any of them.
570 */
2c2d71f5 571
6eb5f6b9
JH
572/* Most of decisions we do here should have been done at compile time.
573 The nodes of the REx which we used for the search should have been
574 deleted from the finite automaton. */
575
cad2e5aa 576char *
288b8c02 577Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 578 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 579{
97aff369 580 dVAR;
8d919b0a 581 struct regexp *const prog = ReANY(rx);
eb578fdb 582 I32 start_shift = 0;
cad2e5aa 583 /* Should be nonnegative! */
eb578fdb
KW
584 I32 end_shift = 0;
585 char *s;
586 SV *check;
a1933d95 587 char *strbeg;
cad2e5aa 588 char *t;
f2ed9b32 589 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 590 I32 ml_anch;
eb578fdb 591 char *other_last = NULL; /* other substr checked before this */
bd61b366 592 char *check_at = NULL; /* check substr found at this pos */
d8080198 593 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 594 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 595 RXi_GET_DECL(prog,progi);
30944b6d 596#ifdef DEBUGGING
890ce7af 597 const char * const i_strpos = strpos;
30944b6d 598#endif
a3621e74
YO
599 GET_RE_DEBUG_FLAGS_DECL;
600
7918f24d 601 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
602 PERL_UNUSED_ARG(flags);
603 PERL_UNUSED_ARG(data);
7918f24d 604
f2ed9b32 605 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 606
3c8556c3 607 if (RX_UTF8(rx)) {
b8d68ded
JH
608 PL_reg_flags |= RF_utf8;
609 }
ab3bbdeb 610 DEBUG_EXECUTE_r(
f2ed9b32 611 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
612 sv ? "Guessing start of match in sv for"
613 : "Guessing start of match in string for");
2a782b5b 614 );
cad2e5aa 615
c344f387
JH
616 /* CHR_DIST() would be more correct here but it makes things slow. */
617 if (prog->minlen > strend - strpos) {
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 619 "String too short... [re_intuit_start]\n"));
cad2e5aa 620 goto fail;
2c2d71f5 621 }
1de06328 622
7016d6eb
DM
623 /* XXX we need to pass strbeg as a separate arg: the following is
624 * guesswork and can be wrong... */
625 if (sv && SvPOK(sv)) {
626 char * p = SvPVX(sv);
627 STRLEN cur = SvCUR(sv);
628 if (p <= strpos && strpos < p + cur) {
629 strbeg = p;
630 assert(p <= strend && strend <= p + cur);
631 }
632 else
633 strbeg = strend - cur;
634 }
635 else
636 strbeg = strpos;
637
1aa99e6b 638 PL_regeol = strend;
f2ed9b32 639 if (utf8_target) {
33b8afdf
JH
640 if (!prog->check_utf8 && prog->check_substr)
641 to_utf8_substr(prog);
642 check = prog->check_utf8;
643 } else {
7e0d5ad7
KW
644 if (!prog->check_substr && prog->check_utf8) {
645 if (! to_byte_substr(prog)) {
6b54ddc5 646 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
647 }
648 }
33b8afdf
JH
649 check = prog->check_substr;
650 }
bbe252da
YO
651 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
652 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
653 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 654 && !multiline ) ); /* Check after \n? */
cad2e5aa 655
7e25d62c 656 if (!ml_anch) {
bbe252da
YO
657 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
658 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 659 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
660 && sv && !SvROK(sv)
661 && (strpos != strbeg)) {
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
663 goto fail;
664 }
d46b78c6
KW
665 if (prog->check_offset_min == prog->check_offset_max
666 && !(prog->extflags & RXf_CANY_SEEN)
667 && ! multiline) /* /m can cause \n's to match that aren't
668 accounted for in the string max length.
669 See [perl #115242] */
670 {
2c2d71f5 671 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
672 I32 slen;
673
1aa99e6b 674 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 675
653099ff
GS
676 if (SvTAIL(check)) {
677 slen = SvCUR(check); /* >= 1 */
cad2e5aa 678
9041c2e3 679 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 680 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 681 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 682 goto fail_finish;
cad2e5aa
JH
683 }
684 /* Now should match s[0..slen-2] */
685 slen--;
3f7c398e 686 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 687 || (slen > 1
3f7c398e 688 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 689 report_neq:
a3621e74 690 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
691 goto fail_finish;
692 }
cad2e5aa 693 }
3f7c398e 694 else if (*SvPVX_const(check) != *s
653099ff 695 || ((slen = SvCUR(check)) > 1
3f7c398e 696 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 697 goto report_neq;
c315bfe8 698 check_at = s;
2c2d71f5 699 goto success_at_start;
7e25d62c 700 }
cad2e5aa 701 }
2c2d71f5 702 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 703 s = strpos;
2c2d71f5 704 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
705 end_shift = prog->check_end_shift;
706
2c2d71f5 707 if (!ml_anch) {
a3b680e6 708 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 709 - (SvTAIL(check) != 0);
a3b680e6 710 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
711
712 if (end_shift < eshift)
713 end_shift = eshift;
714 }
cad2e5aa 715 }
2c2d71f5 716 else { /* Can match at random position */
cad2e5aa
JH
717 ml_anch = 0;
718 s = strpos;
1de06328
YO
719 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
720 end_shift = prog->check_end_shift;
721
722 /* end shift should be non negative here */
cad2e5aa
JH
723 }
724
bcdf7404 725#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 726 if (end_shift < 0)
1de06328 727 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 728 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
729#endif
730
2c2d71f5
JH
731 restart:
732 /* Find a possible match in the region s..strend by looking for
733 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
734
735 {
736 I32 srch_start_shift = start_shift;
737 I32 srch_end_shift = end_shift;
c33e64f0
FC
738 U8* start_point;
739 U8* end_point;
1de06328
YO
740 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
741 srch_end_shift -= ((strbeg - s) - srch_start_shift);
742 srch_start_shift = strbeg - s;
743 }
6bda09f9 744 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
745 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
746 (IV)prog->check_offset_min,
747 (IV)srch_start_shift,
748 (IV)srch_end_shift,
749 (IV)prog->check_end_shift);
750 });
751
bbe252da 752 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
753 start_point= (U8*)(s + srch_start_shift);
754 end_point= (U8*)(strend - srch_end_shift);
755 } else {
756 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
757 end_point= HOP3(strend, -srch_end_shift, strbeg);
758 }
6bda09f9 759 DEBUG_OPTIMISE_MORE_r({
56570a2c 760 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 761 (int)(end_point - start_point),
fc8cd66c 762 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
763 start_point);
764 });
765
766 s = fbm_instr( start_point, end_point,
7fba1cd6 767 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 768 }
cad2e5aa
JH
769 /* Update the count-of-usability, remove useless subpatterns,
770 unshift s. */
2c2d71f5 771
ab3bbdeb 772 DEBUG_EXECUTE_r({
f2ed9b32 773 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
774 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
775 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 776 (s ? "Found" : "Did not find"),
f2ed9b32 777 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
778 ? "anchored" : "floating"),
779 quoted,
780 RE_SV_TAIL(check),
781 (s ? " at offset " : "...\n") );
782 });
2c2d71f5
JH
783
784 if (!s)
785 goto fail_finish;
2c2d71f5 786 /* Finish the diagnostic message */
a3621e74 787 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 788
1de06328
YO
789 /* XXX dmq: first branch is for positive lookbehind...
790 Our check string is offset from the beginning of the pattern.
791 So we need to do any stclass tests offset forward from that
792 point. I think. :-(
793 */
794
795
796
797 check_at=s;
798
799
2c2d71f5
JH
800 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
801 Start with the other substr.
802 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 803 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
804 *always* match. Probably should be marked during compile...
805 Probably it is right to do no SCREAM here...
806 */
807
f2ed9b32 808 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
809 : (prog->float_substr && prog->anchored_substr))
810 {
30944b6d 811 /* Take into account the "other" substring. */
2c2d71f5
JH
812 /* XXXX May be hopelessly wrong for UTF... */
813 if (!other_last)
6eb5f6b9 814 other_last = strpos;
f2ed9b32 815 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
816 do_other_anchored:
817 {
890ce7af
AL
818 char * const last = HOP3c(s, -start_shift, strbeg);
819 char *last1, *last2;
be8e71aa 820 char * const saved_s = s;
33b8afdf 821 SV* must;
2c2d71f5 822
2c2d71f5
JH
823 t = s - prog->check_offset_max;
824 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 825 && (!utf8_target
0ce71af7 826 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 827 && t > strpos)))
6f207bd3 828 NOOP;
2c2d71f5
JH
829 else
830 t = strpos;
1aa99e6b 831 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
832 if (t < other_last) /* These positions already checked */
833 t = other_last;
1aa99e6b 834 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
835 if (last < last1)
836 last1 = last;
1de06328
YO
837 /* XXXX It is not documented what units *_offsets are in.
838 We assume bytes, but this is clearly wrong.
839 Meaning this code needs to be carefully reviewed for errors.
840 dmq.
841 */
842
2c2d71f5 843 /* On end-of-str: see comment below. */
f2ed9b32 844 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
845 if (must == &PL_sv_undef) {
846 s = (char*)NULL;
1de06328 847 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
848 }
849 else
850 s = fbm_instr(
851 (unsigned char*)t,
852 HOP3(HOP3(last1, prog->anchored_offset, strend)
853 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
854 must,
7fba1cd6 855 multiline ? FBMrf_MULTILINE : 0
33b8afdf 856 );
ab3bbdeb 857 DEBUG_EXECUTE_r({
f2ed9b32 858 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
859 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
860 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 861 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
862 quoted, RE_SV_TAIL(must));
863 });
864
865
2c2d71f5
JH
866 if (!s) {
867 if (last1 >= last2) {
a3621e74 868 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
869 ", giving up...\n"));
870 goto fail_finish;
871 }
a3621e74 872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 873 ", trying floating at offset %ld...\n",
be8e71aa 874 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
875 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
876 s = HOP3c(last, 1, strend);
2c2d71f5
JH
877 goto restart;
878 }
879 else {
a3621e74 880 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 881 (long)(s - i_strpos)));
1aa99e6b
IH
882 t = HOP3c(s, -prog->anchored_offset, strbeg);
883 other_last = HOP3c(s, 1, strend);
be8e71aa 884 s = saved_s;
2c2d71f5
JH
885 if (t == strpos)
886 goto try_at_start;
2c2d71f5
JH
887 goto try_at_offset;
888 }
30944b6d 889 }
2c2d71f5
JH
890 }
891 else { /* Take into account the floating substring. */
33b8afdf 892 char *last, *last1;
be8e71aa 893 char * const saved_s = s;
33b8afdf
JH
894 SV* must;
895
896 t = HOP3c(s, -start_shift, strbeg);
897 last1 = last =
898 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
899 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
900 last = HOP3c(t, prog->float_max_offset, strend);
901 s = HOP3c(t, prog->float_min_offset, strend);
902 if (s < other_last)
903 s = other_last;
2c2d71f5 904 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 905 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
906 /* fbm_instr() takes into account exact value of end-of-str
907 if the check is SvTAIL(ed). Since false positives are OK,
908 and end-of-str is not later than strend we are OK. */
909 if (must == &PL_sv_undef) {
910 s = (char*)NULL;
1de06328 911 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
912 }
913 else
2c2d71f5 914 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
915 (unsigned char*)last + SvCUR(must)
916 - (SvTAIL(must)!=0),
7fba1cd6 917 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 918 DEBUG_EXECUTE_r({
f2ed9b32 919 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
920 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
921 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 922 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
923 quoted, RE_SV_TAIL(must));
924 });
33b8afdf
JH
925 if (!s) {
926 if (last1 == last) {
a3621e74 927 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
928 ", giving up...\n"));
929 goto fail_finish;
2c2d71f5 930 }
a3621e74 931 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 932 ", trying anchored starting at offset %ld...\n",
be8e71aa 933 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
934 other_last = last;
935 s = HOP3c(t, 1, strend);
936 goto restart;
937 }
938 else {
a3621e74 939 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
940 (long)(s - i_strpos)));
941 other_last = s; /* Fix this later. --Hugo */
be8e71aa 942 s = saved_s;
33b8afdf
JH
943 if (t == strpos)
944 goto try_at_start;
945 goto try_at_offset;
946 }
2c2d71f5 947 }
cad2e5aa 948 }
2c2d71f5 949
1de06328 950
9ef43ace 951 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 952
6bda09f9 953 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
954 PerlIO_printf(Perl_debug_log,
955 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
956 (IV)prog->check_offset_min,
957 (IV)prog->check_offset_max,
958 (IV)(s-strpos),
959 (IV)(t-strpos),
960 (IV)(t-s),
961 (IV)(strend-strpos)
962 )
963 );
964
2c2d71f5 965 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 966 && (!utf8_target
9ef43ace 967 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
968 && t > strpos)))
969 {
2c2d71f5
JH
970 /* Fixed substring is found far enough so that the match
971 cannot start at strpos. */
972 try_at_offset:
cad2e5aa 973 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
974 /* Eventually fbm_*() should handle this, but often
975 anchored_offset is not 0, so this check will not be wasted. */
976 /* XXXX In the code below we prefer to look for "^" even in
977 presence of anchored substrings. And we search even
978 beyond the found float position. These pessimizations
979 are historical artefacts only. */
980 find_anchor:
2c2d71f5 981 while (t < strend - prog->minlen) {
cad2e5aa 982 if (*t == '\n') {
4ee3650e 983 if (t < check_at - prog->check_offset_min) {
f2ed9b32 984 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
985 /* Since we moved from the found position,
986 we definitely contradict the found anchored
30944b6d
IZ
987 substr. Due to the above check we do not
988 contradict "check" substr.
989 Thus we can arrive here only if check substr
990 is float. Redo checking for "other"=="fixed".
991 */
9041c2e3 992 strpos = t + 1;
a3621e74 993 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 994 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
995 goto do_other_anchored;
996 }
4ee3650e
GS
997 /* We don't contradict the found floating substring. */
998 /* XXXX Why not check for STCLASS? */
cad2e5aa 999 s = t + 1;
a3621e74 1000 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1001 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1002 goto set_useful;
1003 }
4ee3650e
GS
1004 /* Position contradicts check-string */
1005 /* XXXX probably better to look for check-string
1006 than for "\n", so one should lower the limit for t? */
a3621e74 1007 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1008 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1009 other_last = strpos = s = t + 1;
cad2e5aa
JH
1010 goto restart;
1011 }
1012 t++;
1013 }
a3621e74 1014 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1015 PL_colors[0], PL_colors[1]));
2c2d71f5 1016 goto fail_finish;
cad2e5aa 1017 }
f5952150 1018 else {
a3621e74 1019 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1020 PL_colors[0], PL_colors[1]));
f5952150 1021 }
cad2e5aa
JH
1022 s = t;
1023 set_useful:
f2ed9b32 1024 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1025 }
1026 else {
f5952150 1027 /* The found string does not prohibit matching at strpos,
2c2d71f5 1028 - no optimization of calling REx engine can be performed,
f5952150
GS
1029 unless it was an MBOL and we are not after MBOL,
1030 or a future STCLASS check will fail this. */
2c2d71f5
JH
1031 try_at_start:
1032 /* Even in this situation we may use MBOL flag if strpos is offset
1033 wrt the start of the string. */
05b4157f 1034 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1035 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1036 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1037 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1038 {
cad2e5aa
JH
1039 t = strpos;
1040 goto find_anchor;
1041 }
a3621e74 1042 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1043 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1044 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1045 );
2c2d71f5 1046 success_at_start:
bbe252da 1047 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1048 && (utf8_target ? (
33b8afdf
JH
1049 prog->check_utf8 /* Could be deleted already */
1050 && --BmUSEFUL(prog->check_utf8) < 0
1051 && (prog->check_utf8 == prog->float_utf8)
1052 ) : (
1053 prog->check_substr /* Could be deleted already */
1054 && --BmUSEFUL(prog->check_substr) < 0
1055 && (prog->check_substr == prog->float_substr)
1056 )))
66e933ab 1057 {
cad2e5aa 1058 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1059 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1060 /* XXX Does the destruction order has to change with utf8_target? */
1061 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1062 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1063 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1064 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1065 check = NULL; /* abort */
cad2e5aa 1066 s = strpos;
486ec47a 1067 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1068 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1069 if (prog->intflags & PREGf_IMPLICIT)
1070 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1071 /* XXXX This is a remnant of the old implementation. It
1072 looks wasteful, since now INTUIT can use many
6eb5f6b9 1073 other heuristics. */
bbe252da 1074 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1075 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1076 }
1077 else
1078 s = strpos;
1079 }
1080
6eb5f6b9
JH
1081 /* Last resort... */
1082 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1083 /* trie stclasses are too expensive to use here, we are better off to
1084 leave it to regmatch itself */
f8fc2ecf 1085 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1086 /* minlen == 0 is possible if regstclass is \b or \B,
1087 and the fixed substr is ''$.
1088 Since minlen is already taken into account, s+1 is before strend;
1089 accidentally, minlen >= 1 guaranties no false positives at s + 1
1090 even for \b or \B. But (minlen? 1 : 0) below assumes that
1091 regstclass does not come from lookahead... */
1092 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1093 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1094 const U8* const str = (U8*)STRING(progi->regstclass);
1095 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1096 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1097 : 1);
1de06328
YO
1098 char * endpos;
1099 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1100 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1101 else if (prog->float_substr || prog->float_utf8)
1102 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1103 else
1104 endpos= strend;
1105
d8080198
YO
1106 if (checked_upto < s)
1107 checked_upto = s;
1108 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1109 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1110
6eb5f6b9 1111 t = s;
d8080198
YO
1112 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1113 if (s) {
1114 checked_upto = s;
1115 } else {
6eb5f6b9 1116#ifdef DEBUGGING
cbbf8932 1117 const char *what = NULL;
6eb5f6b9
JH
1118#endif
1119 if (endpos == strend) {
a3621e74 1120 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1121 "Could not match STCLASS...\n") );
1122 goto fail;
1123 }
a3621e74 1124 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1125 "This position contradicts STCLASS...\n") );
bbe252da 1126 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1127 goto fail;
d8080198
YO
1128 checked_upto = HOPBACKc(endpos, start_shift);
1129 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1130 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1131 /* Contradict one of substrings */
33b8afdf 1132 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1133 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1134 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1135 hop_and_restart:
1aa99e6b 1136 s = HOP3c(t, 1, strend);
66e933ab
GS
1137 if (s + start_shift + end_shift > strend) {
1138 /* XXXX Should be taken into account earlier? */
a3621e74 1139 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1140 "Could not match STCLASS...\n") );
1141 goto fail;
1142 }
5e39e1e5
HS
1143 if (!check)
1144 goto giveup;
a3621e74 1145 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1146 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1147 what, (long)(s + start_shift - i_strpos)) );
1148 goto restart;
1149 }
66e933ab 1150 /* Have both, check_string is floating */
6eb5f6b9
JH
1151 if (t + start_shift >= check_at) /* Contradicts floating=check */
1152 goto retry_floating_check;
1153 /* Recheck anchored substring, but not floating... */
9041c2e3 1154 s = check_at;
5e39e1e5
HS
1155 if (!check)
1156 goto giveup;
a3621e74 1157 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1158 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1159 (long)(other_last - i_strpos)) );
1160 goto do_other_anchored;
1161 }
60e71179
GS
1162 /* Another way we could have checked stclass at the
1163 current position only: */
1164 if (ml_anch) {
1165 s = t = t + 1;
5e39e1e5
HS
1166 if (!check)
1167 goto giveup;
a3621e74 1168 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1169 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1170 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1171 goto try_at_offset;
66e933ab 1172 }
f2ed9b32 1173 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1174 goto fail;
486ec47a 1175 /* Check is floating substring. */
6eb5f6b9
JH
1176 retry_floating_check:
1177 t = check_at - start_shift;
a3621e74 1178 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1179 goto hop_and_restart;
1180 }
b7953727 1181 if (t != s) {
a3621e74 1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1183 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1184 (long)(t - i_strpos), (long)(s - i_strpos))
1185 );
1186 }
1187 else {
a3621e74 1188 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1189 "Does not contradict STCLASS...\n");
1190 );
1191 }
6eb5f6b9 1192 }
5e39e1e5 1193 giveup:
a3621e74 1194 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1195 PL_colors[4], (check ? "Guessed" : "Giving up"),
1196 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1197 return s;
2c2d71f5
JH
1198
1199 fail_finish: /* Substring not found */
33b8afdf 1200 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1201 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1202 fail:
a3621e74 1203 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1204 PL_colors[4], PL_colors[5]));
bd61b366 1205 return NULL;
cad2e5aa 1206}
9661b544 1207
a0a388a1
YO
1208#define DECL_TRIE_TYPE(scan) \
1209 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1210 trie_type = ((scan->flags == EXACT) \
1211 ? (utf8_target ? trie_utf8 : trie_plain) \
1212 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1213
1214#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1215uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1216 STRLEN skiplen; \
1217 switch (trie_type) { \
1218 case trie_utf8_fold: \
1219 if ( foldlen>0 ) { \
1220 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1221 foldlen -= len; \
1222 uscan += len; \
1223 len=0; \
1224 } else { \
1225 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1226 len = UTF8SKIP(uc); \
1227 skiplen = UNISKIP( uvc ); \
1228 foldlen -= skiplen; \
1229 uscan = foldbuf + skiplen; \
1230 } \
1231 break; \
1232 case trie_latin_utf8_fold: \
1233 if ( foldlen>0 ) { \
1234 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1235 foldlen -= len; \
1236 uscan += len; \
1237 len=0; \
1238 } else { \
1239 len = 1; \
1240 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1241 skiplen = UNISKIP( uvc ); \
1242 foldlen -= skiplen; \
1243 uscan = foldbuf + skiplen; \
1244 } \
1245 break; \
1246 case trie_utf8: \
1247 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1248 break; \
1249 case trie_plain: \
1250 uvc = (UV)*uc; \
1251 len = 1; \
1252 } \
1253 if (uvc < 256) { \
1254 charid = trie->charmap[ uvc ]; \
1255 } \
1256 else { \
1257 charid = 0; \
1258 if (widecharmap) { \
1259 SV** const svpp = hv_fetch(widecharmap, \
1260 (char*)&uvc, sizeof(UV), 0); \
1261 if (svpp) \
1262 charid = (U16)SvIV(*svpp); \
1263 } \
1264 } \
4cadc6a9
YO
1265} STMT_END
1266
4cadc6a9
YO
1267#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1268STMT_START { \
1269 while (s <= e) { \
1270 if ( (CoNd) \
fac1af77 1271 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1272 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1273 goto got_it; \
1274 s++; \
1275 } \
1276} STMT_END
1277
1278#define REXEC_FBC_UTF8_SCAN(CoDe) \
1279STMT_START { \
7016d6eb 1280 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
4cadc6a9
YO
1281 CoDe \
1282 s += uskip; \
1283 } \
1284} STMT_END
1285
1286#define REXEC_FBC_SCAN(CoDe) \
1287STMT_START { \
1288 while (s < strend) { \
1289 CoDe \
1290 s++; \
1291 } \
1292} STMT_END
1293
1294#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1295REXEC_FBC_UTF8_SCAN( \
1296 if (CoNd) { \
24b23f37 1297 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1298 goto got_it; \
1299 else \
1300 tmp = doevery; \
1301 } \
1302 else \
1303 tmp = 1; \
1304)
1305
1306#define REXEC_FBC_CLASS_SCAN(CoNd) \
1307REXEC_FBC_SCAN( \
1308 if (CoNd) { \
24b23f37 1309 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1310 goto got_it; \
1311 else \
1312 tmp = doevery; \
1313 } \
1314 else \
1315 tmp = 1; \
1316)
1317
1318#define REXEC_FBC_TRYIT \
24b23f37 1319if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1320 goto got_it
1321
e1d1eefb 1322#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1323 if (utf8_target) { \
e1d1eefb
YO
1324 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1325 } \
1326 else { \
1327 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1328 }
e1d1eefb 1329
4cadc6a9 1330#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1331 if (utf8_target) { \
4cadc6a9
YO
1332 UtFpReLoAd; \
1333 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1334 } \
1335 else { \
1336 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1337 }
4cadc6a9
YO
1338
1339#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1340 PL_reg_flags |= RF_tainted; \
f2ed9b32 1341 if (utf8_target) { \
4cadc6a9
YO
1342 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1343 } \
1344 else { \
1345 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1346 }
4cadc6a9 1347
786e8c11
YO
1348#define DUMP_EXEC_POS(li,s,doutf8) \
1349 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1350
cfaf538b
KW
1351
1352#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1353 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1354 tmp = TEST_NON_UTF8(tmp); \
1355 REXEC_FBC_UTF8_SCAN( \
1356 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1357 tmp = !tmp; \
1358 IF_SUCCESS; \
1359 } \
1360 else { \
1361 IF_FAIL; \
1362 } \
1363 ); \
1364
1365#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1366 if (s == PL_bostr) { \
1367 tmp = '\n'; \
1368 } \
1369 else { \
1370 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1371 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1372 } \
1373 tmp = TeSt1_UtF8; \
1374 LOAD_UTF8_CHARCLASS_ALNUM(); \
1375 REXEC_FBC_UTF8_SCAN( \
1376 if (tmp == ! (TeSt2_UtF8)) { \
1377 tmp = !tmp; \
1378 IF_SUCCESS; \
1379 } \
1380 else { \
1381 IF_FAIL; \
1382 } \
1383 ); \
1384
63ac0dad
KW
1385/* The only difference between the BOUND and NBOUND cases is that
1386 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1387 * NBOUND. This is accomplished by passing it in either the if or else clause,
1388 * with the other one being empty */
1389#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1390 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1391
1392#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1393 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1394
1395#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1396 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1397
1398#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1399 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1400
63ac0dad
KW
1401
1402/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1403 * be passed in completely with the variable name being tested, which isn't
1404 * such a clean interface, but this is easier to read than it was before. We
1405 * are looking for the boundary (or non-boundary between a word and non-word
1406 * character. The utf8 and non-utf8 cases have the same logic, but the details
1407 * must be different. Find the "wordness" of the character just prior to this
1408 * one, and compare it with the wordness of this one. If they differ, we have
1409 * a boundary. At the beginning of the string, pretend that the previous
1410 * character was a new-line */
cfaf538b 1411#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1412 if (utf8_target) { \
cfaf538b 1413 UTF8_CODE \
63ac0dad
KW
1414 } \
1415 else { /* Not utf8 */ \
1416 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1417 tmp = TEST_NON_UTF8(tmp); \
1418 REXEC_FBC_SCAN( \
1419 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1420 tmp = !tmp; \
1421 IF_SUCCESS; \
1422 } \
1423 else { \
1424 IF_FAIL; \
1425 } \
1426 ); \
1427 } \
1428 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1429 goto got_it;
1430
786e8c11
YO
1431/* We know what class REx starts with. Try to find this position... */
1432/* if reginfo is NULL, its a dryrun */
1433/* annoyingly all the vars in this routine have different names from their counterparts
1434 in regmatch. /grrr */
1435
3c3eec57 1436STATIC char *
07be1b83 1437S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1438 const char *strend, regmatch_info *reginfo)
a687059c 1439{
27da23d5 1440 dVAR;
bbe252da 1441 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1442 char *pat_string; /* The pattern's exactish string */
1443 char *pat_end; /* ptr to end char of pat_string */
1444 re_fold_t folder; /* Function for computing non-utf8 folds */
1445 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1446 STRLEN ln;
5dab1207 1447 STRLEN lnc;
eb578fdb 1448 STRLEN uskip;
fac1af77
KW
1449 U8 c1;
1450 U8 c2;
6eb5f6b9 1451 char *e;
eb578fdb
KW
1452 I32 tmp = 1; /* Scratch variable? */
1453 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1454 UV utf8_fold_flags = 0;
f8fc2ecf 1455 RXi_GET_DECL(prog,progi);
7918f24d
NC
1456
1457 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1458
6eb5f6b9
JH
1459 /* We know what class it must start with. */
1460 switch (OP(c)) {
6eb5f6b9 1461 case ANYOF:
e0193e47 1462 if (utf8_target) {
b1e3e569 1463 REXEC_FBC_UTF8_CLASS_SCAN(
635cd5d4 1464 reginclass(prog, c, (U8*)s, utf8_target));
388cc4de
HS
1465 }
1466 else {
6ef69d56 1467 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1468 }
6eb5f6b9 1469 break;
f33976b4 1470 case CANY:
4cadc6a9 1471 REXEC_FBC_SCAN(
24b23f37 1472 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1473 goto got_it;
1474 else
1475 tmp = doevery;
4cadc6a9 1476 );
f33976b4 1477 break;
2f7f8cb1
KW
1478
1479 case EXACTFA:
1480 if (UTF_PATTERN || utf8_target) {
1481 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1482 goto do_exactf_utf8;
1483 }
1484 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1485 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1486 goto do_exactf_non_utf8; /* isn't dealt with by these */
1487
6eb5f6b9 1488 case EXACTF:
62bf7766 1489 if (utf8_target) {
77a6d856
KW
1490
1491 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1492 utf8_fold_flags = 0;
fac1af77
KW
1493 goto do_exactf_utf8;
1494 }
1495 fold_array = PL_fold;
1496 folder = foldEQ;
1497 goto do_exactf_non_utf8;
1498
1499 case EXACTFL:
1500 if (UTF_PATTERN || utf8_target) {
17580e7a 1501 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1502 goto do_exactf_utf8;
1503 }
1504 fold_array = PL_fold_locale;
1505 folder = foldEQ_locale;
16d951b7
KW
1506 goto do_exactf_non_utf8;
1507
3c760661
KW
1508 case EXACTFU_SS:
1509 if (UTF_PATTERN) {
1510 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1511 }
1512 goto do_exactf_utf8;
1513
fab2782b 1514 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1515 case EXACTFU:
1516 if (UTF_PATTERN || utf8_target) {
77a6d856 1517 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1518 goto do_exactf_utf8;
1519 }
1520
1521 /* Any 'ss' in the pattern should have been replaced by regcomp,
1522 * so we don't have to worry here about this single special case
1523 * in the Latin1 range */
1524 fold_array = PL_fold_latin1;
1525 folder = foldEQ_latin1;
fac1af77
KW
1526
1527 /* FALL THROUGH */
1528
62bf7766
KW
1529 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1530 are no glitches with fold-length differences
1531 between the target string and pattern */
fac1af77
KW
1532
1533 /* The idea in the non-utf8 EXACTF* cases is to first find the
1534 * first character of the EXACTF* node and then, if necessary,
1535 * case-insensitively compare the full text of the node. c1 is the
1536 * first character. c2 is its fold. This logic will not work for
1537 * Unicode semantics and the german sharp ss, which hence should
1538 * not be compiled into a node that gets here. */
1539 pat_string = STRING(c);
1540 ln = STR_LEN(c); /* length to match in octets/bytes */
1541
8a90a8fe
KW
1542 /* We know that we have to match at least 'ln' bytes (which is the
1543 * same as characters, since not utf8). If we have to match 3
1544 * characters, and there are only 2 availabe, we know without
1545 * trying that it will fail; so don't start a match past the
1546 * required minimum number from the far end */
fac1af77
KW
1547 e = HOP3c(strend, -((I32)ln), s);
1548
1549 if (!reginfo && e < s) {
1550 e = s; /* Due to minlen logic of intuit() */
1551 }
1552
1553 c1 = *pat_string;
1554 c2 = fold_array[c1];
1555 if (c1 == c2) { /* If char and fold are the same */
1556 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1557 }
1558 else {
1559 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1560 }
1561 break;
1562
1563 do_exactf_utf8:
e067297c
KW
1564 {
1565 unsigned expansion;
1566
fac1af77
KW
1567
1568 /* If one of the operands is in utf8, we can't use the simpler
1569 * folding above, due to the fact that many different characters
1570 * can have the same fold, or portion of a fold, or different-
1571 * length fold */
1572 pat_string = STRING(c);
1573 ln = STR_LEN(c); /* length to match in octets/bytes */
1574 pat_end = pat_string + ln;
1575 lnc = (UTF_PATTERN) /* length to match in characters */
1576 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1577 : ln;
1578
e067297c
KW
1579 /* We have 'lnc' characters to match in the pattern, but because of
1580 * multi-character folding, each character in the target can match
1581 * up to 3 characters (Unicode guarantees it will never exceed
1582 * this) if it is utf8-encoded; and up to 2 if not (based on the
1583 * fact that the Latin 1 folds are already determined, and the
1584 * only multi-char fold in that range is the sharp-s folding to
1585 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1586 * string character. Adjust lnc accordingly, rounding up, so that
1587 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1588 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1589 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1590
1591 /* As in the non-UTF8 case, if we have to match 3 characters, and
1592 * only 2 are left, it's guaranteed to fail, so don't start a
1593 * match that would require us to go beyond the end of the string
1594 */
1595 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1596
1597 if (!reginfo && e < s) {
1598 e = s; /* Due to minlen logic of intuit() */
1599 }
1600
b33105db
KW
1601 /* XXX Note that we could recalculate e to stop the loop earlier,
1602 * as the worst case expansion above will rarely be met, and as we
1603 * go along we would usually find that e moves further to the left.
1604 * This would happen only after we reached the point in the loop
1605 * where if there were no expansion we should fail. Unclear if
1606 * worth the expense */
e067297c 1607
fac1af77
KW
1608 while (s <= e) {
1609 char *my_strend= (char *)strend;
1610 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1611 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1612 && (!reginfo || regtry(reginfo, &s)) )
1613 {
1614 goto got_it;
1615 }
bbdd8bad 1616 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1617 }
1618 break;
e067297c 1619 }
bbce6d69 1620 case BOUNDL:
3280af22 1621 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1622 FBC_BOUND(isALNUM_LC,
1623 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1624 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1625 break;
bbce6d69 1626 case NBOUNDL:
3280af22 1627 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1628 FBC_NBOUND(isALNUM_LC,
1629 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1630 isALNUM_LC_utf8((U8*)s));
1631 break;
1632 case BOUND:
1633 FBC_BOUND(isWORDCHAR,
1634 isALNUM_uni(tmp),
1635 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1636 break;
cfaf538b
KW
1637 case BOUNDA:
1638 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1639 isWORDCHAR_A(tmp),
1640 isWORDCHAR_A((U8*)s));
1641 break;
a0d0e21e 1642 case NBOUND:
63ac0dad
KW
1643 FBC_NBOUND(isWORDCHAR,
1644 isALNUM_uni(tmp),
1645 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1646 break;
cfaf538b
KW
1647 case NBOUNDA:
1648 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1649 isWORDCHAR_A(tmp),
1650 isWORDCHAR_A((U8*)s));
1651 break;
63ac0dad
KW
1652 case BOUNDU:
1653 FBC_BOUND(isWORDCHAR_L1,
1654 isALNUM_uni(tmp),
1655 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1656 break;
1657 case NBOUNDU:
1658 FBC_NBOUND(isWORDCHAR_L1,
1659 isALNUM_uni(tmp),
1660 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1661 break;
bbce6d69 1662 case ALNUML:
4cadc6a9
YO
1663 REXEC_FBC_CSCAN_TAINT(
1664 isALNUM_LC_utf8((U8*)s),
1665 isALNUM_LC(*s)
1666 );
6895a8aa 1667 break;
980866de
KW
1668 case ALNUMU:
1669 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1670 LOAD_UTF8_CHARCLASS_ALNUM(),
1671 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1672 isWORDCHAR_L1((U8) *s)
1673 );
6895a8aa 1674 break;
980866de
KW
1675 case ALNUM:
1676 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1677 LOAD_UTF8_CHARCLASS_ALNUM(),
1678 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1679 isWORDCHAR((U8) *s)
1680 );
6895a8aa 1681 break;
cfaf538b 1682 case ALNUMA:
8e9da4d4
KW
1683 /* Don't need to worry about utf8, as it can match only a single
1684 * byte invariant character */
cfaf538b 1685 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1686 break;
980866de
KW
1687 case NALNUMU:
1688 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1689 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1690 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1691 ! isWORDCHAR_L1((U8) *s)
1692 );
6895a8aa 1693 break;
a0d0e21e 1694 case NALNUM:
4cadc6a9 1695 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1696 LOAD_UTF8_CHARCLASS_ALNUM(),
1697 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1698 ! isALNUM(*s)
4cadc6a9 1699 );
6895a8aa 1700 break;
cfaf538b 1701 case NALNUMA:
8e9da4d4
KW
1702 REXEC_FBC_CSCAN(
1703 !isWORDCHAR_A(*s),
1704 !isWORDCHAR_A(*s)
1705 );
1706 break;
bbce6d69 1707 case NALNUML:
4cadc6a9
YO
1708 REXEC_FBC_CSCAN_TAINT(
1709 !isALNUM_LC_utf8((U8*)s),
1710 !isALNUM_LC(*s)
1711 );
6895a8aa 1712 break;
980866de 1713 case SPACEU:
bedac28b
KW
1714 REXEC_FBC_CSCAN(
1715 is_XPERLSPACE_utf8(s),
980866de
KW
1716 isSPACE_L1((U8) *s)
1717 );
6895a8aa 1718 break;
a0d0e21e 1719 case SPACE:
bedac28b
KW
1720 REXEC_FBC_CSCAN(
1721 is_XPERLSPACE_utf8(s),
980866de 1722 isSPACE((U8) *s)
4cadc6a9 1723 );
6895a8aa 1724 break;
cfaf538b 1725 case SPACEA:
8e9da4d4
KW
1726 /* Don't need to worry about utf8, as it can match only a single
1727 * byte invariant character */
cfaf538b 1728 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1729 break;
bbce6d69 1730 case SPACEL:
4cadc6a9 1731 REXEC_FBC_CSCAN_TAINT(
6bbba904 1732 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1733 isSPACE_LC(*s)
1734 );
6895a8aa 1735 break;
980866de 1736 case NSPACEU:
bedac28b
KW
1737 REXEC_FBC_CSCAN(
1738 ! is_XPERLSPACE_utf8(s),
980866de
KW
1739 ! isSPACE_L1((U8) *s)
1740 );
6895a8aa 1741 break;
a0d0e21e 1742 case NSPACE:
bedac28b
KW
1743 REXEC_FBC_CSCAN(
1744 ! is_XPERLSPACE_utf8(s),
980866de 1745 ! isSPACE((U8) *s)
4cadc6a9 1746 );
6895a8aa 1747 break;
cfaf538b 1748 case NSPACEA:
8e9da4d4
KW
1749 REXEC_FBC_CSCAN(
1750 !isSPACE_A(*s),
1751 !isSPACE_A(*s)
1752 );
1753 break;
bbce6d69 1754 case NSPACEL:
4cadc6a9 1755 REXEC_FBC_CSCAN_TAINT(
6bbba904 1756 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1757 !isSPACE_LC(*s)
1758 );
6895a8aa 1759 break;
a0d0e21e 1760 case DIGIT:
4cadc6a9 1761 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1762 LOAD_UTF8_CHARCLASS_DIGIT(),
1763 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1764 isDIGIT(*s)
1765 );
6895a8aa 1766 break;
cfaf538b 1767 case DIGITA:
8e9da4d4
KW
1768 /* Don't need to worry about utf8, as it can match only a single
1769 * byte invariant character */
cfaf538b 1770 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1771 break;
b8c5462f 1772 case DIGITL:
4cadc6a9
YO
1773 REXEC_FBC_CSCAN_TAINT(
1774 isDIGIT_LC_utf8((U8*)s),
1775 isDIGIT_LC(*s)
1776 );
6895a8aa 1777 break;
a0d0e21e 1778 case NDIGIT:
4cadc6a9 1779 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1780 LOAD_UTF8_CHARCLASS_DIGIT(),
1781 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1782 !isDIGIT(*s)
1783 );
6895a8aa 1784 break;
cfaf538b 1785 case NDIGITA:
8e9da4d4
KW
1786 REXEC_FBC_CSCAN(
1787 !isDIGIT_A(*s),
1788 !isDIGIT_A(*s)
1789 );
1790 break;
b8c5462f 1791 case NDIGITL:
4cadc6a9
YO
1792 REXEC_FBC_CSCAN_TAINT(
1793 !isDIGIT_LC_utf8((U8*)s),
1794 !isDIGIT_LC(*s)
1795 );
6895a8aa 1796 break;
e1d1eefb
YO
1797 case LNBREAK:
1798 REXEC_FBC_CSCAN(
7016d6eb
DM
1799 is_LNBREAK_utf8_safe(s, strend),
1800 is_LNBREAK_latin1_safe(s, strend)
e1d1eefb 1801 );
6895a8aa 1802 break;
e1d1eefb
YO
1803 case VERTWS:
1804 REXEC_FBC_CSCAN(
7016d6eb
DM
1805 is_VERTWS_utf8_safe(s, strend),
1806 is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1807 );
6895a8aa 1808 break;
e1d1eefb
YO
1809 case NVERTWS:
1810 REXEC_FBC_CSCAN(
7016d6eb
DM
1811 !is_VERTWS_utf8_safe(s, strend),
1812 !is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1813 );
6895a8aa 1814 break;
e1d1eefb
YO
1815 case HORIZWS:
1816 REXEC_FBC_CSCAN(
7016d6eb
DM
1817 is_HORIZWS_utf8_safe(s, strend),
1818 is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1819 );
6895a8aa 1820 break;
e1d1eefb
YO
1821 case NHORIZWS:
1822 REXEC_FBC_CSCAN(
7016d6eb
DM
1823 !is_HORIZWS_utf8_safe(s, strend),
1824 !is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1825 );
6895a8aa 1826 break;
0658cdde
KW
1827 case POSIXA:
1828 /* Don't need to worry about utf8, as it can match only a single
1829 * byte invariant character. The flag in this node type is the
1830 * class number to pass to _generic_isCC() to build a mask for
1831 * searching in PL_charclass[] */
1832 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1833 break;
1834 case NPOSIXA:
1835 REXEC_FBC_CSCAN(
1836 !_generic_isCC_A(*s, FLAGS(c)),
1837 !_generic_isCC_A(*s, FLAGS(c))
1838 );
1839 break;
1840
1de06328
YO
1841 case AHOCORASICKC:
1842 case AHOCORASICK:
07be1b83 1843 {
a0a388a1 1844 DECL_TRIE_TYPE(c);
07be1b83
YO
1845 /* what trie are we using right now */
1846 reg_ac_data *aho
f8fc2ecf 1847 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1848 reg_trie_data *trie
1849 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1850 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1851
1852 const char *last_start = strend - trie->minlen;
6148ee25 1853#ifdef DEBUGGING
07be1b83 1854 const char *real_start = s;
6148ee25 1855#endif
07be1b83 1856 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1857 SV *sv_points;
1858 U8 **points; /* map of where we were in the input string
786e8c11 1859 when reading a given char. For ASCII this
be8e71aa 1860 is unnecessary overhead as the relationship
38a44b82
NC
1861 is always 1:1, but for Unicode, especially
1862 case folded Unicode this is not true. */
f9e705e8 1863 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1864 U8 *bitmap=NULL;
1865
07be1b83
YO
1866
1867 GET_RE_DEBUG_FLAGS_DECL;
1868
be8e71aa
YO
1869 /* We can't just allocate points here. We need to wrap it in
1870 * an SV so it gets freed properly if there is a croak while
1871 * running the match */
1872 ENTER;
1873 SAVETMPS;
1874 sv_points=newSV(maxlen * sizeof(U8 *));
1875 SvCUR_set(sv_points,
1876 maxlen * sizeof(U8 *));
1877 SvPOK_on(sv_points);
1878 sv_2mortal(sv_points);
1879 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1880 if ( trie_type != trie_utf8_fold
1881 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1882 {
786e8c11
YO
1883 if (trie->bitmap)
1884 bitmap=(U8*)trie->bitmap;
1885 else
1886 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1887 }
786e8c11
YO
1888 /* this is the Aho-Corasick algorithm modified a touch
1889 to include special handling for long "unknown char"
1890 sequences. The basic idea being that we use AC as long
1891 as we are dealing with a possible matching char, when
1892 we encounter an unknown char (and we have not encountered
1893 an accepting state) we scan forward until we find a legal
1894 starting char.
1895 AC matching is basically that of trie matching, except
1896 that when we encounter a failing transition, we fall back
1897 to the current states "fail state", and try the current char
1898 again, a process we repeat until we reach the root state,
1899 state 1, or a legal transition. If we fail on the root state
1900 then we can either terminate if we have reached an accepting
1901 state previously, or restart the entire process from the beginning
1902 if we have not.
1903
1904 */
07be1b83
YO
1905 while (s <= last_start) {
1906 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1907 U8 *uc = (U8*)s;
1908 U16 charid = 0;
1909 U32 base = 1;
1910 U32 state = 1;
1911 UV uvc = 0;
1912 STRLEN len = 0;
1913 STRLEN foldlen = 0;
1914 U8 *uscan = (U8*)NULL;
1915 U8 *leftmost = NULL;
786e8c11
YO
1916#ifdef DEBUGGING
1917 U32 accepted_word= 0;
1918#endif
07be1b83
YO
1919 U32 pointpos = 0;
1920
1921 while ( state && uc <= (U8*)strend ) {
1922 int failed=0;
786e8c11
YO
1923 U32 word = aho->states[ state ].wordnum;
1924
1de06328
YO
1925 if( state==1 ) {
1926 if ( bitmap ) {
1927 DEBUG_TRIE_EXECUTE_r(
1928 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1929 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1930 (char *)uc, utf8_target );
1de06328
YO
1931 PerlIO_printf( Perl_debug_log,
1932 " Scanning for legal start char...\n");
1933 }
d085b490
YO
1934 );
1935 if (utf8_target) {
1936 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1937 uc += UTF8SKIP(uc);
1938 }
1939 } else {
1940 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1941 uc++;
1942 }
1943 }
1de06328 1944 s= (char *)uc;
786e8c11 1945 }
786e8c11
YO
1946 if (uc >(U8*)last_start) break;
1947 }
1948
1949 if ( word ) {
2e64971a 1950 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1951 if (!leftmost || lpos < leftmost) {
1952 DEBUG_r(accepted_word=word);
07be1b83 1953 leftmost= lpos;
786e8c11 1954 }
07be1b83 1955 if (base==0) break;
786e8c11 1956
07be1b83
YO
1957 }
1958 points[pointpos++ % maxlen]= uc;
7016d6eb
DM
1959 if (foldlen || uc < (U8*)strend) {
1960 REXEC_TRIE_READ_CHAR(trie_type, trie,
1961 widecharmap, uc,
55eed653
NC
1962 uscan, len, uvc, charid, foldlen,
1963 foldbuf, uniflags);
7016d6eb
DM
1964 DEBUG_TRIE_EXECUTE_r({
1965 dump_exec_pos( (char *)uc, c, strend,
1966 real_start, s, utf8_target);
1967 PerlIO_printf(Perl_debug_log,
1968 " Charid:%3u CP:%4"UVxf" ",
1969 charid, uvc);
1970 });
1971 }
1972 else {
1973 len = 0;
1974 charid = 0;
1975 }
1976
07be1b83
YO
1977
1978 do {
6148ee25 1979#ifdef DEBUGGING
786e8c11 1980 word = aho->states[ state ].wordnum;
6148ee25 1981#endif
07be1b83
YO
1982 base = aho->states[ state ].trans.base;
1983
786e8c11
YO
1984 DEBUG_TRIE_EXECUTE_r({
1985 if (failed)
1986 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1987 s, utf8_target );
07be1b83 1988 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1989 "%sState: %4"UVxf", word=%"UVxf,
1990 failed ? " Fail transition to " : "",
1991 (UV)state, (UV)word);
1992 });
07be1b83
YO
1993 if ( base ) {
1994 U32 tmp;
6dd2be57 1995 I32 offset;
07be1b83 1996 if (charid &&
6dd2be57
DM
1997 ( ((offset = base + charid
1998 - 1 - trie->uniquecharcount)) >= 0)
1999 && ((U32)offset < trie->lasttrans)
2000 && trie->trans[offset].check == state
2001 && (tmp=trie->trans[offset].next))
07be1b83 2002 {
786e8c11
YO
2003 DEBUG_TRIE_EXECUTE_r(
2004 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
2005 state = tmp;
2006 break;
2007 }
2008 else {
786e8c11
YO
2009 DEBUG_TRIE_EXECUTE_r(
2010 PerlIO_printf( Perl_debug_log," - fail\n"));
2011 failed = 1;
2012 state = aho->fail[state];
07be1b83
YO
2013 }
2014 }
2015 else {
2016 /* we must be accepting here */
786e8c11
YO
2017 DEBUG_TRIE_EXECUTE_r(
2018 PerlIO_printf( Perl_debug_log," - accepting\n"));
2019 failed = 1;
07be1b83
YO
2020 break;
2021 }
2022 } while(state);
786e8c11 2023 uc += len;
07be1b83
YO
2024 if (failed) {
2025 if (leftmost)
2026 break;
786e8c11 2027 if (!state) state = 1;
07be1b83
YO
2028 }
2029 }
2030 if ( aho->states[ state ].wordnum ) {
2e64971a 2031 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2032 if (!leftmost || lpos < leftmost) {
2033 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2034 leftmost = lpos;
786e8c11 2035 }
07be1b83 2036 }
07be1b83
YO
2037 if (leftmost) {
2038 s = (char*)leftmost;
786e8c11
YO
2039 DEBUG_TRIE_EXECUTE_r({
2040 PerlIO_printf(
70685ca0
JH
2041 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2042 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2043 );
2044 });
24b23f37 2045 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2046 FREETMPS;
2047 LEAVE;
07be1b83 2048 goto got_it;
be8e71aa 2049 }
07be1b83 2050 s = HOPc(s,1);
786e8c11
YO
2051 DEBUG_TRIE_EXECUTE_r({
2052 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2053 });
07be1b83 2054 } else {
786e8c11
YO
2055 DEBUG_TRIE_EXECUTE_r(
2056 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2057 break;
2058 }
2059 }
be8e71aa
YO
2060 FREETMPS;
2061 LEAVE;
07be1b83
YO
2062 }
2063 break;
b3c9acc1 2064 default:
3c3eec57
GS
2065 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2066 break;
d6a28714 2067 }
6eb5f6b9
JH
2068 return 0;
2069 got_it:
2070 return s;
2071}
2072
fae667d5 2073
6eb5f6b9
JH
2074/*
2075 - regexec_flags - match a regexp against a string
2076 */
2077I32
5aaab254 2078Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2079 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2080/* stringarg: the point in the string at which to begin matching */
2081/* strend: pointer to null at end of string */
2082/* strbeg: real beginning of string */
2083/* minend: end of match must be >= minend bytes after stringarg. */
2084/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2085 * itself is accessed via the pointers above */
2086/* data: May be used for some additional optimizations.
2087 Currently its only used, with a U32 cast, for transmitting
2088 the ganch offset when doing a /g match. This will change */
2089/* nosave: For optimizations. */
2090
6eb5f6b9 2091{
97aff369 2092 dVAR;
8d919b0a 2093 struct regexp *const prog = ReANY(rx);
5aaab254 2094 char *s;
eb578fdb 2095 regnode *c;
5aaab254 2096 char *startpos = stringarg;
6eb5f6b9
JH
2097 I32 minlen; /* must match at least this many chars */
2098 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2099 I32 end_shift = 0; /* Same for the end. */ /* CC */
2100 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2101 char *scream_olds = NULL;
f2ed9b32 2102 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2103 I32 multiline;
f8fc2ecf 2104 RXi_GET_DECL(prog,progi);
3b0527fe 2105 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2106 regexp_paren_pair *swap = NULL;
a3621e74
YO
2107 GET_RE_DEBUG_FLAGS_DECL;
2108
7918f24d 2109 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2110 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2111
2112 /* Be paranoid... */
2113 if (prog == NULL || startpos == NULL) {
2114 Perl_croak(aTHX_ "NULL regexp parameter");
2115 return 0;
2116 }
2117
bbe252da 2118 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2119 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2120
f2ed9b32 2121 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2122 DEBUG_EXECUTE_r(
f2ed9b32 2123 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2124 "Matching");
2125 );
bac06658 2126
6eb5f6b9 2127 minlen = prog->minlen;
1de06328
YO
2128
2129 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2130 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2131 "String too short [regexec_flags]...\n"));
2132 goto phooey;
1aa99e6b 2133 }
6eb5f6b9 2134
1de06328 2135
6eb5f6b9 2136 /* Check validity of program. */
f8fc2ecf 2137 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2138 Perl_croak(aTHX_ "corrupted regexp program");
2139 }
2140
2141 PL_reg_flags = 0;
ed301438 2142 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2143 PL_reg_maxiter = 0;
2144
3c8556c3 2145 if (RX_UTF8(rx))
6eb5f6b9
JH
2146 PL_reg_flags |= RF_utf8;
2147
2148 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2149 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2150 PL_bostr = strbeg;
3b0527fe 2151 reginfo.sv = sv;
6eb5f6b9
JH
2152
2153 /* Mark end of line for $ (and such) */
2154 PL_regeol = strend;
2155
2156 /* see how far we have to get to not match where we matched before */
3b0527fe 2157 reginfo.till = startpos+minend;
6eb5f6b9 2158
6eb5f6b9
JH
2159 /* If there is a "must appear" string, look for it. */
2160 s = startpos;
2161
bbe252da 2162 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2163 MAGIC *mg;
2c296965 2164 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2165 reginfo.ganch = startpos + prog->gofs;
2c296965 2166 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2167 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2168 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2169 && SvMAGIC(sv)
14befaf4
DM
2170 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2171 && mg->mg_len >= 0) {
3b0527fe 2172 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2173 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2174 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2175
bbe252da 2176 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2177 if (s > reginfo.ganch)
6eb5f6b9 2178 goto phooey;
58e23c8d 2179 s = reginfo.ganch - prog->gofs;
2c296965 2180 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2181 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2182 if (s < strbeg)
2183 goto phooey;
6eb5f6b9
JH
2184 }
2185 }
58e23c8d 2186 else if (data) {
70685ca0 2187 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2188 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2189 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2190
2191 } else { /* pos() not defined */
3b0527fe 2192 reginfo.ganch = strbeg;
2c296965
YO
2193 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194 "GPOS: reginfo.ganch = strbeg\n"));
2195 }
6eb5f6b9 2196 }
288b8c02 2197 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2198 /* We have to be careful. If the previous successful match
2199 was from this regex we don't want a subsequent partially
2200 successful match to clobber the old results.
2201 So when we detect this possibility we add a swap buffer
2202 to the re, and switch the buffer each match. If we fail
2203 we switch it back, otherwise we leave it swapped.
2204 */
2205 swap = prog->offs;
2206 /* do we need a save destructor here for eval dies? */
2207 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2208 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2209 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2210 PTR2UV(prog),
2211 PTR2UV(swap),
2212 PTR2UV(prog->offs)
2213 ));
c74340f9 2214 }
a0714e2c 2215 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2216 re_scream_pos_data d;
2217
2218 d.scream_olds = &scream_olds;
2219 d.scream_pos = &scream_pos;
288b8c02 2220 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2221 if (!s) {
a3621e74 2222 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2223 goto phooey; /* not present */
3fa9c3d7 2224 }
6eb5f6b9
JH
2225 }
2226
1de06328 2227
6eb5f6b9
JH
2228
2229 /* Simplest case: anchored match need be tried only once. */
2230 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2231 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2232 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2233 goto got_it;
bbe252da
YO
2234 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2235 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2236 {
2237 char *end;
2238
2239 if (minlen)
2240 dontbother = minlen - 1;
1aa99e6b 2241 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2242 /* for multiline we only have to try after newlines */
33b8afdf 2243 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2244 /* because of the goto we can not easily reuse the macros for bifurcating the
2245 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2246 if (utf8_target) {
2247 if (s == startpos)
2248 goto after_try_utf8;
2249 while (1) {
2250 if (regtry(&reginfo, &s)) {
2251 goto got_it;
2252 }
2253 after_try_utf8:
2254 if (s > end) {
2255 goto phooey;
2256 }
2257 if (prog->extflags & RXf_USE_INTUIT) {
2258 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2259 if (!s) {
2260 goto phooey;
2261 }
2262 }
2263 else {
2264 s += UTF8SKIP(s);
2265 }
2266 }
2267 } /* end search for check string in unicode */
2268 else {
2269 if (s == startpos) {
2270 goto after_try_latin;
2271 }
2272 while (1) {
2273 if (regtry(&reginfo, &s)) {
2274 goto got_it;
2275 }
2276 after_try_latin:
2277 if (s > end) {
2278 goto phooey;
2279 }
2280 if (prog->extflags & RXf_USE_INTUIT) {
2281 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2282 if (!s) {
2283 goto phooey;
2284 }
2285 }
2286 else {
2287 s++;
2288 }
2289 }
2290 } /* end search for check string in latin*/
2291 } /* end search for check string */
2292 else { /* search for newline */
2293 if (s > startpos) {
2294 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2295 s--;
92f3d482 2296 }
21eede78
YO
2297 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2298 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2299 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2300 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2301 goto got_it;
2302 }
92f3d482
YO
2303 }
2304 } /* end search for newline */
2305 } /* end anchored/multiline check string search */
6eb5f6b9 2306 goto phooey;
bbe252da 2307 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2308 {
486ec47a 2309 /* the warning about reginfo.ganch being used without initialization
bbe252da 2310 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2311 and we only enter this block when the same bit is set. */
58e23c8d 2312 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2313
2314 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2315 goto got_it;
2316 goto phooey;
2317 }
2318
2319 /* Messy cases: unanchored match. */
bbe252da 2320 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2321 /* we have /x+whatever/ */
f2ed9b32 2322 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2323 char ch;
bf93d4cc
GS
2324#ifdef DEBUGGING
2325 int did_match = 0;
2326#endif
f2ed9b32 2327 if (utf8_target) {
7e0d5ad7
KW
2328 if (! prog->anchored_utf8) {
2329 to_utf8_substr(prog);
2330 }
2331 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2332 REXEC_FBC_SCAN(
6eb5f6b9 2333 if (*s == ch) {
a3621e74 2334 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2335 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2336 s += UTF8SKIP(s);
2337 while (s < strend && *s == ch)
2338 s += UTF8SKIP(s);
2339 }
4cadc6a9 2340 );
7e0d5ad7 2341
6eb5f6b9
JH
2342 }
2343 else {
7e0d5ad7
KW
2344 if (! prog->anchored_substr) {
2345 if (! to_byte_substr(prog)) {
6b54ddc5 2346 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2347 }
2348 }
2349 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2350 REXEC_FBC_SCAN(
6eb5f6b9 2351 if (*s == ch) {
a3621e74 2352 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2353 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2354 s++;
2355 while (s < strend && *s == ch)
2356 s++;
2357 }
4cadc6a9 2358 );
6eb5f6b9 2359 }
a3621e74 2360 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2361 PerlIO_printf(Perl_debug_log,
b7953727
JH
2362 "Did not find anchored character...\n")
2363 );
6eb5f6b9 2364 }
a0714e2c
SS
2365 else if (prog->anchored_substr != NULL
2366 || prog->anchored_utf8 != NULL
2367 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2368 && prog->float_max_offset < strend - s)) {
2369 SV *must;
2370 I32 back_max;
2371 I32 back_min;
2372 char *last;
6eb5f6b9 2373 char *last1; /* Last position checked before */
bf93d4cc
GS
2374#ifdef DEBUGGING
2375 int did_match = 0;
2376#endif
33b8afdf 2377 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2378 if (utf8_target) {
2379 if (! prog->anchored_utf8) {
2380 to_utf8_substr(prog);
2381 }
2382 must = prog->anchored_utf8;
2383 }
2384 else {
2385 if (! prog->anchored_substr) {
2386 if (! to_byte_substr(prog)) {
6b54ddc5 2387 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2388 }
2389 }
2390 must = prog->anchored_substr;
2391 }
33b8afdf
JH
2392 back_max = back_min = prog->anchored_offset;
2393 } else {
7e0d5ad7
KW
2394 if (utf8_target) {
2395 if (! prog->float_utf8) {
2396 to_utf8_substr(prog);
2397 }
2398 must = prog->float_utf8;
2399 }
2400 else {
2401 if (! prog->float_substr) {
2402 if (! to_byte_substr(prog)) {
6b54ddc5 2403 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2404 }
2405 }
2406 must = prog->float_substr;
2407 }
33b8afdf
JH
2408 back_max = prog->float_max_offset;
2409 back_min = prog->float_min_offset;
2410 }
1de06328 2411
1de06328
YO
2412 if (back_min<0) {
2413 last = strend;
2414 } else {
2415 last = HOP3c(strend, /* Cannot start after this */
2416 -(I32)(CHR_SVLEN(must)
2417 - (SvTAIL(must) != 0) + back_min), strbeg);
2418 }
6eb5f6b9
JH
2419 if (s > PL_bostr)
2420 last1 = HOPc(s, -1);
2421 else
2422 last1 = s - 1; /* bogus */
2423
a0288114 2424 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2425 check_substr==must. */
2426 scream_pos = -1;
2427 dontbother = end_shift;
2428 strend = HOPc(strend, -dontbother);
2429 while ( (s <= last) &&
c33e64f0 2430 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2431 (unsigned char*)strend, must,
c33e64f0 2432 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2433 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2434 if (HOPc(s, -back_max) > last1) {
2435 last1 = HOPc(s, -back_min);
2436 s = HOPc(s, -back_max);
2437 }
2438 else {
52657f30 2439 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2440
2441 last1 = HOPc(s, -back_min);
52657f30 2442 s = t;
6eb5f6b9 2443 }
f2ed9b32 2444 if (utf8_target) {
6eb5f6b9 2445 while (s <= last1) {
24b23f37 2446 if (regtry(&reginfo, &s))
6eb5f6b9 2447 goto got_it;
7016d6eb
DM
2448 if (s >= last1) {
2449 s++; /* to break out of outer loop */
2450 break;
2451 }
2452 s += UTF8SKIP(s);
6eb5f6b9
JH
2453 }
2454 }
2455 else {
2456 while (s <= last1) {
24b23f37 2457 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2458 goto got_it;
2459 s++;
2460 }
2461 }
2462 }
ab3bbdeb 2463 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2464 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2465 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2466 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2467 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2468 ? "anchored" : "floating"),
ab3bbdeb
YO
2469 quoted, RE_SV_TAIL(must));
2470 });
6eb5f6b9
JH
2471 goto phooey;
2472 }
f8fc2ecf 2473 else if ( (c = progi->regstclass) ) {
f14c76ed 2474 if (minlen) {
f8fc2ecf 2475 const OPCODE op = OP(progi->regstclass);
66e933ab 2476 /* don't bother with what can't match */
786e8c11 2477 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2478 strend = HOPc(strend, -(minlen - 1));
2479 }
a3621e74 2480 DEBUG_EXECUTE_r({
be8e71aa 2481 SV * const prop = sv_newmortal();
32fc9b6a 2482 regprop(prog, prop, c);
0df25f3d 2483 {
f2ed9b32 2484 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2485 s,strend-s,60);
0df25f3d 2486 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2487 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2488 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2489 quoted, (int)(strend - s));
0df25f3d 2490 }
ffc61ed2 2491 });
3b0527fe 2492 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2493 goto got_it;
07be1b83 2494 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2495 }
2496 else {
2497 dontbother = 0;
a0714e2c 2498 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2499 /* Trim the end. */
6af40bd7 2500 char *last= NULL;
33b8afdf 2501 SV* float_real;
c33e64f0
FC
2502 STRLEN len;
2503 const char *little;
33b8afdf 2504
7e0d5ad7
KW
2505 if (utf8_target) {
2506 if (! prog->float_utf8) {
2507 to_utf8_substr(prog);
2508 }
2509 float_real = prog->float_utf8;
2510 }
2511 else {
2512 if (! prog->float_substr) {
2513 if (! to_byte_substr(prog)) {
6b54ddc5 2514 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2515 }
2516 }
2517 float_real = prog->float_substr;
2518 }
d6a28714 2519
c33e64f0
FC
2520 little = SvPV_const(float_real, len);
2521 if (SvTAIL(float_real)) {
7f18ad16
KW
2522 /* This means that float_real contains an artificial \n on
2523 * the end due to the presence of something like this:
2524 * /foo$/ where we can match both "foo" and "foo\n" at the
2525 * end of the string. So we have to compare the end of the
2526 * string first against the float_real without the \n and
2527 * then against the full float_real with the string. We
2528 * have to watch out for cases where the string might be
2529 * smaller than the float_real or the float_real without
2530 * the \n. */
1a13b075
YO
2531 char *checkpos= strend - len;
2532 DEBUG_OPTIMISE_r(
2533 PerlIO_printf(Perl_debug_log,
2534 "%sChecking for float_real.%s\n",
2535 PL_colors[4], PL_colors[5]));
2536 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2537 /* can't match, even if we remove the trailing \n
2538 * string is too short to match */
1a13b075
YO
2539 DEBUG_EXECUTE_r(
2540 PerlIO_printf(Perl_debug_log,
2541 "%sString shorter than required trailing substring, cannot match.%s\n",
2542 PL_colors[4], PL_colors[5]));
2543 goto phooey;
2544 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2545 /* can match, the end of the string matches without the
2546 * "\n" */
1a13b075
YO
2547 last = checkpos + 1;
2548 } else if (checkpos < strbeg) {
7f18ad16
KW
2549 /* cant match, string is too short when the "\n" is
2550 * included */
1a13b075
YO
2551 DEBUG_EXECUTE_r(
2552 PerlIO_printf(Perl_debug_log,
2553 "%sString does not contain required trailing substring, cannot match.%s\n",
2554 PL_colors[4], PL_colors[5]));
2555 goto phooey;
2556 } else if (!multiline) {
7f18ad16
KW
2557 /* non multiline match, so compare with the "\n" at the
2558 * end of the string */
1a13b075
YO
2559 if (memEQ(checkpos, little, len)) {
2560 last= checkpos;
2561 } else {
2562 DEBUG_EXECUTE_r(
2563 PerlIO_printf(Perl_debug_log,
2564 "%sString does not contain required trailing substring, cannot match.%s\n",
2565 PL_colors[4], PL_colors[5]));
2566 goto phooey;
2567 }
2568 } else {
7f18ad16
KW
2569 /* multiline match, so we have to search for a place
2570 * where the full string is located */
d6a28714 2571 goto find_last;
1a13b075 2572 }
c33e64f0 2573 } else {
d6a28714 2574 find_last:
9041c2e3 2575 if (len)
d6a28714 2576 last = rninstr(s, strend, little, little + len);
b8c5462f 2577 else
a0288114 2578 last = strend; /* matching "$" */
b8c5462f 2579 }
6af40bd7 2580 if (!last) {
7f18ad16
KW
2581 /* at one point this block contained a comment which was
2582 * probably incorrect, which said that this was a "should not
2583 * happen" case. Even if it was true when it was written I am
2584 * pretty sure it is not anymore, so I have removed the comment
2585 * and replaced it with this one. Yves */
6bda09f9
YO
2586 DEBUG_EXECUTE_r(
2587 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2588 "String does not contain required substring, cannot match.\n"
2589 ));
2590 goto phooey;
bf93d4cc 2591 }
d6a28714
JH
2592 dontbother = strend - last + prog->float_min_offset;
2593 }
2594 if (minlen && (dontbother < minlen))
2595 dontbother = minlen - 1;
2596 strend -= dontbother; /* this one's always in bytes! */
2597 /* We don't know much -- general case. */
f2ed9b32 2598 if (utf8_target) {
d6a28714 2599 for (;;) {
24b23f37 2600 if (regtry(&reginfo, &s))
d6a28714
JH
2601 goto got_it;
2602 if (s >= strend)
2603 break;
b8c5462f 2604 s += UTF8SKIP(s);
d6a28714
JH
2605 };
2606 }
2607 else {
2608 do {
24b23f37 2609 if (regtry(&reginfo, &s))
d6a28714
JH
2610 goto got_it;
2611 } while (s++ < strend);
2612 }
2613 }
2614
2615 /* Failure. */
2616 goto phooey;
2617
2618got_it:
495f47a5
DM
2619 DEBUG_BUFFERS_r(
2620 if (swap)
2621 PerlIO_printf(Perl_debug_log,
2622 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2623 PTR2UV(prog),
2624 PTR2UV(swap)
2625 );
2626 );
e9105d30 2627 Safefree(swap);
288b8c02 2628 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2629
ed301438 2630 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2631 restore_pos(aTHX_ prog);
5daac39c
NC
2632 if (RXp_PAREN_NAMES(prog))
2633 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2634
2635 /* make sure $`, $&, $', and $digit will work later */
2636 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2637 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2638#ifdef PERL_ANY_COW
2639 if (SvCANCOW(sv)) {
ed252734
NC
2640 if (DEBUG_C_TEST) {
2641 PerlIO_printf(Perl_debug_log,
2642 "Copy on write: regexp capture, type %d\n",
2643 (int) SvTYPE(sv));
2644 }
77f8f7c1 2645 RX_MATCH_COPY_FREE(rx);
ed252734 2646 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2647 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2648 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2649 prog->sublen = PL_regeol - strbeg;
2650 prog->suboffset = 0;
2651 prog->subcoffset = 0;
ed252734
NC
2652 } else
2653#endif
2654 {
6502e081
DM
2655 I32 min = 0;
2656 I32 max = PL_regeol - strbeg;
2657 I32 sublen;
2658
2659 if ( (flags & REXEC_COPY_SKIP_POST)
2660 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2661 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2662 ) { /* don't copy $' part of string */
3de645a8 2663 U32 n = 0;
6502e081
DM
2664 max = -1;
2665 /* calculate the right-most part of the string covered
2666 * by a capture. Due to look-ahead, this may be to
2667 * the right of $&, so we have to scan all captures */
2668 while (n <= prog->lastparen) {
2669 if (prog->offs[n].end > max)
2670 max = prog->offs[n].end;
2671 n++;
2672 }
2673 if (max == -1)
2674 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2675 ? prog->offs[0].start
2676 : 0;
2677 assert(max >= 0 && max <= PL_regeol - strbeg);
2678 }
2679
2680 if ( (flags & REXEC_COPY_SKIP_PRE)
2681 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2682 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2683 ) { /* don't copy $` part of string */
3de645a8 2684 U32 n = 0;
6502e081
DM
2685 min = max;
2686 /* calculate the left-most part of the string covered
2687 * by a capture. Due to look-behind, this may be to
2688 * the left of $&, so we have to scan all captures */
2689 while (min && n <= prog->lastparen) {
2690 if ( prog->offs[n].start != -1
2691 && prog->offs[n].start < min)
2692 {
2693 min = prog->offs[n].start;
2694 }
2695 n++;
2696 }
2697 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2698 && min > prog->offs[0].end
2699 )
2700 min = prog->offs[0].end;
2701
2702 }
2703
2704 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2705 sublen = max - min;
2706
2707 if (RX_MATCH_COPIED(rx)) {
2708 if (sublen > prog->sublen)
2709 prog->subbeg =
2710 (char*)saferealloc(prog->subbeg, sublen+1);
2711 }
2712 else
2713 prog->subbeg = (char*)safemalloc(sublen+1);
2714 Copy(strbeg + min, prog->subbeg, sublen, char);
2715 prog->subbeg[sublen] = '\0';
2716 prog->suboffset = min;
2717 prog->sublen = sublen;
77f8f7c1 2718 RX_MATCH_COPIED_on(rx);
6502e081 2719 }
6502e081
DM
2720 prog->subcoffset = prog->suboffset;
2721 if (prog->suboffset && utf8_target) {
2722 /* Convert byte offset to chars.
2723 * XXX ideally should only compute this if @-/@+
2724 * has been seen, a la PL_sawampersand ??? */
2725
2726 /* If there's a direct correspondence between the
2727 * string which we're matching and the original SV,
2728 * then we can use the utf8 len cache associated with
2729 * the SV. In particular, it means that under //g,
2730 * sv_pos_b2u() will use the previously cached
2731 * position to speed up working out the new length of
2732 * subcoffset, rather than counting from the start of
2733 * the string each time. This stops
2734 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2735 * from going quadratic */
2736 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2737 sv_pos_b2u(sv, &(prog->subcoffset));
2738 else
2739 prog->subcoffset = utf8_length((U8*)strbeg,
2740 (U8*)(strbeg+prog->suboffset));
2741 }
d6a28714
JH
2742 }
2743 else {
6502e081 2744 RX_MATCH_COPY_FREE(rx);
d6a28714 2745 prog->subbeg = strbeg;
6502e081
DM
2746 prog->suboffset = 0;
2747 prog->subcoffset = 0;
d6a28714
JH
2748 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2749 }
2750 }
9041c2e3 2751
d6a28714
JH
2752 return 1;
2753
2754phooey:
a3621e74 2755 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2756 PL_colors[4], PL_colors[5]));
ed301438 2757 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2758 restore_pos(aTHX_ prog);
e9105d30 2759 if (swap) {
c74340f9 2760 /* we failed :-( roll it back */
495f47a5
DM
2761 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2762 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2763 PTR2UV(prog),
2764 PTR2UV(prog->offs),
2765 PTR2UV(swap)
2766 ));
e9105d30
GG
2767 Safefree(prog->offs);
2768 prog->offs = swap;
2769 }
d6a28714
JH
2770 return 0;
2771}
2772
6bda09f9 2773
ec43f78b
DM
2774/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2775 * Do inc before dec, in case old and new rex are the same */
2776#define SET_reg_curpm(Re2) \
2777 if (PL_reg_state.re_state_eval_setup_done) { \
2778 (void)ReREFCNT_inc(Re2); \
2779 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2780 PM_SETRE((PL_reg_curpm), (Re2)); \
2781 }
2782
2783
d6a28714
JH
2784/*
2785 - regtry - try match at specific point
2786 */
2787STATIC I32 /* 0 failure, 1 success */
f73aaa43 2788S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2789{
97aff369 2790 dVAR;
d6a28714 2791 CHECKPOINT lastcp;
288b8c02 2792 REGEXP *const rx = reginfo->prog;
8d919b0a 2793 regexp *const prog = ReANY(rx);
f73aaa43 2794 I32 result;
f8fc2ecf 2795 RXi_GET_DECL(prog,progi);
a3621e74 2796 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2797
2798 PERL_ARGS_ASSERT_REGTRY;
2799
24b23f37 2800 reginfo->cutpoint=NULL;
d6a28714 2801
ed301438
DM
2802 if ((prog->extflags & RXf_EVAL_SEEN)
2803 && !PL_reg_state.re_state_eval_setup_done)
2804 {
d6a28714
JH
2805 MAGIC *mg;
2806
ed301438 2807 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2808 if (reginfo->sv) {
d6a28714 2809 /* Make $_ available to executed code. */
3b0527fe 2810 if (reginfo->sv != DEFSV) {
59f00321 2811 SAVE_DEFSV;
414bf5ae 2812 DEFSV_set(reginfo->sv);
b8c5462f 2813 }
d6a28714 2814
3b0527fe
DM
2815 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2816 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2817 /* prepare for quick setting of pos */
d300d9fa 2818#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2819 if (SvIsCOW(reginfo->sv))
2820 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2821#endif
3dab1dad 2822 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2823 &PL_vtbl_mglob, NULL, 0);
d6a28714 2824 mg->mg_len = -1;
b8c5462f 2825 }
d6a28714
JH
2826 PL_reg_magic = mg;
2827 PL_reg_oldpos = mg->mg_len;
4f639d21 2828 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2829 }
09687e5a 2830 if (!PL_reg_curpm) {
a02a5408 2831 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2832#ifdef USE_ITHREADS
2833 {
14a49a24 2834 SV* const repointer = &PL_sv_undef;
92313705
NC
2835 /* this regexp is also owned by the new PL_reg_curpm, which
2836 will try to free it. */
d2ece331 2837 av_push(PL_regex_padav, repointer);
09687e5a
AB
2838 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2839 PL_regex_pad = AvARRAY(PL_regex_padav);
2840 }
2841#endif
2842 }
ec43f78b 2843 SET_reg_curpm(rx);
d6a28714
JH
2844 PL_reg_oldcurpm = PL_curpm;
2845 PL_curpm = PL_reg_curpm;
07bc277f 2846 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2847 /* Here is a serious problem: we cannot rewrite subbeg,
2848 since it may be needed if this match fails. Thus
2849 $` inside (?{}) could fail... */
2850 PL_reg_oldsaved = prog->subbeg;
2851 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2852 PL_reg_oldsavedoffset = prog->suboffset;
2853 PL_reg_oldsavedcoffset = prog->suboffset;
db2c6cb3 2854#ifdef PERL_ANY_COW
ed252734
NC
2855 PL_nrs = prog->saved_copy;
2856#endif
07bc277f 2857 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2858 }
2859 else
bd61b366 2860 PL_reg_oldsaved = NULL;
d6a28714 2861 prog->subbeg = PL_bostr;
6502e081
DM
2862 prog->suboffset = 0;
2863 prog->subcoffset = 0;
d6a28714
JH
2864 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2865 }
97ca13b7 2866#ifdef DEBUGGING
f73aaa43 2867 PL_reg_starttry = *startposp;
97ca13b7 2868#endif
f73aaa43 2869 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2870 prog->lastparen = 0;
03994de8 2871 prog->lastcloseparen = 0;
d6a28714 2872 PL_regsize = 0;
d6a28714
JH
2873
2874 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2875 to do this again and again, prog->lastparen should take care of
3dd2943c 2876 this! --ilya*/
dafc8851
JH
2877
2878 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2879 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2880 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2881 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2882 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2883 * Meanwhile, this code *is* needed for the
daf18116
JH
2884 * above-mentioned test suite tests to succeed. The common theme
2885 * on those tests seems to be returning null fields from matches.
225593e1 2886 * --jhi updated by dapm */
dafc8851 2887#if 1
d6a28714 2888 if (prog->nparens) {
b93070ed 2889 regexp_paren_pair *pp = prog->offs;
eb578fdb 2890 I32 i;
b93070ed 2891 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2892 ++pp;
2893 pp->start = -1;
2894 pp->end = -1;
d6a28714
JH
2895 }
2896 }
dafc8851 2897#endif
02db2b7b 2898 REGCP_SET(lastcp);
f73aaa43
DM
2899 result = regmatch(reginfo, *startposp, progi->program + 1);
2900 if (result != -1) {
2901 prog->offs[0].end = result;
d6a28714
JH
2902 return 1;
2903 }
24b23f37 2904 if (reginfo->cutpoint)
f73aaa43 2905 *startposp= reginfo->cutpoint;
02db2b7b 2906 REGCP_UNWIND(lastcp);
d6a28714
JH
2907 return 0;
2908}
2909
02db2b7b 2910
8ba1375e
MJD
2911#define sayYES goto yes
2912#define sayNO goto no
262b90c4 2913#define sayNO_SILENT goto no_silent
8ba1375e 2914
f9f4320a
YO
2915/* we dont use STMT_START/END here because it leads to
2916 "unreachable code" warnings, which are bogus, but distracting. */
2917#define CACHEsayNO \
c476f425
DM
2918 if (ST.cache_mask) \
2919 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2920 sayNO
3298f257 2921
a3621e74 2922/* this is used to determine how far from the left messages like
265c4333
YO
2923 'failed...' are printed. It should be set such that messages
2924 are inline with the regop output that created them.
a3621e74 2925*/
265c4333 2926#define REPORT_CODE_OFF 32
a3621e74
YO
2927
2928
40a82448
DM
2929#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2930#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2931#define CHRTEST_NOT_A_CP_1 -999
2932#define CHRTEST_NOT_A_CP_2 -998
9e137952 2933
86545054
DM
2934#define SLAB_FIRST(s) (&(s)->states[0])
2935#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2936
5d9a96ca
DM
2937/* grab a new slab and return the first slot in it */
2938
2939STATIC regmatch_state *
2940S_push_slab(pTHX)
2941{
a35a87e7 2942#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2943 dMY_CXT;
2944#endif
5d9a96ca
DM
2945 regmatch_slab *s = PL_regmatch_slab->next;
2946 if (!s) {
2947 Newx(s, 1, regmatch_slab);
2948 s->prev = PL_regmatch_slab;
2949 s->next = NULL;
2950 PL_regmatch_slab->next = s;
2951 }
2952 PL_regmatch_slab = s;
86545054 2953 return SLAB_FIRST(s);
5d9a96ca 2954}
5b47454d 2955
95b24440 2956
40a82448
DM
2957/* push a new state then goto it */
2958
4d5016e5
DM
2959#define PUSH_STATE_GOTO(state, node, input) \
2960 pushinput = input; \
40a82448
DM
2961 scan = node; \
2962 st->resume_state = state; \
2963 goto push_state;
2964
2965/* push a new state with success backtracking, then goto it */
2966
4d5016e5
DM
2967#define PUSH_YES_STATE_GOTO(state, node, input) \
2968 pushinput = input; \
40a82448
DM
2969 scan = node; \
2970 st->resume_state = state; \
2971 goto push_yes_state;
2972
aa283a38 2973
aa283a38 2974
4d5016e5 2975
d6a28714 2976/*
95b24440 2977
bf1f174e
DM
2978regmatch() - main matching routine
2979
2980This is basically one big switch statement in a loop. We execute an op,
2981set 'next' to point the next op, and continue. If we come to a point which
2982we may need to backtrack to on failure such as (A|B|C), we push a
2983backtrack state onto the backtrack stack. On failure, we pop the top
2984state, and re-enter the loop at the state indicated. If there are no more
2985states to pop, we return failure.
2986
2987Sometimes we also need to backtrack on success; for example /A+/, where
2988after successfully matching one A, we need to go back and try to
2989match another one; similarly for lookahead assertions: if the assertion
2990completes successfully, we backtrack to the state just before the assertion
2991and then carry on. In these cases, the pushed state is marked as
2992'backtrack on success too'. This marking is in fact done by a chain of
2993pointers, each pointing to the previous 'yes' state. On success, we pop to
2994the nearest yes state, discarding any intermediate failure-only states.
2995Sometimes a yes state is pushed just to force some cleanup code to be
2996called at the end of a successful match or submatch; e.g. (??{$re}) uses
2997it to free the inner regex.
2998
2999Note that failure backtracking rewinds the cursor position, while
3000success backtracking leaves it alone.
3001
3002A pattern is complete when the END op is executed, while a subpattern
3003such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3004ops trigger the "pop to last yes state if any, otherwise return true"
3005behaviour.
3006
3007A common convention in this function is to use A and B to refer to the two
3008subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3009the subpattern to be matched possibly multiple times, while B is the entire
3010rest of the pattern. Variable and state names reflect this convention.
3011
3012The states in the main switch are the union of ops and failure/success of
3013substates associated with with that op. For example, IFMATCH is the op
3014that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3015'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3016successfully matched A and IFMATCH_A_fail is a state saying that we have
3017just failed to match A. Resume states always come in pairs. The backtrack
3018state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3019at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3020on success or failure.
3021
3022The struct that holds a backtracking state is actually a big union, with
3023one variant for each major type of op. The variable st points to the
3024top-most backtrack struct. To make the code clearer, within each
3025block of code we #define ST to alias the relevant union.
3026
3027Here's a concrete example of a (vastly oversimplified) IFMATCH
3028implementation:
3029
3030 switch (state) {
3031 ....
3032
3033#define ST st->u.ifmatch
3034
3035 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3036 ST.foo = ...; // some state we wish to save
95b24440 3037 ...
bf1f174e
DM
3038 // push a yes backtrack state with a resume value of
3039 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3040 // first node of A:
4d5016e5 3041 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3042 // NOTREACHED
3043
3044 case IFMATCH_A: // we have successfully executed A; now continue with B
3045 next = B;
3046 bar = ST.foo; // do something with the preserved value
3047 break;
3048
3049 case IFMATCH_A_fail: // A failed, so the assertion failed
3050 ...; // do some housekeeping, then ...
3051 sayNO; // propagate the failure
3052
3053#undef ST
95b24440 3054
bf1f174e
DM
3055 ...
3056 }
95b24440 3057
bf1f174e
DM
3058For any old-timers reading this who are familiar with the old recursive
3059approach, the code above is equivalent to:
95b24440 3060
bf1f174e
DM
3061 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3062 {
3063 int foo = ...
95b24440 3064 ...
bf1f174e
DM
3065 if (regmatch(A)) {
3066 next = B;
3067 bar = foo;
3068 break;
95b24440 3069 }
bf1f174e
DM
3070 ...; // do some housekeeping, then ...
3071 sayNO; // propagate the failure
95b24440 3072 }
bf1f174e
DM
3073
3074The topmost backtrack state, pointed to by st, is usually free. If you
3075want to claim it, populate any ST.foo fields in it with values you wish to
3076save, then do one of
3077
4d5016e5
DM
3078 PUSH_STATE_GOTO(resume_state, node, newinput);
3079 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3080
3081which sets that backtrack state's resume value to 'resume_state', pushes a
3082new free entry to the top of the backtrack stack, then goes to 'node'.
3083On backtracking, the free slot is popped, and the saved state becomes the
3084new free state. An ST.foo field in this new top state can be temporarily
3085accessed to retrieve values, but once the main loop is re-entered, it
3086becomes available for reuse.
3087
3088Note that the depth of the backtrack stack constantly increases during the
3089left-to-right execution of the pattern, rather than going up and down with
3090the pattern nesting. For example the stack is at its maximum at Z at the
3091end of the pattern, rather than at X in the following:
3092
3093 /(((X)+)+)+....(Y)+....Z/
3094
3095The only exceptions to this are lookahead/behind assertions and the cut,
3096(?>A), which pop all the backtrack states associated with A before
3097continuing.
3098
486ec47a 3099Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3100PL_regmatch_state and st always point to the currently active state,
3101and PL_regmatch_slab points to the slab currently containing
3102PL_regmatch_state. The first time regmatch() is called, the first slab is
3103allocated, and is never freed until interpreter destruction. When the slab
3104is full, a new one is allocated and chained to the end. At exit from
3105regmatch(), slabs allocated since entry are freed.
3106
3107*/
95b24440 3108
40a82448 3109
5bc10b2c 3110#define DEBUG_STATE_pp(pp) \
265c4333 3111 DEBUG_STATE_r({ \
f2ed9b32 3112 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3113 PerlIO_printf(Perl_debug_log, \
5d458dd8 3114 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3115 depth*2, "", \
13d6edb4 3116 PL_reg_name[st->resume_state], \
5d458dd8
YO
3117 ((st==yes_state||st==mark_state) ? "[" : ""), \
3118 ((st==yes_state) ? "Y" : ""), \
3119 ((st==mark_state) ? "M" : ""), \
3120 ((st==yes_state||st==mark_state) ? "]" : "") \
3121 ); \
265c4333 3122 });
5bc10b2c 3123
40a82448 3124
3dab1dad 3125#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3126
3df15adc 3127#ifdef DEBUGGING
5bc10b2c 3128
ab3bbdeb 3129STATIC void
f2ed9b32 3130S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3131 const char *start, const char *end, const char *blurb)
3132{
efd26800 3133 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3134
3135 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3136
ab3bbdeb
YO
3137 if (!PL_colorset)
3138 reginitcolors();
3139 {
3140 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3141 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3142
f2ed9b32 3143 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3144 start, end - start, 60);
3145
3146 PerlIO_printf(Perl_debug_log,
3147 "%s%s REx%s %s against %s\n",
3148 PL_colors[4], blurb, PL_colors[5], s0, s1);
3149
f2ed9b32 3150 if (utf8_target||utf8_pat)
1de06328
YO
3151 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3152 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3153 utf8_pat && utf8_target ? " and " : "",
3154 utf8_target ? "string" : ""
ab3bbdeb
YO
3155 );
3156 }
3157}
3df15adc
YO
3158
3159STATIC void
786e8c11
YO
3160S_dump_exec_pos(pTHX_ const char *locinput,
3161 const regnode *scan,
3162 const char *loc_regeol,
3163 const char *loc_bostr,
3164 const char *loc_reg_starttry,
f2ed9b32 3165 const bool utf8_target)
07be1b83 3166{
786e8c11 3167 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3168 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3169 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3170 /* The part of the string before starttry has one color
3171 (pref0_len chars), between starttry and current
3172 position another one (pref_len - pref0_len chars),
3173 after the current position the third one.
3174 We assume that pref0_len <= pref_len, otherwise we
3175 decrease pref0_len. */
786e8c11
YO
3176 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3177 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3178 int pref0_len;
3179
7918f24d
NC
3180 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3181
f2ed9b32 3182 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3183 pref_len++;
786e8c11
YO
3184 pref0_len = pref_len - (locinput - loc_reg_starttry);
3185 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3186 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3187 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3188 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3189 l--;
3190 if (pref0_len < 0)
3191 pref0_len = 0;
3192 if (pref0_len > pref_len)
3193 pref0_len = pref_len;
3194 {
f2ed9b32 3195 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3196
ab3bbdeb 3197 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3198 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3199
ab3bbdeb 3200 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3201 (locinput - pref_len + pref0_len),
1de06328 3202 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3203
ab3bbdeb 3204 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3205 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3206
1de06328 3207 const STRLEN tlen=len0+len1+len2;
3df15adc 3208 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3209 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3210 (IV)(locinput - loc_bostr),
07be1b83 3211 len0, s0,
07be1b83 3212 len1, s1,
07be1b83 3213 (docolor ? "" : "> <"),
07be1b83 3214 len2, s2,
f9f4320a 3215 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3216 "");
3217 }
3218}
3df15adc 3219
07be1b83
YO
3220#endif
3221
0a4db386
YO
3222/* reg_check_named_buff_matched()
3223 * Checks to see if a named buffer has matched. The data array of
3224 * buffer numbers corresponding to the buffer is expected to reside
3225 * in the regexp->data->data array in the slot stored in the ARG() of
3226 * node involved. Note that this routine doesn't actually care about the
3227 * name, that information is not preserved from compilation to execution.
3228 * Returns the index of the leftmost defined buffer with the given name
3229 * or 0 if non of the buffers matched.
3230 */
3231STATIC I32
7918f24d
NC
3232S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3233{
0a4db386 3234 I32 n;
f8fc2ecf 3235 RXi_GET_DECL(rex,rexi);
ad64d0ec 3236 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3237 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3238
3239 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3240
0a4db386 3241 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3242 if ((I32)rex->lastparen >= nums[n] &&
3243 rex->offs[nums[n]].end != -1)
0a4db386
YO
3244 {
3245 return nums[n];
3246 }
3247 }
3248 return 0;
3249}
3250
2f554ef7
DM
3251
3252/* free all slabs above current one - called during LEAVE_SCOPE */
3253
3254STATIC void
3255S_clear_backtrack_stack(pTHX_ void *p)
3256{
3257 regmatch_slab *s = PL_regmatch_slab->next;
3258 PERL_UNUSED_ARG(p);
3259
3260 if (!s)
3261 return;
3262 PL_regmatch_slab->next = NULL;
3263 while (s) {
3264 regmatch_slab * const osl = s;
3265 s = s->next;
3266 Safefree(osl);
3267 }
3268}
c74f6de9 3269static bool
79a2a0e8 3270S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
c74f6de9 3271{
79a2a0e8
KW
3272 /* This function determines if there are one or two characters that match
3273 * the first character of the passed-in EXACTish node <text_node>, and if
3274 * so, returns them in the passed-in pointers.
c74f6de9 3275 *
79a2a0e8
KW
3276 * If it determines that no possible character in the target string can
3277 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3278 * the first character in <text_node> requires UTF-8 to represent, and the
3279 * target string isn't in UTF-8.)
c74f6de9 3280 *
79a2a0e8
KW
3281 * If there are more than two characters that could match the beginning of
3282 * <text_node>, or if more context is required to determine a match or not,
3283 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3284 *
3285 * The motiviation behind this function is to allow the caller to set up
3286 * tight loops for matching. If <text_node> is of type EXACT, there is
3287 * only one possible character that can match its first character, and so
3288 * the situation is quite simple. But things get much more complicated if
3289 * folding is involved. It may be that the first character of an EXACTFish
3290 * node doesn't participate in any possible fold, e.g., punctuation, so it
3291 * can be matched only by itself. The vast majority of characters that are
3292 * in folds match just two things, their lower and upper-case equivalents.
3293 * But not all are like that; some have multiple possible matches, or match
3294 * sequences of more than one character. This function sorts all that out.
3295 *
3296 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3297 * loop of trying to match A*, we know we can't exit where the thing
3298 * following it isn't a B. And something can't be a B unless it is the
3299 * beginning of B. By putting a quick test for that beginning in a tight
3300 * loop, we can rule out things that can't possibly be B without having to
3301 * break out of the loop, thus avoiding work. Similarly, if A is a single
3302 * character, we can make a tight loop matching A*, using the outputs of
3303 * this function.
3304 *
3305 * If the target string to match isn't in UTF-8, and there aren't
3306 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3307 * the one or two possible octets (which are characters in this situation)
3308 * that can match. In all cases, if there is only one character that can
3309 * match, *<c1p> and *<c2p> will be identical.
3310 *
3311 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3312 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3313 * can match the beginning of <text_node>. They should be declared with at
3314 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3315 * undefined what these contain.) If one or both of the buffers are
3316 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3317 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3318 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3319 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3320 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3321
3322 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3323
ddb0d839
KW
3324 UV c1 = CHRTEST_NOT_A_CP_1;
3325 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3326 bool use_chrtest_void = FALSE;
3327
3328 /* Used when we have both utf8 input and utf8 output, to avoid converting
3329 * to/from code points */
3330 bool utf8_has_been_setup = FALSE;
3331
c74f6de9
KW
3332 dVAR;
3333
b4291290 3334 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3335
79a2a0e8
KW
3336 if (OP(text_node) == EXACT) {
3337
3338 /* In an exact node, only one thing can be matched, that first
3339 * character. If both the pat and the target are UTF-8, we can just
3340 * copy the input to the output, avoiding finding the code point of
3341 * that character */
3342 if (! UTF_PATTERN) {
3343 c2 = c1 = *pat;
3344 }
3345 else if (utf8_target) {
3346 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3347 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3348 utf8_has_been_setup = TRUE;
3349 }
3350 else {
3351 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3352 }
79a2a0e8
KW
3353 }
3354 else /* an EXACTFish node */
3355 if ((UTF_PATTERN
3356 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3357 pat + STR_LEN(text_node)))
3358 || (! UTF_PATTERN
3359 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3360 pat + STR_LEN(text_node))))
3361 {
3362 /* Multi-character folds require more context to sort out. Also
3363 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3364 * handled outside this routine */
3365 use_chrtest_void = TRUE;
3366 }
3367 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3368 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3369 if (c1 > 256) {
3370 /* Load the folds hash, if not already done */
3371 SV** listp;
3372 if (! PL_utf8_foldclosures) {
3373 if (! PL_utf8_tofold) {
3374 U8 dummy[UTF8_MAXBYTES+1];
3375
3376 /* Force loading this by folding an above-Latin1 char */
3377 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3378 assert(PL_utf8_tofold); /* Verify that worked */
3379 }
3380 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3381 }
3382
3383 /* The fold closures data structure is a hash with the keys being
3384 * the UTF-8 of every character that is folded to, like 'k', and
3385 * the values each an array of all code points that fold to its
3386 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3387 * not included */
3388 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3389 (char *) pat,
3390 UTF8SKIP(pat),
3391 FALSE))))
3392 {
3393 /* Not found in the hash, therefore there are no folds
3394 * containing it, so there is only a single character that
3395 * could match */
3396 c2 = c1;
3397 }
3398 else { /* Does participate in folds */
3399 AV* list = (AV*) *listp;
3400 if (av_len(list) != 1) {
3401
3402 /* If there aren't exactly two folds to this, it is outside
3403 * the scope of this function */
3404 use_chrtest_void = TRUE;
3405 }
3406 else { /* There are two. Get them */
3407 SV** c_p = av_fetch(list, 0, FALSE);
3408 if (c_p == NULL) {
3409 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3410 }
3411 c1 = SvUV(*c_p);
3412
3413 c_p = av_fetch(list, 1, FALSE);
3414 if (c_p == NULL) {
3415 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3416 }
3417 c2 = SvUV(*c_p);
3418
3419 /* Folds that cross the 255/256 boundary are forbidden if
3420 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3421 * pattern character is above 256, and its only other match
3422 * is below 256, the only legal match will be to itself.
3423 * We have thrown away the original, so have to compute
3424 * which is the one above 255 */
3425 if ((c1 < 256) != (c2 < 256)) {
3426 if (OP(text_node) == EXACTFL
3427 || (OP(text_node) == EXACTFA
3428 && (isASCII(c1) || isASCII(c2))))
3429 {
3430 if (c1 < 256) {
3431 c1 = c2;
3432 }
3433 else {
3434 c2 = c1;
3435 }
3436 }
3437 }
3438 }
3439 }
3440 }
3441 else /* Here, c1 is < 255 */
3442 if (utf8_target
3443 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3444 && OP(text_node) != EXACTFL
3445 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3446 {
3447 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3448 * folds to this character in the pattern. All such cases except
3449 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3450 * involved in their folds, so are outside the scope of this
3451 * function */
3452 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3453 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3454