This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regmatch(): silence OpenWatcom compiler warnings
[perl5.git] / regexec.c
<
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
ef07e810 96#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 97#define RF_warned 2 /* warned about big count? */
faec1544 98
ab3bbdeb 99#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 100
f2ed9b32 101#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 102
c74f6de9
KW
103#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
104
a687059c
LW
105#ifndef STATIC
106#define STATIC static
107#endif
108
e0193e47 109/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
635cd5d4 112#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 113 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 114
c277df42
IZ
115/*
116 * Forwards.
117 */
118
f2ed9b32 119#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 120#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 121
3dab1dad
YO
122#define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
52657f30 124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
125 : (U8*)(pos + off))
126#define HOPBACKc(pos, off) \
07be1b83
YO
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
8e11feef 130 ? (U8*)pos - off \
3dab1dad 131 : NULL)
efb30f32 132
e7409c1b 133#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 134#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 135
7016d6eb
DM
136
137#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138#define NEXTCHR_IS_EOS (nextchr < 0)
139
140#define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
142
143#define SET_locinput(p) \
144 locinput = (p); \
145 SET_nextchr
146
147
20d0b1e9 148/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 149#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 150 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 151 bool ok; \
9c4fdda1 152 ENTER; save_re_context(); \
cf54f63a
JL
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
156/* Doesn't do an assert to verify that is correct */
157#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 158 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 159 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
160 ENTER; save_re_context(); \
161 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
c1976674 162 PERL_UNUSED_VAR(throw_away); \
9c4fdda1 163 LEAVE; } } STMT_END
37e2e78e 164
1a4fad37
AL
165#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
166#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
51371543 167
37e2e78e 168#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
169 /* No asserts are done for some of these, in case called on a */ \
170 /* Unicode version in which they map to nothing */ \
27d4fc33 171 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 172 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 173
1dcf4a1b 174#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 175
ee9a90b8
KW
176/* The actual code for CCC_TRY, which uses several variables from the routine
177 * it's callable from. It is designed to be the bulk of a case statement.
178 * FUNC is the macro or function to call on non-utf8 targets that indicate if
179 * nextchr matches the class.
180 * UTF8_TEST is the whole test string to use for utf8 targets
181 * LOAD is what to use to test, and if not present to load in the swash for the
182 * class
183 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
184 * UTF8_TEST test.
185 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
186 * utf8 and a variant, load the swash if necessary and test using the utf8
187 * test. Advance to the next character if test is ok, otherwise fail; If not
188 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
189 * fails, or advance to the next character */
190
191#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
7016d6eb 192 if (NEXTCHR_IS_EOS) { \
ee9a90b8
KW
193 sayNO; \
194 } \
195 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
196 LOAD_UTF8_CHARCLASS(CLASS, STR); \
197 if (POS_OR_NEG (UTF8_TEST)) { \
198 sayNO; \
199 } \
ee9a90b8 200 } \
28b98f76
DM
201 else if (POS_OR_NEG (FUNC(nextchr))) { \
202 sayNO; \
ee9a90b8 203 } \
28b98f76 204 goto increment_locinput;
980866de 205
ee9a90b8
KW
206/* Handle the non-locale cases for a character class and its complement. It
207 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
208 * This is because that code fails when the test succeeds, so we want to have
209 * the test fail so that the code succeeds. The swash is stored in a
210 * predictable PL_ place */
cfaf538b
KW
211#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
212 CLASS, STR) \
ee9a90b8
KW
213 case NAME: \
214 _CCC_TRY_CODE( !, FUNC, \
215 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
216 (U8*)locinput, TRUE)), \
217 CLASS, STR) \
218 case NNAME: \
1dcf4a1b 219 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
220 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
221 (U8*)locinput, TRUE)), \
222 CLASS, STR) \
223
224/* Generate the case statements for both locale and non-locale character
225 * classes in regmatch for classes that don't have special unicode semantics.
226 * Locales don't use an immediate swash, but an intermediary special locale
227 * function that is called on the pointer to the current place in the input
228 * string. That function will resolve to needing the same swash. One might
229 * think that because we don't know what the locale will match, we shouldn't
230 * check with the swash loading function that it loaded properly; ie, that we
231 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
232 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
233 * irrelevant here */
234#define CCC_TRY(NAME, NNAME, FUNC, \
235 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 236 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
237 CLASS, STR) \
238 case NAMEL: \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
241 case NNAMEL: \
242 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
243 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
244 CLASS, STR) \
cfaf538b 245 case NAMEA: \
7016d6eb 246 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
cfaf538b
KW
247 sayNO; \
248 } \
249 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
3640db6b 250 locinput++; \
cfaf538b
KW
251 break; \
252 case NNAMEA: \
7016d6eb 253 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
cfaf538b
KW
254 sayNO; \
255 } \
28b98f76 256 goto increment_locinput; \
ee9a90b8
KW
257 /* Generate the non-locale cases */ \
258 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
259
260/* This is like CCC_TRY, but has an extra set of parameters for generating case
261 * statements to handle separate Unicode semantics nodes */
262#define CCC_TRY_U(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEU, NNAMEU, FUNCU, \
cfaf538b 265 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 266 CLASS, STR) \
cfaf538b
KW
267 CCC_TRY(NAME, NNAME, FUNC, \
268 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
269 NAMEA, NNAMEA, FUNCA, \
270 CLASS, STR) \
ee9a90b8 271 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 272
3dab1dad
YO
273/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
274
5f80c4cf 275/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
276/* it would be nice to rework regcomp.sym to generate this stuff. sigh
277 *
278 * NOTE that *nothing* that affects backtracking should be in here, specifically
279 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
280 * node that is in between two EXACT like nodes when ascertaining what the required
281 * "follow" character is. This should probably be moved to regex compile time
282 * although it may be done at run time beause of the REF possibility - more
283 * investigation required. -- demerphq
284*/
3e901dc0
YO
285#define JUMPABLE(rn) ( \
286 OP(rn) == OPEN || \
287 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
288 OP(rn) == EVAL || \
cca55fe3
JP
289 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
290 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 291 OP(rn) == KEEPS || \
3dab1dad 292 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 293)
ee9b8eae 294#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 295
ee9b8eae
YO
296#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
297
298#if 0
299/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
300 we don't need this definition. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 302#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
303#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
304
305#else
306/* ... so we use this as its faster. */
307#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 308#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
309#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
310#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
311
312#endif
e2d8ce26 313
a84d97b6
HS
314/*
315 Search for mandatory following text node; for lookahead, the text must
316 follow but for lookbehind (rn->flags != 0) we skip to the next step.
317*/
cca55fe3 318#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
319 while (JUMPABLE(rn)) { \
320 const OPCODE type = OP(rn); \
321 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 322 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 323 else if (type == PLUS) \
cca55fe3 324 rn = NEXTOPER(rn); \
3dab1dad 325 else if (type == IFMATCH) \
a84d97b6 326 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 327 else rn += NEXT_OFF(rn); \
3dab1dad 328 } \
5f80c4cf 329} STMT_END
74750237 330
c476f425 331
acfe0abc 332static void restore_pos(pTHX_ void *arg);
51371543 333
87c0511b 334#define REGCP_PAREN_ELEMS 3
f067efbf 335#define REGCP_OTHER_ELEMS 3
e0fa7e2b 336#define REGCP_FRAME_ELEMS 1
620d5b66
NC
337/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
338 * are needed for the regexp context stack bookkeeping. */
339
76e3520e 340STATIC CHECKPOINT
b93070ed 341S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 342{
97aff369 343 dVAR;
a3b680e6 344 const int retval = PL_savestack_ix;
a3b680e6 345 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
346 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
347 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 348 I32 p;
40a82448 349 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 350
b93070ed
DM
351 PERL_ARGS_ASSERT_REGCPPUSH;
352
e49a9654 353 if (paren_elems_to_push < 0)
5637ef5b
NC
354 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
355 paren_elems_to_push);
e49a9654 356
e0fa7e2b
NC
357 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
358 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
359 " out of range (%lu-%ld)",
360 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 361
620d5b66 362 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 363
495f47a5
DM
364 DEBUG_BUFFERS_r(
365 if ((int)PL_regsize > (int)parenfloor)
366 PerlIO_printf(Perl_debug_log,
367 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
368 PTR2UV(rex),
369 PTR2UV(rex->offs)
370 );
371 );
87c0511b 372 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 373/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
374 SSPUSHINT(rex->offs[p].end);
375 SSPUSHINT(rex->offs[p].start);
1ca2007e 376 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 377 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
378 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
379 (UV)p,
380 (IV)rex->offs[p].start,
381 (IV)rex->offs[p].start_tmp,
382 (IV)rex->offs[p].end
40a82448 383 ));
a0d0e21e 384 }
b1ce53c5 385/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 386 SSPUSHINT(PL_regsize);
b93070ed
DM
387 SSPUSHINT(rex->lastparen);
388 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 389 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 390
a0d0e21e
LW
391 return retval;
392}
393
c277df42 394/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
395#define REGCP_SET(cp) \
396 DEBUG_STATE_r( \
ab3bbdeb 397 PerlIO_printf(Perl_debug_log, \
e4f74956 398 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
399 (IV)PL_savestack_ix)); \
400 cp = PL_savestack_ix
c3464db5 401
ab3bbdeb 402#define REGCP_UNWIND(cp) \
e4f74956 403 DEBUG_STATE_r( \
ab3bbdeb 404 if (cp != PL_savestack_ix) \
e4f74956
YO
405 PerlIO_printf(Perl_debug_log, \
406 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
407 (IV)(cp), (IV)PL_savestack_ix)); \
408 regcpblow(cp)
c277df42 409
a8d1f4b4
DM
410#define UNWIND_PAREN(lp, lcp) \
411 for (n = rex->lastparen; n > lp; n--) \
412 rex->offs[n].end = -1; \
413 rex->lastparen = n; \
414 rex->lastcloseparen = lcp;
415
416
f067efbf 417STATIC void
b93070ed 418S_regcppop(pTHX_ regexp *rex)
a0d0e21e 419{
97aff369 420 dVAR;
e0fa7e2b 421 UV i;
87c0511b 422 U32 paren;
a3621e74
YO
423 GET_RE_DEBUG_FLAGS_DECL;
424
7918f24d
NC
425 PERL_ARGS_ASSERT_REGCPPOP;
426
b1ce53c5 427 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 428 i = SSPOPUV;
e0fa7e2b
NC
429 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
430 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
431 rex->lastcloseparen = SSPOPINT;
432 rex->lastparen = SSPOPINT;
3280af22 433 PL_regsize = SSPOPINT;
b1ce53c5 434
620d5b66 435 i -= REGCP_OTHER_ELEMS;
b1ce53c5 436 /* Now restore the parentheses context. */
495f47a5
DM
437 DEBUG_BUFFERS_r(
438 if (i || rex->lastparen + 1 <= rex->nparens)
439 PerlIO_printf(Perl_debug_log,
440 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
441 PTR2UV(rex),
442 PTR2UV(rex->offs)
443 );
444 );
87c0511b 445 paren = PL_regsize;
620d5b66 446 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 447 I32 tmps;
1ca2007e 448 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 449 rex->offs[paren].start = SSPOPINT;
cf93c79d 450 tmps = SSPOPINT;
b93070ed
DM
451 if (paren <= rex->lastparen)
452 rex->offs[paren].end = tmps;
495f47a5
DM
453 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
454 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
455 (UV)paren,
456 (IV)rex->offs[paren].start,
457 (IV)rex->offs[paren].start_tmp,
458 (IV)rex->offs[paren].end,
459 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 460 );
87c0511b 461 paren--;
a0d0e21e 462 }
daf18116 463#if 1
dafc8851
JH
464 /* It would seem that the similar code in regtry()
465 * already takes care of this, and in fact it is in
466 * a better location to since this code can #if 0-ed out
467 * but the code in regtry() is needed or otherwise tests
468 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
469 * (as of patchlevel 7877) will fail. Then again,
470 * this code seems to be necessary or otherwise
225593e1
DM
471 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
472 * --jhi updated by dapm */
b93070ed 473 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 474 if (i > PL_regsize)
b93070ed
DM
475 rex->offs[i].start = -1;
476 rex->offs[i].end = -1;
495f47a5
DM
477 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
478 " \\%"UVuf": %s ..-1 undeffing\n",
479 (UV)i,
480 (i > PL_regsize) ? "-1" : " "
481 ));
a0d0e21e 482 }
dafc8851 483#endif
a0d0e21e
LW
484}
485
74088413
DM
486/* restore the parens and associated vars at savestack position ix,
487 * but without popping the stack */
488
489STATIC void
490S_regcp_restore(pTHX_ regexp *rex, I32 ix)
491{
492 I32 tmpix = PL_savestack_ix;
493 PL_savestack_ix = ix;
494 regcppop(rex);
495 PL_savestack_ix = tmpix;
496}
497
02db2b7b 498#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 499
a687059c 500/*
e50aee73 501 * pregexec and friends
a687059c
LW
502 */
503
76234dfb 504#ifndef PERL_IN_XSUB_RE
a687059c 505/*
c277df42 506 - pregexec - match a regexp against a string
a687059c 507 */
c277df42 508I32
49d7dfbc 509Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 510 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
511/* stringarg: the point in the string at which to begin matching */
512/* strend: pointer to null at end of string */
513/* strbeg: real beginning of string */
514/* minend: end of match must be >= minend bytes after stringarg. */
515/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
516 * itself is accessed via the pointers above */
517/* nosave: For optimizations. */
c277df42 518{
7918f24d
NC
519 PERL_ARGS_ASSERT_PREGEXEC;
520
c277df42 521 return
9041c2e3 522 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
523 nosave ? 0 : REXEC_COPY_STR);
524}
76234dfb 525#endif
22e551b9 526
9041c2e3 527/*
cad2e5aa
JH
528 * Need to implement the following flags for reg_anch:
529 *
530 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
531 * USE_INTUIT_ML
532 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
533 * INTUIT_AUTORITATIVE_ML
534 * INTUIT_ONCE_NOML - Intuit can match in one location only.
535 * INTUIT_ONCE_ML
536 *
537 * Another flag for this function: SECOND_TIME (so that float substrs
538 * with giant delta may be not rechecked).
539 */
540
541/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
542
3f7c398e 543/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
544 Otherwise, only SvCUR(sv) is used to get strbeg. */
545
546/* XXXX We assume that strpos is strbeg unless sv. */
547
6eb5f6b9
JH
548/* XXXX Some places assume that there is a fixed substring.
549 An update may be needed if optimizer marks as "INTUITable"
550 RExen without fixed substrings. Similarly, it is assumed that
551 lengths of all the strings are no more than minlen, thus they
552 cannot come from lookahead.
40d049e4
YO
553 (Or minlen should take into account lookahead.)
554 NOTE: Some of this comment is not correct. minlen does now take account
555 of lookahead/behind. Further research is required. -- demerphq
556
557*/
6eb5f6b9 558
2c2d71f5
JH
559/* A failure to find a constant substring means that there is no need to make
560 an expensive call to REx engine, thus we celebrate a failure. Similarly,
561 finding a substring too deep into the string means that less calls to
30944b6d
IZ
562 regtry() should be needed.
563
564 REx compiler's optimizer found 4 possible hints:
565 a) Anchored substring;
566 b) Fixed substring;
567 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 568 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 569 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
570 string which does not contradict any of them.
571 */
2c2d71f5 572
6eb5f6b9
JH
573/* Most of decisions we do here should have been done at compile time.
574 The nodes of the REx which we used for the search should have been
575 deleted from the finite automaton. */
576
cad2e5aa 577char *
288b8c02 578Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 579 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 580{
97aff369 581 dVAR;
8d919b0a 582 struct regexp *const prog = ReANY(rx);
eb578fdb 583 I32 start_shift = 0;
cad2e5aa 584 /* Should be nonnegative! */
eb578fdb
KW
585 I32 end_shift = 0;
586 char *s;
587 SV *check;
a1933d95 588 char *strbeg;
cad2e5aa 589 char *t;
f2ed9b32 590 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 591 I32 ml_anch;
eb578fdb 592 char *other_last = NULL; /* other substr checked before this */
bd61b366 593 char *check_at = NULL; /* check substr found at this pos */
d8080198 594 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 595 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 596 RXi_GET_DECL(prog,progi);
30944b6d 597#ifdef DEBUGGING
890ce7af 598 const char * const i_strpos = strpos;
30944b6d 599#endif
a3621e74
YO
600 GET_RE_DEBUG_FLAGS_DECL;
601
7918f24d 602 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
603 PERL_UNUSED_ARG(flags);
604 PERL_UNUSED_ARG(data);
7918f24d 605
f2ed9b32 606 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 607
3c8556c3 608 if (RX_UTF8(rx)) {
b8d68ded
JH
609 PL_reg_flags |= RF_utf8;
610 }
ab3bbdeb 611 DEBUG_EXECUTE_r(
f2ed9b32 612 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
613 sv ? "Guessing start of match in sv for"
614 : "Guessing start of match in string for");
2a782b5b 615 );
cad2e5aa 616
c344f387
JH
617 /* CHR_DIST() would be more correct here but it makes things slow. */
618 if (prog->minlen > strend - strpos) {
a3621e74 619 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 620 "String too short... [re_intuit_start]\n"));
cad2e5aa 621 goto fail;
2c2d71f5 622 }
1de06328 623
7016d6eb
DM
624 /* XXX we need to pass strbeg as a separate arg: the following is
625 * guesswork and can be wrong... */
626 if (sv && SvPOK(sv)) {
627 char * p = SvPVX(sv);
628 STRLEN cur = SvCUR(sv);
629 if (p <= strpos && strpos < p + cur) {
630 strbeg = p;
631 assert(p <= strend && strend <= p + cur);
632 }
633 else
634 strbeg = strend - cur;
635 }
636 else
637 strbeg = strpos;
638
1aa99e6b 639 PL_regeol = strend;
f2ed9b32 640 if (utf8_target) {
33b8afdf
JH
641 if (!prog->check_utf8 && prog->check_substr)
642 to_utf8_substr(prog);
643 check = prog->check_utf8;
644 } else {
7e0d5ad7
KW
645 if (!prog->check_substr && prog->check_utf8) {
646 if (! to_byte_substr(prog)) {
6b54ddc5 647 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
648 }
649 }
33b8afdf
JH
650 check = prog->check_substr;
651 }
bbe252da
YO
652 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
653 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
654 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 655 && !multiline ) ); /* Check after \n? */
cad2e5aa 656
7e25d62c 657 if (!ml_anch) {
bbe252da
YO
658 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
659 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 660 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
661 && sv && !SvROK(sv)
662 && (strpos != strbeg)) {
a3621e74 663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
664 goto fail;
665 }
d46b78c6
KW
666 if (prog->check_offset_min == prog->check_offset_max
667 && !(prog->extflags & RXf_CANY_SEEN)
668 && ! multiline) /* /m can cause \n's to match that aren't
669 accounted for in the string max length.
670 See [perl #115242] */
671 {
2c2d71f5 672 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
673 I32 slen;
674
1aa99e6b 675 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 676
653099ff
GS
677 if (SvTAIL(check)) {
678 slen = SvCUR(check); /* >= 1 */
cad2e5aa 679
9041c2e3 680 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 681 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 683 goto fail_finish;
cad2e5aa
JH
684 }
685 /* Now should match s[0..slen-2] */
686 slen--;
3f7c398e 687 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 688 || (slen > 1
3f7c398e 689 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 690 report_neq:
a3621e74 691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
692 goto fail_finish;
693 }
cad2e5aa 694 }
3f7c398e 695 else if (*SvPVX_const(check) != *s
653099ff 696 || ((slen = SvCUR(check)) > 1
3f7c398e 697 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 698 goto report_neq;
c315bfe8 699 check_at = s;
2c2d71f5 700 goto success_at_start;
7e25d62c 701 }
cad2e5aa 702 }
2c2d71f5 703 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 704 s = strpos;
2c2d71f5 705 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
706 end_shift = prog->check_end_shift;
707
2c2d71f5 708 if (!ml_anch) {
a3b680e6 709 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 710 - (SvTAIL(check) != 0);
a3b680e6 711 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
712
713 if (end_shift < eshift)
714 end_shift = eshift;
715 }
cad2e5aa 716 }
2c2d71f5 717 else { /* Can match at random position */
cad2e5aa
JH
718 ml_anch = 0;
719 s = strpos;
1de06328
YO
720 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
721 end_shift = prog->check_end_shift;
722
723 /* end shift should be non negative here */
cad2e5aa
JH
724 }
725
bcdf7404 726#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 727 if (end_shift < 0)
1de06328 728 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 729 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
730#endif
731
2c2d71f5
JH
732 restart:
733 /* Find a possible match in the region s..strend by looking for
734 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
735
736 {
737 I32 srch_start_shift = start_shift;
738 I32 srch_end_shift = end_shift;
c33e64f0
FC
739 U8* start_point;
740 U8* end_point;
1de06328
YO
741 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
742 srch_end_shift -= ((strbeg - s) - srch_start_shift);
743 srch_start_shift = strbeg - s;
744 }
6bda09f9 745 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
746 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
747 (IV)prog->check_offset_min,
748 (IV)srch_start_shift,
749 (IV)srch_end_shift,
750 (IV)prog->check_end_shift);
751 });
752
bbe252da 753 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
754 start_point= (U8*)(s + srch_start_shift);
755 end_point= (U8*)(strend - srch_end_shift);
756 } else {
757 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
758 end_point= HOP3(strend, -srch_end_shift, strbeg);
759 }
6bda09f9 760 DEBUG_OPTIMISE_MORE_r({
56570a2c 761 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 762 (int)(end_point - start_point),
fc8cd66c 763 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
764 start_point);
765 });
766
767 s = fbm_instr( start_point, end_point,
7fba1cd6 768 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 769 }
cad2e5aa
JH
770 /* Update the count-of-usability, remove useless subpatterns,
771 unshift s. */
2c2d71f5 772
ab3bbdeb 773 DEBUG_EXECUTE_r({
f2ed9b32 774 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
775 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
776 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 777 (s ? "Found" : "Did not find"),
f2ed9b32 778 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
779 ? "anchored" : "floating"),
780 quoted,
781 RE_SV_TAIL(check),
782 (s ? " at offset " : "...\n") );
783 });
2c2d71f5
JH
784
785 if (!s)
786 goto fail_finish;
2c2d71f5 787 /* Finish the diagnostic message */
a3621e74 788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 789
1de06328
YO
790 /* XXX dmq: first branch is for positive lookbehind...
791 Our check string is offset from the beginning of the pattern.
792 So we need to do any stclass tests offset forward from that
793 point. I think. :-(
794 */
795
796
797
798 check_at=s;
799
800
2c2d71f5
JH
801 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
802 Start with the other substr.
803 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 804 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
805 *always* match. Probably should be marked during compile...
806 Probably it is right to do no SCREAM here...
807 */
808
f2ed9b32 809 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
810 : (prog->float_substr && prog->anchored_substr))
811 {
30944b6d 812 /* Take into account the "other" substring. */
2c2d71f5
JH
813 /* XXXX May be hopelessly wrong for UTF... */
814 if (!other_last)
6eb5f6b9 815 other_last = strpos;
f2ed9b32 816 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
817 do_other_anchored:
818 {
890ce7af
AL
819 char * const last = HOP3c(s, -start_shift, strbeg);
820 char *last1, *last2;
be8e71aa 821 char * const saved_s = s;
33b8afdf 822 SV* must;
2c2d71f5 823
2c2d71f5
JH
824 t = s - prog->check_offset_max;
825 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 826 && (!utf8_target
0ce71af7 827 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 828 && t > strpos)))
6f207bd3 829 NOOP;
2c2d71f5
JH
830 else
831 t = strpos;
1aa99e6b 832 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
833 if (t < other_last) /* These positions already checked */
834 t = other_last;
1aa99e6b 835 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
836 if (last < last1)
837 last1 = last;
1de06328
YO
838 /* XXXX It is not documented what units *_offsets are in.
839 We assume bytes, but this is clearly wrong.
840 Meaning this code needs to be carefully reviewed for errors.
841 dmq.
842 */
843
2c2d71f5 844 /* On end-of-str: see comment below. */
f2ed9b32 845 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
846 if (must == &PL_sv_undef) {
847 s = (char*)NULL;
1de06328 848 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
849 }
850 else
851 s = fbm_instr(
852 (unsigned char*)t,
853 HOP3(HOP3(last1, prog->anchored_offset, strend)
854 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
855 must,
7fba1cd6 856 multiline ? FBMrf_MULTILINE : 0
33b8afdf 857 );
ab3bbdeb 858 DEBUG_EXECUTE_r({
f2ed9b32 859 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
860 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
861 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 862 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
863 quoted, RE_SV_TAIL(must));
864 });
865
866
2c2d71f5
JH
867 if (!s) {
868 if (last1 >= last2) {
a3621e74 869 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
870 ", giving up...\n"));
871 goto fail_finish;
872 }
a3621e74 873 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 874 ", trying floating at offset %ld...\n",
be8e71aa 875 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
876 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
877 s = HOP3c(last, 1, strend);
2c2d71f5
JH
878 goto restart;
879 }
880 else {
a3621e74 881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 882 (long)(s - i_strpos)));
1aa99e6b
IH
883 t = HOP3c(s, -prog->anchored_offset, strbeg);
884 other_last = HOP3c(s, 1, strend);
be8e71aa 885 s = saved_s;
2c2d71f5
JH
886 if (t == strpos)
887 goto try_at_start;
2c2d71f5
JH
888 goto try_at_offset;
889 }
30944b6d 890 }
2c2d71f5
JH
891 }
892 else { /* Take into account the floating substring. */
33b8afdf 893 char *last, *last1;
be8e71aa 894 char * const saved_s = s;
33b8afdf
JH
895 SV* must;
896
897 t = HOP3c(s, -start_shift, strbeg);
898 last1 = last =
899 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
900 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
901 last = HOP3c(t, prog->float_max_offset, strend);
902 s = HOP3c(t, prog->float_min_offset, strend);
903 if (s < other_last)
904 s = other_last;
2c2d71f5 905 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 906 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
907 /* fbm_instr() takes into account exact value of end-of-str
908 if the check is SvTAIL(ed). Since false positives are OK,
909 and end-of-str is not later than strend we are OK. */
910 if (must == &PL_sv_undef) {
911 s = (char*)NULL;
1de06328 912 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
913 }
914 else
2c2d71f5 915 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
916 (unsigned char*)last + SvCUR(must)
917 - (SvTAIL(must)!=0),
7fba1cd6 918 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 919 DEBUG_EXECUTE_r({
f2ed9b32 920 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
921 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
922 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 923 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
924 quoted, RE_SV_TAIL(must));
925 });
33b8afdf
JH
926 if (!s) {
927 if (last1 == last) {
a3621e74 928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
929 ", giving up...\n"));
930 goto fail_finish;
2c2d71f5 931 }
a3621e74 932 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 933 ", trying anchored starting at offset %ld...\n",
be8e71aa 934 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
935 other_last = last;
936 s = HOP3c(t, 1, strend);
937 goto restart;
938 }
939 else {
a3621e74 940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
941 (long)(s - i_strpos)));
942 other_last = s; /* Fix this later. --Hugo */
be8e71aa 943 s = saved_s;
33b8afdf
JH
944 if (t == strpos)
945 goto try_at_start;
946 goto try_at_offset;
947 }
2c2d71f5 948 }
cad2e5aa 949 }
2c2d71f5 950
1de06328 951
9ef43ace 952 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 953
6bda09f9 954 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
955 PerlIO_printf(Perl_debug_log,
956 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
957 (IV)prog->check_offset_min,
958 (IV)prog->check_offset_max,
959 (IV)(s-strpos),
960 (IV)(t-strpos),
961 (IV)(t-s),
962 (IV)(strend-strpos)
963 )
964 );
965
2c2d71f5 966 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 967 && (!utf8_target
9ef43ace 968 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
969 && t > strpos)))
970 {
2c2d71f5
JH
971 /* Fixed substring is found far enough so that the match
972 cannot start at strpos. */
973 try_at_offset:
cad2e5aa 974 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
975 /* Eventually fbm_*() should handle this, but often
976 anchored_offset is not 0, so this check will not be wasted. */
977 /* XXXX In the code below we prefer to look for "^" even in
978 presence of anchored substrings. And we search even
979 beyond the found float position. These pessimizations
980 are historical artefacts only. */
981 find_anchor:
2c2d71f5 982 while (t < strend - prog->minlen) {
cad2e5aa 983 if (*t == '\n') {
4ee3650e 984 if (t < check_at - prog->check_offset_min) {
f2ed9b32 985 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
986 /* Since we moved from the found position,
987 we definitely contradict the found anchored
30944b6d
IZ
988 substr. Due to the above check we do not
989 contradict "check" substr.
990 Thus we can arrive here only if check substr
991 is float. Redo checking for "other"=="fixed".
992 */
9041c2e3 993 strpos = t + 1;
a3621e74 994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 995 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
996 goto do_other_anchored;
997 }
4ee3650e
GS
998 /* We don't contradict the found floating substring. */
999 /* XXXX Why not check for STCLASS? */
cad2e5aa 1000 s = t + 1;
a3621e74 1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1002 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1003 goto set_useful;
1004 }
4ee3650e
GS
1005 /* Position contradicts check-string */
1006 /* XXXX probably better to look for check-string
1007 than for "\n", so one should lower the limit for t? */
a3621e74 1008 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1009 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1010 other_last = strpos = s = t + 1;
cad2e5aa
JH
1011 goto restart;
1012 }
1013 t++;
1014 }
a3621e74 1015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1016 PL_colors[0], PL_colors[1]));
2c2d71f5 1017 goto fail_finish;
cad2e5aa 1018 }
f5952150 1019 else {
a3621e74 1020 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1021 PL_colors[0], PL_colors[1]));
f5952150 1022 }
cad2e5aa
JH
1023 s = t;
1024 set_useful:
f2ed9b32 1025 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1026 }
1027 else {
f5952150 1028 /* The found string does not prohibit matching at strpos,
2c2d71f5 1029 - no optimization of calling REx engine can be performed,
f5952150
GS
1030 unless it was an MBOL and we are not after MBOL,
1031 or a future STCLASS check will fail this. */
2c2d71f5
JH
1032 try_at_start:
1033 /* Even in this situation we may use MBOL flag if strpos is offset
1034 wrt the start of the string. */
05b4157f 1035 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1036 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1037 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1038 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1039 {
cad2e5aa
JH
1040 t = strpos;
1041 goto find_anchor;
1042 }
a3621e74 1043 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1044 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1045 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1046 );
2c2d71f5 1047 success_at_start:
bbe252da 1048 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1049 && (utf8_target ? (
33b8afdf
JH
1050 prog->check_utf8 /* Could be deleted already */
1051 && --BmUSEFUL(prog->check_utf8) < 0
1052 && (prog->check_utf8 == prog->float_utf8)
1053 ) : (
1054 prog->check_substr /* Could be deleted already */
1055 && --BmUSEFUL(prog->check_substr) < 0
1056 && (prog->check_substr == prog->float_substr)
1057 )))
66e933ab 1058 {
cad2e5aa 1059 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1061 /* XXX Does the destruction order has to change with utf8_target? */
1062 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1063 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1064 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1065 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1066 check = NULL; /* abort */
cad2e5aa 1067 s = strpos;
486ec47a 1068 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1069 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1070 if (prog->intflags & PREGf_IMPLICIT)
1071 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1072 /* XXXX This is a remnant of the old implementation. It
1073 looks wasteful, since now INTUIT can use many
6eb5f6b9 1074 other heuristics. */
bbe252da 1075 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1076 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1077 }
1078 else
1079 s = strpos;
1080 }
1081
6eb5f6b9
JH
1082 /* Last resort... */
1083 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1084 /* trie stclasses are too expensive to use here, we are better off to
1085 leave it to regmatch itself */
f8fc2ecf 1086 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1087 /* minlen == 0 is possible if regstclass is \b or \B,
1088 and the fixed substr is ''$.
1089 Since minlen is already taken into account, s+1 is before strend;
1090 accidentally, minlen >= 1 guaranties no false positives at s + 1
1091 even for \b or \B. But (minlen? 1 : 0) below assumes that
1092 regstclass does not come from lookahead... */
1093 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1094 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1095 const U8* const str = (U8*)STRING(progi->regstclass);
1096 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1097 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1098 : 1);
1de06328
YO
1099 char * endpos;
1100 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1101 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1102 else if (prog->float_substr || prog->float_utf8)
1103 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1104 else
1105 endpos= strend;
1106
d8080198
YO
1107 if (checked_upto < s)
1108 checked_upto = s;
1109 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1110 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1111
6eb5f6b9 1112 t = s;
d8080198
YO
1113 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1114 if (s) {
1115 checked_upto = s;
1116 } else {
6eb5f6b9 1117#ifdef DEBUGGING
cbbf8932 1118 const char *what = NULL;
6eb5f6b9
JH
1119#endif
1120 if (endpos == strend) {
a3621e74 1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1122 "Could not match STCLASS...\n") );
1123 goto fail;
1124 }
a3621e74 1125 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1126 "This position contradicts STCLASS...\n") );
bbe252da 1127 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1128 goto fail;
d8080198
YO
1129 checked_upto = HOPBACKc(endpos, start_shift);
1130 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1132 /* Contradict one of substrings */
33b8afdf 1133 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1134 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1135 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1136 hop_and_restart:
1aa99e6b 1137 s = HOP3c(t, 1, strend);
66e933ab
GS
1138 if (s + start_shift + end_shift > strend) {
1139 /* XXXX Should be taken into account earlier? */
a3621e74 1140 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1141 "Could not match STCLASS...\n") );
1142 goto fail;
1143 }
5e39e1e5
HS
1144 if (!check)
1145 goto giveup;
a3621e74 1146 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1147 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1148 what, (long)(s + start_shift - i_strpos)) );
1149 goto restart;
1150 }
66e933ab 1151 /* Have both, check_string is floating */
6eb5f6b9
JH
1152 if (t + start_shift >= check_at) /* Contradicts floating=check */
1153 goto retry_floating_check;
1154 /* Recheck anchored substring, but not floating... */
9041c2e3 1155 s = check_at;
5e39e1e5
HS
1156 if (!check)
1157 goto giveup;
a3621e74 1158 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1159 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1160 (long)(other_last - i_strpos)) );
1161 goto do_other_anchored;
1162 }
60e71179
GS
1163 /* Another way we could have checked stclass at the
1164 current position only: */
1165 if (ml_anch) {
1166 s = t = t + 1;
5e39e1e5
HS
1167 if (!check)
1168 goto giveup;
a3621e74 1169 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1170 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1171 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1172 goto try_at_offset;
66e933ab 1173 }
f2ed9b32 1174 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1175 goto fail;
486ec47a 1176 /* Check is floating substring. */
6eb5f6b9
JH
1177 retry_floating_check:
1178 t = check_at - start_shift;
a3621e74 1179 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1180 goto hop_and_restart;
1181 }
b7953727 1182 if (t != s) {
a3621e74 1183 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1184 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1185 (long)(t - i_strpos), (long)(s - i_strpos))
1186 );
1187 }
1188 else {
a3621e74 1189 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1190 "Does not contradict STCLASS...\n");
1191 );
1192 }
6eb5f6b9 1193 }
5e39e1e5 1194 giveup:
a3621e74 1195 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1196 PL_colors[4], (check ? "Guessed" : "Giving up"),
1197 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1198 return s;
2c2d71f5
JH
1199
1200 fail_finish: /* Substring not found */
33b8afdf 1201 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1202 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1203 fail:
a3621e74 1204 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1205 PL_colors[4], PL_colors[5]));
bd61b366 1206 return NULL;
cad2e5aa 1207}
9661b544 1208
a0a388a1
YO
1209#define DECL_TRIE_TYPE(scan) \
1210 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1211 trie_type = ((scan->flags == EXACT) \
1212 ? (utf8_target ? trie_utf8 : trie_plain) \
1213 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1214
1215#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1216uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1217 STRLEN skiplen; \
1218 switch (trie_type) { \
1219 case trie_utf8_fold: \
1220 if ( foldlen>0 ) { \
1221 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1222 foldlen -= len; \
1223 uscan += len; \
1224 len=0; \
1225 } else { \
1226 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1227 len = UTF8SKIP(uc); \
1228 skiplen = UNISKIP( uvc ); \
1229 foldlen -= skiplen; \
1230 uscan = foldbuf + skiplen; \
1231 } \
1232 break; \
1233 case trie_latin_utf8_fold: \
1234 if ( foldlen>0 ) { \
1235 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1236 foldlen -= len; \
1237 uscan += len; \
1238 len=0; \
1239 } else { \
1240 len = 1; \
1241 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1242 skiplen = UNISKIP( uvc ); \
1243 foldlen -= skiplen; \
1244 uscan = foldbuf + skiplen; \
1245 } \
1246 break; \
1247 case trie_utf8: \
1248 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1249 break; \
1250 case trie_plain: \
1251 uvc = (UV)*uc; \
1252 len = 1; \
1253 } \
1254 if (uvc < 256) { \
1255 charid = trie->charmap[ uvc ]; \
1256 } \
1257 else { \
1258 charid = 0; \
1259 if (widecharmap) { \
1260 SV** const svpp = hv_fetch(widecharmap, \
1261 (char*)&uvc, sizeof(UV), 0); \
1262 if (svpp) \
1263 charid = (U16)SvIV(*svpp); \
1264 } \
1265 } \
4cadc6a9
YO
1266} STMT_END
1267
4cadc6a9
YO
1268#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1269STMT_START { \
1270 while (s <= e) { \
1271 if ( (CoNd) \
fac1af77 1272 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1273 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1274 goto got_it; \
1275 s++; \
1276 } \
1277} STMT_END
1278
1279#define REXEC_FBC_UTF8_SCAN(CoDe) \
1280STMT_START { \
7016d6eb 1281 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
4cadc6a9
YO
1282 CoDe \
1283 s += uskip; \
1284 } \
1285} STMT_END
1286
1287#define REXEC_FBC_SCAN(CoDe) \
1288STMT_START { \
1289 while (s < strend) { \
1290 CoDe \
1291 s++; \
1292 } \
1293} STMT_END
1294
1295#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1296REXEC_FBC_UTF8_SCAN( \
1297 if (CoNd) { \
24b23f37 1298 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1299 goto got_it; \
1300 else \
1301 tmp = doevery; \
1302 } \
1303 else \
1304 tmp = 1; \
1305)
1306
1307#define REXEC_FBC_CLASS_SCAN(CoNd) \
1308REXEC_FBC_SCAN( \
1309 if (CoNd) { \
24b23f37 1310 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1311 goto got_it; \
1312 else \
1313 tmp = doevery; \
1314 } \
1315 else \
1316 tmp = 1; \
1317)
1318
1319#define REXEC_FBC_TRYIT \
24b23f37 1320if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1321 goto got_it
1322
e1d1eefb 1323#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1324 if (utf8_target) { \
e1d1eefb
YO
1325 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1326 } \
1327 else { \
1328 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1329 }
e1d1eefb 1330
4cadc6a9 1331#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1332 if (utf8_target) { \
4cadc6a9
YO
1333 UtFpReLoAd; \
1334 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1335 } \
1336 else { \
1337 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1338 }
4cadc6a9
YO
1339
1340#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1341 PL_reg_flags |= RF_tainted; \
f2ed9b32 1342 if (utf8_target) { \
4cadc6a9
YO
1343 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1344 } \
1345 else { \
1346 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1347 }
4cadc6a9 1348
786e8c11
YO
1349#define DUMP_EXEC_POS(li,s,doutf8) \
1350 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1351
cfaf538b
KW
1352
1353#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1354 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1355 tmp = TEST_NON_UTF8(tmp); \
1356 REXEC_FBC_UTF8_SCAN( \
1357 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1358 tmp = !tmp; \
1359 IF_SUCCESS; \
1360 } \
1361 else { \
1362 IF_FAIL; \
1363 } \
1364 ); \
1365
1366#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1367 if (s == PL_bostr) { \
1368 tmp = '\n'; \
1369 } \
1370 else { \
1371 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1372 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1373 } \
1374 tmp = TeSt1_UtF8; \
1375 LOAD_UTF8_CHARCLASS_ALNUM(); \
1376 REXEC_FBC_UTF8_SCAN( \
1377 if (tmp == ! (TeSt2_UtF8)) { \
1378 tmp = !tmp; \
1379 IF_SUCCESS; \
1380 } \
1381 else { \
1382 IF_FAIL; \
1383 } \
1384 ); \
1385
63ac0dad
KW
1386/* The only difference between the BOUND and NBOUND cases is that
1387 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1388 * NBOUND. This is accomplished by passing it in either the if or else clause,
1389 * with the other one being empty */
1390#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1391 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1392
1393#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1394 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1395
1396#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1397 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1398
1399#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1400 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1401
63ac0dad
KW
1402
1403/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1404 * be passed in completely with the variable name being tested, which isn't
1405 * such a clean interface, but this is easier to read than it was before. We
1406 * are looking for the boundary (or non-boundary between a word and non-word
1407 * character. The utf8 and non-utf8 cases have the same logic, but the details
1408 * must be different. Find the "wordness" of the character just prior to this
1409 * one, and compare it with the wordness of this one. If they differ, we have
1410 * a boundary. At the beginning of the string, pretend that the previous
1411 * character was a new-line */
cfaf538b 1412#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1413 if (utf8_target) { \
cfaf538b 1414 UTF8_CODE \
63ac0dad
KW
1415 } \
1416 else { /* Not utf8 */ \
1417 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1418 tmp = TEST_NON_UTF8(tmp); \
1419 REXEC_FBC_SCAN( \
1420 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1421 tmp = !tmp; \
1422 IF_SUCCESS; \
1423 } \
1424 else { \
1425 IF_FAIL; \
1426 } \
1427 ); \
1428 } \
1429 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1430 goto got_it;
1431
786e8c11
YO
1432/* We know what class REx starts with. Try to find this position... */
1433/* if reginfo is NULL, its a dryrun */
1434/* annoyingly all the vars in this routine have different names from their counterparts
1435 in regmatch. /grrr */
1436
3c3eec57 1437STATIC char *
07be1b83 1438S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1439 const char *strend, regmatch_info *reginfo)
a687059c 1440{
27da23d5 1441 dVAR;
bbe252da 1442 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1443 char *pat_string; /* The pattern's exactish string */
1444 char *pat_end; /* ptr to end char of pat_string */
1445 re_fold_t folder; /* Function for computing non-utf8 folds */
1446 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1447 STRLEN ln;
5dab1207 1448 STRLEN lnc;
eb578fdb 1449 STRLEN uskip;
fac1af77
KW
1450 U8 c1;
1451 U8 c2;
6eb5f6b9 1452 char *e;
eb578fdb
KW
1453 I32 tmp = 1; /* Scratch variable? */
1454 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1455 UV utf8_fold_flags = 0;
f8fc2ecf 1456 RXi_GET_DECL(prog,progi);
7918f24d
NC
1457
1458 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1459
6eb5f6b9
JH
1460 /* We know what class it must start with. */
1461 switch (OP(c)) {
6eb5f6b9 1462 case ANYOF:
e0193e47 1463 if (utf8_target) {
b1e3e569 1464 REXEC_FBC_UTF8_CLASS_SCAN(
635cd5d4 1465 reginclass(prog, c, (U8*)s, utf8_target));
388cc4de
HS
1466 }
1467 else {
6ef69d56 1468 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1469 }
6eb5f6b9 1470 break;
f33976b4 1471 case CANY:
4cadc6a9 1472 REXEC_FBC_SCAN(
24b23f37 1473 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1474 goto got_it;
1475 else
1476 tmp = doevery;
4cadc6a9 1477 );
f33976b4 1478 break;
2f7f8cb1
KW
1479
1480 case EXACTFA:
1481 if (UTF_PATTERN || utf8_target) {
1482 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1483 goto do_exactf_utf8;
1484 }
1485 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1486 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1487 goto do_exactf_non_utf8; /* isn't dealt with by these */
1488
6eb5f6b9 1489 case EXACTF:
62bf7766 1490 if (utf8_target) {
77a6d856
KW
1491
1492 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1493 utf8_fold_flags = 0;
fac1af77
KW
1494 goto do_exactf_utf8;
1495 }
1496 fold_array = PL_fold;
1497 folder = foldEQ;
1498 goto do_exactf_non_utf8;
1499
1500 case EXACTFL:
1501 if (UTF_PATTERN || utf8_target) {
17580e7a 1502 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1503 goto do_exactf_utf8;
1504 }
1505 fold_array = PL_fold_locale;
1506 folder = foldEQ_locale;
16d951b7
KW
1507 goto do_exactf_non_utf8;
1508
3c760661
KW
1509 case EXACTFU_SS:
1510 if (UTF_PATTERN) {
1511 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1512 }
1513 goto do_exactf_utf8;
1514
fab2782b 1515 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1516 case EXACTFU:
1517 if (UTF_PATTERN || utf8_target) {
77a6d856 1518 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1519 goto do_exactf_utf8;
1520 }
1521
1522 /* Any 'ss' in the pattern should have been replaced by regcomp,
1523 * so we don't have to worry here about this single special case
1524 * in the Latin1 range */
1525 fold_array = PL_fold_latin1;
1526 folder = foldEQ_latin1;
fac1af77
KW
1527
1528 /* FALL THROUGH */
1529
62bf7766
KW
1530 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1531 are no glitches with fold-length differences
1532 between the target string and pattern */
fac1af77
KW
1533
1534 /* The idea in the non-utf8 EXACTF* cases is to first find the
1535 * first character of the EXACTF* node and then, if necessary,
1536 * case-insensitively compare the full text of the node. c1 is the
1537 * first character. c2 is its fold. This logic will not work for
1538 * Unicode semantics and the german sharp ss, which hence should
1539 * not be compiled into a node that gets here. */
1540 pat_string = STRING(c);
1541 ln = STR_LEN(c); /* length to match in octets/bytes */
1542
8a90a8fe
KW
1543 /* We know that we have to match at least 'ln' bytes (which is the
1544 * same as characters, since not utf8). If we have to match 3
1545 * characters, and there are only 2 availabe, we know without
1546 * trying that it will fail; so don't start a match past the
1547 * required minimum number from the far end */
fac1af77
KW
1548 e = HOP3c(strend, -((I32)ln), s);
1549
1550 if (!reginfo && e < s) {
1551 e = s; /* Due to minlen logic of intuit() */
1552 }
1553
1554 c1 = *pat_string;
1555 c2 = fold_array[c1];
1556 if (c1 == c2) { /* If char and fold are the same */
1557 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1558 }
1559 else {
1560 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1561 }
1562 break;
1563
1564 do_exactf_utf8:
e067297c
KW
1565 {
1566 unsigned expansion;
1567
fac1af77
KW
1568
1569 /* If one of the operands is in utf8, we can't use the simpler
1570 * folding above, due to the fact that many different characters
1571 * can have the same fold, or portion of a fold, or different-
1572 * length fold */
1573 pat_string = STRING(c);
1574 ln = STR_LEN(c); /* length to match in octets/bytes */
1575 pat_end = pat_string + ln;
1576 lnc = (UTF_PATTERN) /* length to match in characters */
1577 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1578 : ln;
1579
e067297c
KW
1580 /* We have 'lnc' characters to match in the pattern, but because of
1581 * multi-character folding, each character in the target can match
1582 * up to 3 characters (Unicode guarantees it will never exceed
1583 * this) if it is utf8-encoded; and up to 2 if not (based on the
1584 * fact that the Latin 1 folds are already determined, and the
1585 * only multi-char fold in that range is the sharp-s folding to
1586 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1587 * string character. Adjust lnc accordingly, rounding up, so that
1588 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1589 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1590 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1591
1592 /* As in the non-UTF8 case, if we have to match 3 characters, and
1593 * only 2 are left, it's guaranteed to fail, so don't start a
1594 * match that would require us to go beyond the end of the string
1595 */
1596 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1597
1598 if (!reginfo && e < s) {
1599 e = s; /* Due to minlen logic of intuit() */
1600 }
1601
b33105db
KW
1602 /* XXX Note that we could recalculate e to stop the loop earlier,
1603 * as the worst case expansion above will rarely be met, and as we
1604 * go along we would usually find that e moves further to the left.
1605 * This would happen only after we reached the point in the loop
1606 * where if there were no expansion we should fail. Unclear if
1607 * worth the expense */
e067297c 1608
fac1af77
KW
1609 while (s <= e) {
1610 char *my_strend= (char *)strend;
1611 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1612 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1613 && (!reginfo || regtry(reginfo, &s)) )
1614 {
1615 goto got_it;
1616 }
bbdd8bad 1617 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1618 }
1619 break;
e067297c 1620 }
bbce6d69 1621 case BOUNDL:
3280af22 1622 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1623 FBC_BOUND(isALNUM_LC,
1624 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1626 break;
bbce6d69 1627 case NBOUNDL:
3280af22 1628 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1629 FBC_NBOUND(isALNUM_LC,
1630 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631 isALNUM_LC_utf8((U8*)s));
1632 break;
1633 case BOUND:
1634 FBC_BOUND(isWORDCHAR,
1635 isALNUM_uni(tmp),
1636 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1637 break;
cfaf538b
KW
1638 case BOUNDA:
1639 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1640 isWORDCHAR_A(tmp),
1641 isWORDCHAR_A((U8*)s));
1642 break;
a0d0e21e 1643 case NBOUND:
63ac0dad
KW
1644 FBC_NBOUND(isWORDCHAR,
1645 isALNUM_uni(tmp),
1646 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1647 break;
cfaf538b
KW
1648 case NBOUNDA:
1649 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1650 isWORDCHAR_A(tmp),
1651 isWORDCHAR_A((U8*)s));
1652 break;
63ac0dad
KW
1653 case BOUNDU:
1654 FBC_BOUND(isWORDCHAR_L1,
1655 isALNUM_uni(tmp),
1656 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1657 break;
1658 case NBOUNDU:
1659 FBC_NBOUND(isWORDCHAR_L1,
1660 isALNUM_uni(tmp),
1661 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1662 break;
bbce6d69 1663 case ALNUML:
4cadc6a9
YO
1664 REXEC_FBC_CSCAN_TAINT(
1665 isALNUM_LC_utf8((U8*)s),
1666 isALNUM_LC(*s)
1667 );
6895a8aa 1668 break;
980866de
KW
1669 case ALNUMU:
1670 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1671 LOAD_UTF8_CHARCLASS_ALNUM(),
1672 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1673 isWORDCHAR_L1((U8) *s)
1674 );
6895a8aa 1675 break;
980866de
KW
1676 case ALNUM:
1677 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1678 LOAD_UTF8_CHARCLASS_ALNUM(),
1679 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1680 isWORDCHAR((U8) *s)
1681 );
6895a8aa 1682 break;
cfaf538b 1683 case ALNUMA:
8e9da4d4
KW
1684 /* Don't need to worry about utf8, as it can match only a single
1685 * byte invariant character */
cfaf538b 1686 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1687 break;
980866de
KW
1688 case NALNUMU:
1689 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1690 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1691 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1692 ! isWORDCHAR_L1((U8) *s)
1693 );
6895a8aa 1694 break;
a0d0e21e 1695 case NALNUM:
4cadc6a9 1696 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1697 LOAD_UTF8_CHARCLASS_ALNUM(),
1698 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1699 ! isALNUM(*s)
4cadc6a9 1700 );
6895a8aa 1701 break;
cfaf538b 1702 case NALNUMA:
8e9da4d4
KW
1703 REXEC_FBC_CSCAN(
1704 !isWORDCHAR_A(*s),
1705 !isWORDCHAR_A(*s)
1706 );
1707 break;
bbce6d69 1708 case NALNUML:
4cadc6a9
YO
1709 REXEC_FBC_CSCAN_TAINT(
1710 !isALNUM_LC_utf8((U8*)s),
1711 !isALNUM_LC(*s)
1712 );
6895a8aa 1713 break;
980866de 1714 case SPACEU:
bedac28b
KW
1715 REXEC_FBC_CSCAN(
1716 is_XPERLSPACE_utf8(s),
980866de
KW
1717 isSPACE_L1((U8) *s)
1718 );
6895a8aa 1719 break;
a0d0e21e 1720 case SPACE:
bedac28b
KW
1721 REXEC_FBC_CSCAN(
1722 is_XPERLSPACE_utf8(s),
980866de 1723 isSPACE((U8) *s)
4cadc6a9 1724 );
6895a8aa 1725 break;
cfaf538b 1726 case SPACEA:
8e9da4d4
KW
1727 /* Don't need to worry about utf8, as it can match only a single
1728 * byte invariant character */
cfaf538b 1729 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1730 break;
bbce6d69 1731 case SPACEL:
4cadc6a9 1732 REXEC_FBC_CSCAN_TAINT(
6bbba904 1733 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1734 isSPACE_LC(*s)
1735 );
6895a8aa 1736 break;
980866de 1737 case NSPACEU:
bedac28b
KW
1738 REXEC_FBC_CSCAN(
1739 ! is_XPERLSPACE_utf8(s),
980866de
KW
1740 ! isSPACE_L1((U8) *s)
1741 );
6895a8aa 1742 break;
a0d0e21e 1743 case NSPACE:
bedac28b
KW
1744 REXEC_FBC_CSCAN(
1745 ! is_XPERLSPACE_utf8(s),
980866de 1746 ! isSPACE((U8) *s)
4cadc6a9 1747 );
6895a8aa 1748 break;
cfaf538b 1749 case NSPACEA:
8e9da4d4
KW
1750 REXEC_FBC_CSCAN(
1751 !isSPACE_A(*s),
1752 !isSPACE_A(*s)
1753 );
1754 break;
bbce6d69 1755 case NSPACEL:
4cadc6a9 1756 REXEC_FBC_CSCAN_TAINT(
6bbba904 1757 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1758 !isSPACE_LC(*s)
1759 );
6895a8aa 1760 break;
a0d0e21e 1761 case DIGIT:
4cadc6a9 1762 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1763 LOAD_UTF8_CHARCLASS_DIGIT(),
1764 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1765 isDIGIT(*s)
1766 );
6895a8aa 1767 break;
cfaf538b 1768 case DIGITA:
8e9da4d4
KW
1769 /* Don't need to worry about utf8, as it can match only a single
1770 * byte invariant character */
cfaf538b 1771 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1772 break;
b8c5462f 1773 case DIGITL:
4cadc6a9
YO
1774 REXEC_FBC_CSCAN_TAINT(
1775 isDIGIT_LC_utf8((U8*)s),
1776 isDIGIT_LC(*s)
1777 );
6895a8aa 1778 break;
a0d0e21e 1779 case NDIGIT:
4cadc6a9 1780 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1781 LOAD_UTF8_CHARCLASS_DIGIT(),
1782 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1783 !isDIGIT(*s)
1784 );
6895a8aa 1785 break;
cfaf538b 1786 case NDIGITA:
8e9da4d4
KW
1787 REXEC_FBC_CSCAN(
1788 !isDIGIT_A(*s),
1789 !isDIGIT_A(*s)
1790 );
1791 break;
b8c5462f 1792 case NDIGITL:
4cadc6a9
YO
1793 REXEC_FBC_CSCAN_TAINT(
1794 !isDIGIT_LC_utf8((U8*)s),
1795 !isDIGIT_LC(*s)
1796 );
6895a8aa 1797 break;
e1d1eefb
YO
1798 case LNBREAK:
1799 REXEC_FBC_CSCAN(
7016d6eb
DM
1800 is_LNBREAK_utf8_safe(s, strend),
1801 is_LNBREAK_latin1_safe(s, strend)
e1d1eefb 1802 );
6895a8aa 1803 break;
e1d1eefb
YO
1804 case VERTWS:
1805 REXEC_FBC_CSCAN(
7016d6eb
DM
1806 is_VERTWS_utf8_safe(s, strend),
1807 is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1808 );
6895a8aa 1809 break;
e1d1eefb
YO
1810 case NVERTWS:
1811 REXEC_FBC_CSCAN(
7016d6eb
DM
1812 !is_VERTWS_utf8_safe(s, strend),
1813 !is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1814 );
6895a8aa 1815 break;
e1d1eefb
YO
1816 case HORIZWS:
1817 REXEC_FBC_CSCAN(
7016d6eb
DM
1818 is_HORIZWS_utf8_safe(s, strend),
1819 is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1820 );
6895a8aa 1821 break;
e1d1eefb
YO
1822 case NHORIZWS:
1823 REXEC_FBC_CSCAN(
7016d6eb
DM
1824 !is_HORIZWS_utf8_safe(s, strend),
1825 !is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1826 );
6895a8aa 1827 break;
0658cdde
KW
1828 case POSIXA:
1829 /* Don't need to worry about utf8, as it can match only a single
1830 * byte invariant character. The flag in this node type is the
1831 * class number to pass to _generic_isCC() to build a mask for
1832 * searching in PL_charclass[] */
1833 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1834 break;
1835 case NPOSIXA:
1836 REXEC_FBC_CSCAN(
1837 !_generic_isCC_A(*s, FLAGS(c)),
1838 !_generic_isCC_A(*s, FLAGS(c))
1839 );
1840 break;
1841
1de06328
YO
1842 case AHOCORASICKC:
1843 case AHOCORASICK:
07be1b83 1844 {
a0a388a1 1845 DECL_TRIE_TYPE(c);
07be1b83
YO
1846 /* what trie are we using right now */
1847 reg_ac_data *aho
f8fc2ecf 1848 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1849 reg_trie_data *trie
1850 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1851 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1852
1853 const char *last_start = strend - trie->minlen;
6148ee25 1854#ifdef DEBUGGING
07be1b83 1855 const char *real_start = s;
6148ee25 1856#endif
07be1b83 1857 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1858 SV *sv_points;
1859 U8 **points; /* map of where we were in the input string
786e8c11 1860 when reading a given char. For ASCII this
be8e71aa 1861 is unnecessary overhead as the relationship
38a44b82
NC
1862 is always 1:1, but for Unicode, especially
1863 case folded Unicode this is not true. */
f9e705e8 1864 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1865 U8 *bitmap=NULL;
1866
07be1b83
YO
1867
1868 GET_RE_DEBUG_FLAGS_DECL;
1869
be8e71aa
YO
1870 /* We can't just allocate points here. We need to wrap it in
1871 * an SV so it gets freed properly if there is a croak while
1872 * running the match */
1873 ENTER;
1874 SAVETMPS;
1875 sv_points=newSV(maxlen * sizeof(U8 *));
1876 SvCUR_set(sv_points,
1877 maxlen * sizeof(U8 *));
1878 SvPOK_on(sv_points);
1879 sv_2mortal(sv_points);
1880 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1881 if ( trie_type != trie_utf8_fold
1882 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1883 {
786e8c11
YO
1884 if (trie->bitmap)
1885 bitmap=(U8*)trie->bitmap;
1886 else
1887 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1888 }
786e8c11
YO
1889 /* this is the Aho-Corasick algorithm modified a touch
1890 to include special handling for long "unknown char"
1891 sequences. The basic idea being that we use AC as long
1892 as we are dealing with a possible matching char, when
1893 we encounter an unknown char (and we have not encountered
1894 an accepting state) we scan forward until we find a legal
1895 starting char.
1896 AC matching is basically that of trie matching, except
1897 that when we encounter a failing transition, we fall back
1898 to the current states "fail state", and try the current char
1899 again, a process we repeat until we reach the root state,
1900 state 1, or a legal transition. If we fail on the root state
1901 then we can either terminate if we have reached an accepting
1902 state previously, or restart the entire process from the beginning
1903 if we have not.
1904
1905 */
07be1b83
YO
1906 while (s <= last_start) {
1907 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1908 U8 *uc = (U8*)s;
1909 U16 charid = 0;
1910 U32 base = 1;
1911 U32 state = 1;
1912 UV uvc = 0;
1913 STRLEN len = 0;
1914 STRLEN foldlen = 0;
1915 U8 *uscan = (U8*)NULL;
1916 U8 *leftmost = NULL;
786e8c11
YO
1917#ifdef DEBUGGING
1918 U32 accepted_word= 0;
1919#endif
07be1b83
YO
1920 U32 pointpos = 0;
1921
1922 while ( state && uc <= (U8*)strend ) {
1923 int failed=0;
786e8c11
YO
1924 U32 word = aho->states[ state ].wordnum;
1925
1de06328
YO
1926 if( state==1 ) {
1927 if ( bitmap ) {
1928 DEBUG_TRIE_EXECUTE_r(
1929 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1930 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1931 (char *)uc, utf8_target );
1de06328
YO
1932 PerlIO_printf( Perl_debug_log,
1933 " Scanning for legal start char...\n");
1934 }
d085b490
YO
1935 );
1936 if (utf8_target) {
1937 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1938 uc += UTF8SKIP(uc);
1939 }
1940 } else {
1941 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1942 uc++;
1943 }
1944 }
1de06328 1945 s= (char *)uc;
786e8c11 1946 }
786e8c11
YO
1947 if (uc >(U8*)last_start) break;
1948 }
1949
1950 if ( word ) {
2e64971a 1951 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1952 if (!leftmost || lpos < leftmost) {
1953 DEBUG_r(accepted_word=word);
07be1b83 1954 leftmost= lpos;
786e8c11 1955 }
07be1b83 1956 if (base==0) break;
786e8c11 1957
07be1b83
YO
1958 }
1959 points[pointpos++ % maxlen]= uc;
7016d6eb
DM
1960 if (foldlen || uc < (U8*)strend) {
1961 REXEC_TRIE_READ_CHAR(trie_type, trie,
1962 widecharmap, uc,
55eed653
NC
1963 uscan, len, uvc, charid, foldlen,
1964 foldbuf, uniflags);
7016d6eb
DM
1965 DEBUG_TRIE_EXECUTE_r({
1966 dump_exec_pos( (char *)uc, c, strend,
1967 real_start, s, utf8_target);
1968 PerlIO_printf(Perl_debug_log,
1969 " Charid:%3u CP:%4"UVxf" ",
1970 charid, uvc);
1971 });
1972 }
1973 else {
1974 len = 0;
1975 charid = 0;
1976 }
1977
07be1b83
YO
1978
1979 do {
6148ee25 1980#ifdef DEBUGGING
786e8c11 1981 word = aho->states[ state ].wordnum;
6148ee25 1982#endif
07be1b83
YO
1983 base = aho->states[ state ].trans.base;
1984
786e8c11
YO
1985 DEBUG_TRIE_EXECUTE_r({
1986 if (failed)
1987 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1988 s, utf8_target );
07be1b83 1989 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1990 "%sState: %4"UVxf", word=%"UVxf,
1991 failed ? " Fail transition to " : "",
1992 (UV)state, (UV)word);
1993 });
07be1b83
YO
1994 if ( base ) {
1995 U32 tmp;
6dd2be57 1996 I32 offset;
07be1b83 1997 if (charid &&
6dd2be57
DM
1998 ( ((offset = base + charid
1999 - 1 - trie->uniquecharcount)) >= 0)
2000 && ((U32)offset < trie->lasttrans)
2001 && trie->trans[offset].check == state
2002 && (tmp=trie->trans[offset].next))
07be1b83 2003 {
786e8c11
YO
2004 DEBUG_TRIE_EXECUTE_r(
2005 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
2006 state = tmp;
2007 break;
2008 }
2009 else {
786e8c11
YO
2010 DEBUG_TRIE_EXECUTE_r(
2011 PerlIO_printf( Perl_debug_log," - fail\n"));
2012 failed = 1;
2013 state = aho->fail[state];
07be1b83
YO
2014 }
2015 }
2016 else {
2017 /* we must be accepting here */
786e8c11
YO
2018 DEBUG_TRIE_EXECUTE_r(
2019 PerlIO_printf( Perl_debug_log," - accepting\n"));
2020 failed = 1;
07be1b83
YO
2021 break;
2022 }
2023 } while(state);
786e8c11 2024 uc += len;
07be1b83
YO
2025 if (failed) {
2026 if (leftmost)
2027 break;
786e8c11 2028 if (!state) state = 1;
07be1b83
YO
2029 }
2030 }
2031 if ( aho->states[ state ].wordnum ) {
2e64971a 2032 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2033 if (!leftmost || lpos < leftmost) {
2034 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2035 leftmost = lpos;
786e8c11 2036 }
07be1b83 2037 }
07be1b83
YO
2038 if (leftmost) {
2039 s = (char*)leftmost;
786e8c11
YO
2040 DEBUG_TRIE_EXECUTE_r({
2041 PerlIO_printf(
70685ca0
JH
2042 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2043 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2044 );
2045 });
24b23f37 2046 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2047 FREETMPS;
2048 LEAVE;
07be1b83 2049 goto got_it;
be8e71aa 2050 }
07be1b83 2051 s = HOPc(s,1);
786e8c11
YO
2052 DEBUG_TRIE_EXECUTE_r({
2053 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2054 });
07be1b83 2055 } else {
786e8c11
YO
2056 DEBUG_TRIE_EXECUTE_r(
2057 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2058 break;
2059 }
2060 }
be8e71aa
YO
2061 FREETMPS;
2062 LEAVE;
07be1b83
YO
2063 }
2064 break;
b3c9acc1 2065 default:
3c3eec57
GS
2066 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2067 break;
d6a28714 2068 }
6eb5f6b9
JH
2069 return 0;
2070 got_it:
2071 return s;
2072}
2073
fae667d5 2074
6eb5f6b9
JH
2075/*
2076 - regexec_flags - match a regexp against a string
2077 */
2078I32
288b8c02 2079Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9 2080 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2081/* stringarg: the point in the string at which to begin matching */
2082/* strend: pointer to null at end of string */
2083/* strbeg: real beginning of string */
2084/* minend: end of match must be >= minend bytes after stringarg. */
2085/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2086 * itself is accessed via the pointers above */
2087/* data: May be used for some additional optimizations.
2088 Currently its only used, with a U32 cast, for transmitting
2089 the ganch offset when doing a /g match. This will change */
2090/* nosave: For optimizations. */
2091
6eb5f6b9 2092{
97aff369 2093 dVAR;
8d919b0a 2094 struct regexp *const prog = ReANY(rx);
24b23f37 2095 /*register*/ char *s;
eb578fdb 2096 regnode *c;
24b23f37 2097 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2098 I32 minlen; /* must match at least this many chars */
2099 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2100 I32 end_shift = 0; /* Same for the end. */ /* CC */
2101 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2102 char *scream_olds = NULL;
f2ed9b32 2103 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2104 I32 multiline;
f8fc2ecf 2105 RXi_GET_DECL(prog,progi);
3b0527fe 2106 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2107 regexp_paren_pair *swap = NULL;
a3621e74
YO
2108 GET_RE_DEBUG_FLAGS_DECL;
2109
7918f24d 2110 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2111 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2112
2113 /* Be paranoid... */
2114 if (prog == NULL || startpos == NULL) {
2115 Perl_croak(aTHX_ "NULL regexp parameter");
2116 return 0;
2117 }
2118
bbe252da 2119 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2120 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2121
f2ed9b32 2122 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2123 DEBUG_EXECUTE_r(
f2ed9b32 2124 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2125 "Matching");
2126 );
bac06658 2127
6eb5f6b9 2128 minlen = prog->minlen;
1de06328
YO
2129
2130 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2131 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2132 "String too short [regexec_flags]...\n"));
2133 goto phooey;
1aa99e6b 2134 }
6eb5f6b9 2135
1de06328 2136
6eb5f6b9 2137 /* Check validity of program. */
f8fc2ecf 2138 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2139 Perl_croak(aTHX_ "corrupted regexp program");
2140 }
2141
2142 PL_reg_flags = 0;
ed301438 2143 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2144 PL_reg_maxiter = 0;
2145
3c8556c3 2146 if (RX_UTF8(rx))
6eb5f6b9
JH
2147 PL_reg_flags |= RF_utf8;
2148
2149 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2150 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2151 PL_bostr = strbeg;
3b0527fe 2152 reginfo.sv = sv;
6eb5f6b9
JH
2153
2154 /* Mark end of line for $ (and such) */
2155 PL_regeol = strend;
2156
2157 /* see how far we have to get to not match where we matched before */
3b0527fe 2158 reginfo.till = startpos+minend;
6eb5f6b9 2159
6eb5f6b9
JH
2160 /* If there is a "must appear" string, look for it. */
2161 s = startpos;
2162
bbe252da 2163 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2164 MAGIC *mg;
2c296965 2165 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2166 reginfo.ganch = startpos + prog->gofs;
2c296965 2167 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2168 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2169 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2170 && SvMAGIC(sv)
14befaf4
DM
2171 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2172 && mg->mg_len >= 0) {
3b0527fe 2173 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2174 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2175 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2176
bbe252da 2177 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2178 if (s > reginfo.ganch)
6eb5f6b9 2179 goto phooey;
58e23c8d 2180 s = reginfo.ganch - prog->gofs;
2c296965 2181 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2182 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2183 if (s < strbeg)
2184 goto phooey;
6eb5f6b9
JH
2185 }
2186 }
58e23c8d 2187 else if (data) {
70685ca0 2188 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2189 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2190 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2191
2192 } else { /* pos() not defined */
3b0527fe 2193 reginfo.ganch = strbeg;
2c296965
YO
2194 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2195 "GPOS: reginfo.ganch = strbeg\n"));
2196 }
6eb5f6b9 2197 }
288b8c02 2198 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2199 /* We have to be careful. If the previous successful match
2200 was from this regex we don't want a subsequent partially
2201 successful match to clobber the old results.
2202 So when we detect this possibility we add a swap buffer
2203 to the re, and switch the buffer each match. If we fail
2204 we switch it back, otherwise we leave it swapped.
2205 */
2206 swap = prog->offs;
2207 /* do we need a save destructor here for eval dies? */
2208 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2209 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2210 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2211 PTR2UV(prog),
2212 PTR2UV(swap),
2213 PTR2UV(prog->offs)
2214 ));
c74340f9 2215 }
a0714e2c 2216 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2217 re_scream_pos_data d;
2218
2219 d.scream_olds = &scream_olds;
2220 d.scream_pos = &scream_pos;
288b8c02 2221 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2222 if (!s) {
a3621e74 2223 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2224 goto phooey; /* not present */
3fa9c3d7 2225 }
6eb5f6b9
JH
2226 }
2227
1de06328 2228
6eb5f6b9
JH
2229
2230 /* Simplest case: anchored match need be tried only once. */
2231 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2232 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2233 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2234 goto got_it;
bbe252da
YO
2235 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2236 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2237 {
2238 char *end;
2239
2240 if (minlen)
2241 dontbother = minlen - 1;
1aa99e6b 2242 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2243 /* for multiline we only have to try after newlines */
33b8afdf 2244 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2245 /* because of the goto we can not easily reuse the macros for bifurcating the
2246 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2247 if (utf8_target) {
2248 if (s == startpos)
2249 goto after_try_utf8;
2250 while (1) {
2251 if (regtry(&reginfo, &s)) {
2252 goto got_it;
2253 }
2254 after_try_utf8:
2255 if (s > end) {
2256 goto phooey;
2257 }
2258 if (prog->extflags & RXf_USE_INTUIT) {
2259 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2260 if (!s) {
2261 goto phooey;
2262 }
2263 }
2264 else {
2265 s += UTF8SKIP(s);
2266 }
2267 }
2268 } /* end search for check string in unicode */
2269 else {
2270 if (s == startpos) {
2271 goto after_try_latin;
2272 }
2273 while (1) {
2274 if (regtry(&reginfo, &s)) {
2275 goto got_it;
2276 }
2277 after_try_latin:
2278 if (s > end) {
2279 goto phooey;
2280 }
2281 if (prog->extflags & RXf_USE_INTUIT) {
2282 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2283 if (!s) {
2284 goto phooey;
2285 }
2286 }
2287 else {
2288 s++;
2289 }
2290 }
2291 } /* end search for check string in latin*/
2292 } /* end search for check string */
2293 else { /* search for newline */
2294 if (s > startpos) {
2295 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2296 s--;
92f3d482 2297 }
21eede78
YO
2298 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2299 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2300 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2301 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2302 goto got_it;
2303 }
92f3d482
YO
2304 }
2305 } /* end search for newline */
2306 } /* end anchored/multiline check string search */
6eb5f6b9 2307 goto phooey;
bbe252da 2308 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2309 {
486ec47a 2310 /* the warning about reginfo.ganch being used without initialization
bbe252da 2311 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2312 and we only enter this block when the same bit is set. */
58e23c8d 2313 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2314
2315 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2316 goto got_it;
2317 goto phooey;
2318 }
2319
2320 /* Messy cases: unanchored match. */
bbe252da 2321 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2322 /* we have /x+whatever/ */
f2ed9b32 2323 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2324 char ch;
bf93d4cc
GS
2325#ifdef DEBUGGING
2326 int did_match = 0;
2327#endif
f2ed9b32 2328 if (utf8_target) {
7e0d5ad7
KW
2329 if (! prog->anchored_utf8) {
2330 to_utf8_substr(prog);
2331 }
2332 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2333 REXEC_FBC_SCAN(
6eb5f6b9 2334 if (*s == ch) {
a3621e74 2335 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2336 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2337 s += UTF8SKIP(s);
2338 while (s < strend && *s == ch)
2339 s += UTF8SKIP(s);
2340 }
4cadc6a9 2341 );
7e0d5ad7 2342
6eb5f6b9
JH
2343 }
2344 else {
7e0d5ad7
KW
2345 if (! prog->anchored_substr) {
2346 if (! to_byte_substr(prog)) {
6b54ddc5 2347 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2348 }
2349 }
2350 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2351 REXEC_FBC_SCAN(
6eb5f6b9 2352 if (*s == ch) {
a3621e74 2353 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2354 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2355 s++;
2356 while (s < strend && *s == ch)
2357 s++;
2358 }
4cadc6a9 2359 );
6eb5f6b9 2360 }
a3621e74 2361 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2362 PerlIO_printf(Perl_debug_log,
b7953727
JH
2363 "Did not find anchored character...\n")
2364 );
6eb5f6b9 2365 }
a0714e2c
SS
2366 else if (prog->anchored_substr != NULL
2367 || prog->anchored_utf8 != NULL
2368 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2369 && prog->float_max_offset < strend - s)) {
2370 SV *must;
2371 I32 back_max;
2372 I32 back_min;
2373 char *last;
6eb5f6b9 2374 char *last1; /* Last position checked before */
bf93d4cc
GS
2375#ifdef DEBUGGING
2376 int did_match = 0;
2377#endif
33b8afdf 2378 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2379 if (utf8_target) {
2380 if (! prog->anchored_utf8) {
2381 to_utf8_substr(prog);
2382 }
2383 must = prog->anchored_utf8;
2384 }
2385 else {
2386 if (! prog->anchored_substr) {
2387 if (! to_byte_substr(prog)) {
6b54ddc5 2388 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2389 }
2390 }
2391 must = prog->anchored_substr;
2392 }
33b8afdf
JH
2393 back_max = back_min = prog->anchored_offset;
2394 } else {
7e0d5ad7
KW
2395 if (utf8_target) {
2396 if (! prog->float_utf8) {
2397 to_utf8_substr(prog);
2398 }
2399 must = prog->float_utf8;
2400 }
2401 else {
2402 if (! prog->float_substr) {
2403 if (! to_byte_substr(prog)) {
6b54ddc5 2404 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2405 }
2406 }
2407 must = prog->float_substr;
2408 }
33b8afdf
JH
2409 back_max = prog->float_max_offset;
2410 back_min = prog->float_min_offset;
2411 }
1de06328 2412
1de06328
YO
2413 if (back_min<0) {
2414 last = strend;
2415 } else {
2416 last = HOP3c(strend, /* Cannot start after this */
2417 -(I32)(CHR_SVLEN(must)
2418 - (SvTAIL(must) != 0) + back_min), strbeg);
2419 }
6eb5f6b9
JH
2420 if (s > PL_bostr)
2421 last1 = HOPc(s, -1);
2422 else
2423 last1 = s - 1; /* bogus */
2424
a0288114 2425 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2426 check_substr==must. */
2427 scream_pos = -1;
2428 dontbother = end_shift;
2429 strend = HOPc(strend, -dontbother);
2430 while ( (s <= last) &&
c33e64f0 2431 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2432 (unsigned char*)strend, must,
c33e64f0 2433 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2434 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2435 if (HOPc(s, -back_max) > last1) {
2436 last1 = HOPc(s, -back_min);
2437 s = HOPc(s, -back_max);
2438 }
2439 else {
52657f30 2440 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2441
2442 last1 = HOPc(s, -back_min);
52657f30 2443 s = t;
6eb5f6b9 2444 }
f2ed9b32 2445 if (utf8_target) {
6eb5f6b9 2446 while (s <= last1) {
24b23f37 2447 if (regtry(&reginfo, &s))
6eb5f6b9 2448 goto got_it;
7016d6eb
DM
2449 if (s >= last1) {
2450 s++; /* to break out of outer loop */
2451 break;
2452 }
2453 s += UTF8SKIP(s);
6eb5f6b9
JH
2454 }
2455 }
2456 else {
2457 while (s <= last1) {
24b23f37 2458 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2459 goto got_it;
2460 s++;
2461 }
2462 }
2463 }
ab3bbdeb 2464 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2465 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2466 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2467 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2468 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2469 ? "anchored" : "floating"),
ab3bbdeb
YO
2470 quoted, RE_SV_TAIL(must));
2471 });
6eb5f6b9
JH
2472 goto phooey;
2473 }
f8fc2ecf 2474 else if ( (c = progi->regstclass) ) {
f14c76ed 2475 if (minlen) {
f8fc2ecf 2476 const OPCODE op = OP(progi->regstclass);
66e933ab 2477 /* don't bother with what can't match */
786e8c11 2478 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2479 strend = HOPc(strend, -(minlen - 1));
2480 }
a3621e74 2481 DEBUG_EXECUTE_r({
be8e71aa 2482 SV * const prop = sv_newmortal();
32fc9b6a 2483 regprop(prog, prop, c);
0df25f3d 2484 {
f2ed9b32 2485 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2486 s,strend-s,60);
0df25f3d 2487 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2488 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2489 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2490 quoted, (int)(strend - s));
0df25f3d 2491 }
ffc61ed2 2492 });
3b0527fe 2493 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2494 goto got_it;
07be1b83 2495 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2496 }
2497 else {
2498 dontbother = 0;
a0714e2c 2499 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2500 /* Trim the end. */
6af40bd7 2501 char *last= NULL;
33b8afdf 2502 SV* float_real;
c33e64f0
FC
2503 STRLEN len;
2504 const char *little;
33b8afdf 2505
7e0d5ad7
KW
2506 if (utf8_target) {
2507 if (! prog->float_utf8) {
2508 to_utf8_substr(prog);
2509 }
2510 float_real = prog->float_utf8;
2511 }
2512 else {
2513 if (! prog->float_substr) {
2514 if (! to_byte_substr(prog)) {
6b54ddc5 2515 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2516 }
2517 }
2518 float_real = prog->float_substr;
2519 }
d6a28714 2520
c33e64f0
FC
2521 little = SvPV_const(float_real, len);
2522 if (SvTAIL(float_real)) {
7f18ad16
KW
2523 /* This means that float_real contains an artificial \n on
2524 * the end due to the presence of something like this:
2525 * /foo$/ where we can match both "foo" and "foo\n" at the
2526 * end of the string. So we have to compare the end of the
2527 * string first against the float_real without the \n and
2528 * then against the full float_real with the string. We
2529 * have to watch out for cases where the string might be
2530 * smaller than the float_real or the float_real without
2531 * the \n. */
1a13b075
YO
2532 char *checkpos= strend - len;
2533 DEBUG_OPTIMISE_r(
2534 PerlIO_printf(Perl_debug_log,
2535 "%sChecking for float_real.%s\n",
2536 PL_colors[4], PL_colors[5]));
2537 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2538 /* can't match, even if we remove the trailing \n
2539 * string is too short to match */
1a13b075
YO
2540 DEBUG_EXECUTE_r(
2541 PerlIO_printf(Perl_debug_log,
2542 "%sString shorter than required trailing substring, cannot match.%s\n",
2543 PL_colors[4], PL_colors[5]));
2544 goto phooey;
2545 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2546 /* can match, the end of the string matches without the
2547 * "\n" */
1a13b075
YO
2548 last = checkpos + 1;
2549 } else if (checkpos < strbeg) {
7f18ad16
KW
2550 /* cant match, string is too short when the "\n" is
2551 * included */
1a13b075
YO
2552 DEBUG_EXECUTE_r(
2553 PerlIO_printf(Perl_debug_log,
2554 "%sString does not contain required trailing substring, cannot match.%s\n",
2555 PL_colors[4], PL_colors[5]));
2556 goto phooey;
2557 } else if (!multiline) {
7f18ad16
KW
2558 /* non multiline match, so compare with the "\n" at the
2559 * end of the string */
1a13b075
YO
2560 if (memEQ(checkpos, little, len)) {
2561 last= checkpos;
2562 } else {
2563 DEBUG_EXECUTE_r(
2564 PerlIO_printf(Perl_debug_log,
2565 "%sString does not contain required trailing substring, cannot match.%s\n",
2566 PL_colors[4], PL_colors[5]));
2567 goto phooey;
2568 }
2569 } else {
7f18ad16
KW
2570 /* multiline match, so we have to search for a place
2571 * where the full string is located */
d6a28714 2572 goto find_last;
1a13b075 2573 }
c33e64f0 2574 } else {
d6a28714 2575 find_last:
9041c2e3 2576 if (len)
d6a28714 2577 last = rninstr(s, strend, little, little + len);
b8c5462f 2578 else
a0288114 2579 last = strend; /* matching "$" */
b8c5462f 2580 }
6af40bd7 2581 if (!last) {
7f18ad16
KW
2582 /* at one point this block contained a comment which was
2583 * probably incorrect, which said that this was a "should not
2584 * happen" case. Even if it was true when it was written I am
2585 * pretty sure it is not anymore, so I have removed the comment
2586 * and replaced it with this one. Yves */
6bda09f9
YO
2587 DEBUG_EXECUTE_r(
2588 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2589 "String does not contain required substring, cannot match.\n"
2590 ));
2591 goto phooey;
bf93d4cc 2592 }
d6a28714
JH
2593 dontbother = strend - last + prog->float_min_offset;
2594 }
2595 if (minlen && (dontbother < minlen))
2596 dontbother = minlen - 1;
2597 strend -= dontbother; /* this one's always in bytes! */
2598 /* We don't know much -- general case. */
f2ed9b32 2599 if (utf8_target) {
d6a28714 2600 for (;;) {
24b23f37 2601 if (regtry(&reginfo, &s))
d6a28714
JH
2602 goto got_it;
2603 if (s >= strend)
2604 break;
b8c5462f 2605 s += UTF8SKIP(s);
d6a28714
JH
2606 };
2607 }
2608 else {
2609 do {
24b23f37 2610 if (regtry(&reginfo, &s))
d6a28714
JH
2611 goto got_it;
2612 } while (s++ < strend);
2613 }
2614 }
2615
2616 /* Failure. */
2617 goto phooey;
2618
2619got_it:
495f47a5
DM
2620 DEBUG_BUFFERS_r(
2621 if (swap)
2622 PerlIO_printf(Perl_debug_log,
2623 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2624 PTR2UV(prog),
2625 PTR2UV(swap)
2626 );
2627 );
e9105d30 2628 Safefree(swap);
288b8c02 2629 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2630
ed301438 2631 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2632 restore_pos(aTHX_ prog);
5daac39c
NC
2633 if (RXp_PAREN_NAMES(prog))
2634 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2635
2636 /* make sure $`, $&, $', and $digit will work later */
2637 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2638 if (flags & REXEC_COPY_STR) {
f8c7b90f 2639#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2640 if ((SvIsCOW(sv)
2641 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2642 if (DEBUG_C_TEST) {
2643 PerlIO_printf(Perl_debug_log,
2644 "Copy on write: regexp capture, type %d\n",
2645 (int) SvTYPE(sv));
2646 }
77f8f7c1 2647 RX_MATCH_COPY_FREE(rx);
ed252734 2648 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2649 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2650 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2651 prog->sublen = PL_regeol - strbeg;
2652 prog->suboffset = 0;
2653 prog->subcoffset = 0;
ed252734
NC
2654 } else
2655#endif
2656 {
6502e081
DM
2657 I32 min = 0;
2658 I32 max = PL_regeol - strbeg;
2659 I32 sublen;
2660
2661 if ( (flags & REXEC_COPY_SKIP_POST)
2662 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2663 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2664 ) { /* don't copy $' part of string */
3de645a8 2665 U32 n = 0;
6502e081
DM
2666 max = -1;
2667 /* calculate the right-most part of the string covered
2668 * by a capture. Due to look-ahead, this may be to
2669 * the right of $&, so we have to scan all captures */
2670 while (n <= prog->lastparen) {
2671 if (prog->offs[n].end > max)
2672 max = prog->offs[n].end;
2673 n++;
2674 }
2675 if (max == -1)
2676 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2677 ? prog->offs[0].start
2678 : 0;
2679 assert(max >= 0 && max <= PL_regeol - strbeg);
2680 }
2681
2682 if ( (flags & REXEC_COPY_SKIP_PRE)
2683 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2684 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2685 ) { /* don't copy $` part of string */
3de645a8 2686 U32 n = 0;
6502e081
DM
2687 min = max;
2688 /* calculate the left-most part of the string covered
2689 * by a capture. Due to look-behind, this may be to
2690 * the left of $&, so we have to scan all captures */
2691 while (min && n <= prog->lastparen) {
2692 if ( prog->offs[n].start != -1
2693 && prog->offs[n].start < min)
2694 {
2695 min = prog->offs[n].start;
2696 }
2697 n++;
2698 }
2699 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2700 && min > prog->offs[0].end
2701 )
2702 min = prog->offs[0].end;
2703
2704 }
2705
2706 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2707 sublen = max - min;
2708
2709 if (RX_MATCH_COPIED(rx)) {
2710 if (sublen > prog->sublen)
2711 prog->subbeg =
2712 (char*)saferealloc(prog->subbeg, sublen+1);
2713 }
2714 else
2715 prog->subbeg = (char*)safemalloc(sublen+1);
2716 Copy(strbeg + min, prog->subbeg, sublen, char);
2717 prog->subbeg[sublen] = '\0';
2718 prog->suboffset = min;
2719 prog->sublen = sublen;
77f8f7c1 2720 RX_MATCH_COPIED_on(rx);
6502e081 2721 }
6502e081
DM
2722 prog->subcoffset = prog->suboffset;
2723 if (prog->suboffset && utf8_target) {
2724 /* Convert byte offset to chars.
2725 * XXX ideally should only compute this if @-/@+
2726 * has been seen, a la PL_sawampersand ??? */
2727
2728 /* If there's a direct correspondence between the
2729 * string which we're matching and the original SV,
2730 * then we can use the utf8 len cache associated with
2731 * the SV. In particular, it means that under //g,
2732 * sv_pos_b2u() will use the previously cached
2733 * position to speed up working out the new length of
2734 * subcoffset, rather than counting from the start of
2735 * the string each time. This stops
2736 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2737 * from going quadratic */
2738 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2739 sv_pos_b2u(sv, &(prog->subcoffset));
2740 else
2741 prog->subcoffset = utf8_length((U8*)strbeg,
2742 (U8*)(strbeg+prog->suboffset));
2743 }
d6a28714
JH
2744 }
2745 else {
6502e081 2746 RX_MATCH_COPY_FREE(rx);
d6a28714 2747 prog->subbeg = strbeg;
6502e081
DM
2748 prog->suboffset = 0;
2749 prog->subcoffset = 0;
d6a28714
JH
2750 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2751 }
2752 }
9041c2e3 2753
d6a28714
JH
2754 return 1;
2755
2756phooey:
a3621e74 2757 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2758 PL_colors[4], PL_colors[5]));
ed301438 2759 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2760 restore_pos(aTHX_ prog);
e9105d30 2761 if (swap) {
c74340f9 2762 /* we failed :-( roll it back */
495f47a5
DM
2763 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2764 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2765 PTR2UV(prog),
2766 PTR2UV(prog->offs),
2767 PTR2UV(swap)
2768 ));
e9105d30
GG
2769 Safefree(prog->offs);
2770 prog->offs = swap;
2771 }
d6a28714
JH
2772 return 0;
2773}
2774
6bda09f9 2775
ec43f78b
DM
2776/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2777 * Do inc before dec, in case old and new rex are the same */
2778#define SET_reg_curpm(Re2) \
2779 if (PL_reg_state.re_state_eval_setup_done) { \
2780 (void)ReREFCNT_inc(Re2); \
2781 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2782 PM_SETRE((PL_reg_curpm), (Re2)); \
2783 }
2784
2785
d6a28714
JH
2786/*
2787 - regtry - try match at specific point
2788 */
2789STATIC I32 /* 0 failure, 1 success */
f73aaa43 2790S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2791{
97aff369 2792 dVAR;
d6a28714 2793 CHECKPOINT lastcp;
288b8c02 2794 REGEXP *const rx = reginfo->prog;
8d919b0a 2795 regexp *const prog = ReANY(rx);
f73aaa43 2796 I32 result;
f8fc2ecf 2797 RXi_GET_DECL(prog,progi);
a3621e74 2798 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2799
2800 PERL_ARGS_ASSERT_REGTRY;
2801
24b23f37 2802 reginfo->cutpoint=NULL;
d6a28714 2803
ed301438
DM
2804 if ((prog->extflags & RXf_EVAL_SEEN)
2805 && !PL_reg_state.re_state_eval_setup_done)
2806 {
d6a28714
JH
2807 MAGIC *mg;
2808
ed301438 2809 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2810 if (reginfo->sv) {
d6a28714 2811 /* Make $_ available to executed code. */
3b0527fe 2812 if (reginfo->sv != DEFSV) {
59f00321 2813 SAVE_DEFSV;
414bf5ae 2814 DEFSV_set(reginfo->sv);
b8c5462f 2815 }
d6a28714 2816
3b0527fe
DM
2817 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2818 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2819 /* prepare for quick setting of pos */
d300d9fa 2820#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2821 if (SvIsCOW(reginfo->sv))
2822 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2823#endif
3dab1dad 2824 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2825 &PL_vtbl_mglob, NULL, 0);
d6a28714 2826 mg->mg_len = -1;
b8c5462f 2827 }
d6a28714
JH
2828 PL_reg_magic = mg;
2829 PL_reg_oldpos = mg->mg_len;
4f639d21 2830 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2831 }
09687e5a 2832 if (!PL_reg_curpm) {
a02a5408 2833 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2834#ifdef USE_ITHREADS
2835 {
14a49a24 2836 SV* const repointer = &PL_sv_undef;
92313705
NC
2837 /* this regexp is also owned by the new PL_reg_curpm, which
2838 will try to free it. */
d2ece331 2839 av_push(PL_regex_padav, repointer);
09687e5a
AB
2840 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2841 PL_regex_pad = AvARRAY(PL_regex_padav);
2842 }
2843#endif
2844 }
ec43f78b 2845 SET_reg_curpm(rx);
d6a28714
JH
2846 PL_reg_oldcurpm = PL_curpm;
2847 PL_curpm = PL_reg_curpm;
07bc277f 2848 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2849 /* Here is a serious problem: we cannot rewrite subbeg,
2850 since it may be needed if this match fails. Thus
2851 $` inside (?{}) could fail... */
2852 PL_reg_oldsaved = prog->subbeg;
2853 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2854 PL_reg_oldsavedoffset = prog->suboffset;
2855 PL_reg_oldsavedcoffset = prog->suboffset;
f8c7b90f 2856#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2857 PL_nrs = prog->saved_copy;
2858#endif
07bc277f 2859 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2860 }
2861 else
bd61b366 2862 PL_reg_oldsaved = NULL;
d6a28714 2863 prog->subbeg = PL_bostr;
6502e081
DM
2864 prog->suboffset = 0;
2865 prog->subcoffset = 0;
d6a28714
JH
2866 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2867 }
97ca13b7 2868#ifdef DEBUGGING
f73aaa43 2869 PL_reg_starttry = *startposp;
97ca13b7 2870#endif
f73aaa43 2871 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2872 prog->lastparen = 0;
03994de8 2873 prog->lastcloseparen = 0;
d6a28714 2874 PL_regsize = 0;
d6a28714
JH
2875
2876 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2877 to do this again and again, prog->lastparen should take care of
3dd2943c 2878 this! --ilya*/
dafc8851
JH
2879
2880 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2881 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2882 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2883 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2884 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2885 * Meanwhile, this code *is* needed for the
daf18116
JH
2886 * above-mentioned test suite tests to succeed. The common theme
2887 * on those tests seems to be returning null fields from matches.
225593e1 2888 * --jhi updated by dapm */
dafc8851 2889#if 1
d6a28714 2890 if (prog->nparens) {
b93070ed 2891 regexp_paren_pair *pp = prog->offs;
eb578fdb 2892 I32 i;
b93070ed 2893 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2894 ++pp;
2895 pp->start = -1;
2896 pp->end = -1;
d6a28714
JH
2897 }
2898 }
dafc8851 2899#endif
02db2b7b 2900 REGCP_SET(lastcp);
f73aaa43
DM
2901 result = regmatch(reginfo, *startposp, progi->program + 1);
2902 if (result != -1) {
2903 prog->offs[0].end = result;
d6a28714
JH
2904 return 1;
2905 }
24b23f37 2906 if (reginfo->cutpoint)
f73aaa43 2907 *startposp= reginfo->cutpoint;
02db2b7b 2908 REGCP_UNWIND(lastcp);
d6a28714
JH
2909 return 0;
2910}
2911
02db2b7b 2912
8ba1375e
MJD
2913#define sayYES goto yes
2914#define sayNO goto no
262b90c4 2915#define sayNO_SILENT goto no_silent
8ba1375e 2916
f9f4320a
YO
2917/* we dont use STMT_START/END here because it leads to
2918 "unreachable code" warnings, which are bogus, but distracting. */
2919#define CACHEsayNO \
c476f425
DM
2920 if (ST.cache_mask) \
2921 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2922 sayNO
3298f257 2923
a3621e74 2924/* this is used to determine how far from the left messages like
265c4333
YO
2925 'failed...' are printed. It should be set such that messages
2926 are inline with the regop output that created them.
a3621e74 2927*/
265c4333 2928#define REPORT_CODE_OFF 32
a3621e74
YO
2929
2930
40a82448
DM
2931#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2932#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2933#define CHRTEST_NOT_A_CP_1 -999
2934#define CHRTEST_NOT_A_CP_2 -998
9e137952 2935
86545054
DM
2936#define SLAB_FIRST(s) (&(s)->states[0])
2937#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2938
5d9a96ca
DM
2939/* grab a new slab and return the first slot in it */
2940
2941STATIC regmatch_state *
2942S_push_slab(pTHX)
2943{
a35a87e7 2944#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2945 dMY_CXT;
2946#endif
5d9a96ca
DM
2947 regmatch_slab *s = PL_regmatch_slab->next;
2948 if (!s) {
2949 Newx(s, 1, regmatch_slab);
2950 s->prev = PL_regmatch_slab;
2951 s->next = NULL;
2952 PL_regmatch_slab->next = s;
2953 }
2954 PL_regmatch_slab = s;
86545054 2955 return SLAB_FIRST(s);
5d9a96ca 2956}
5b47454d 2957
95b24440 2958
40a82448
DM
2959/* push a new state then goto it */
2960
4d5016e5
DM
2961#define PUSH_STATE_GOTO(state, node, input) \
2962 pushinput = input; \
40a82448
DM
2963 scan = node; \
2964 st->resume_state = state; \
2965 goto push_state;
2966
2967/* push a new state with success backtracking, then goto it */
2968
4d5016e5
DM
2969#define PUSH_YES_STATE_GOTO(state, node, input) \
2970 pushinput = input; \
40a82448
DM
2971 scan = node; \
2972 st->resume_state = state; \
2973 goto push_yes_state;
2974
aa283a38 2975
aa283a38 2976
4d5016e5 2977
d6a28714 2978/*
95b24440 2979
bf1f174e
DM
2980regmatch() - main matching routine
2981
2982This is basically one big switch statement in a loop. We execute an op,
2983set 'next' to point the next op, and continue. If we come to a point which
2984we may need to backtrack to on failure such as (A|B|C), we push a
2985backtrack state onto the backtrack stack. On failure, we pop the top
2986state, and re-enter the loop at the state indicated. If there are no more
2987states to pop, we return failure.
2988
2989Sometimes we also need to backtrack on success; for example /A+/, where
2990after successfully matching one A, we need to go back and try to
2991match another one; similarly for lookahead assertions: if the assertion
2992completes successfully, we backtrack to the state just before the assertion
2993and then carry on. In these cases, the pushed state is marked as
2994'backtrack on success too'. This marking is in fact done by a chain of
2995pointers, each pointing to the previous 'yes' state. On success, we pop to
2996the nearest yes state, discarding any intermediate failure-only states.
2997Sometimes a yes state is pushed just to force some cleanup code to be
2998called at the end of a successful match or submatch; e.g. (??{$re}) uses
2999it to free the inner regex.
3000
3001Note that failure backtracking rewinds the cursor position, while
3002success backtracking leaves it alone.
3003
3004A pattern is complete when the END op is executed, while a subpattern
3005such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3006ops trigger the "pop to last yes state if any, otherwise return true"
3007behaviour.
3008
3009A common convention in this function is to use A and B to refer to the two
3010subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3011the subpattern to be matched possibly multiple times, while B is the entire
3012rest of the pattern. Variable and state names reflect this convention.
3013
3014The states in the main switch are the union of ops and failure/success of
3015substates associated with with that op. For example, IFMATCH is the op
3016that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3017'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3018successfully matched A and IFMATCH_A_fail is a state saying that we have
3019just failed to match A. Resume states always come in pairs. The backtrack
3020state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3021at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3022on success or failure.
3023
3024The struct that holds a backtracking state is actually a big union, with
3025one variant for each major type of op. The variable st points to the
3026top-most backtrack struct. To make the code clearer, within each
3027block of code we #define ST to alias the relevant union.
3028
3029Here's a concrete example of a (vastly oversimplified) IFMATCH
3030implementation:
3031
3032 switch (state) {
3033 ....
3034
3035#define ST st->u.ifmatch
3036
3037 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3038 ST.foo = ...; // some state we wish to save
95b24440 3039 ...
bf1f174e
DM
3040 // push a yes backtrack state with a resume value of
3041 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3042 // first node of A:
4d5016e5 3043 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3044 // NOTREACHED
3045
3046 case IFMATCH_A: // we have successfully executed A; now continue with B
3047 next = B;
3048 bar = ST.foo; // do something with the preserved value
3049 break;
3050
3051 case IFMATCH_A_fail: // A failed, so the assertion failed
3052 ...; // do some housekeeping, then ...
3053 sayNO; // propagate the failure
3054
3055#undef ST
95b24440 3056
bf1f174e
DM
3057 ...
3058 }
95b24440 3059
bf1f174e
DM
3060For any old-timers reading this who are familiar with the old recursive
3061approach, the code above is equivalent to:
95b24440 3062
bf1f174e
DM
3063 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3064 {
3065 int foo = ...
95b24440 3066 ...
bf1f174e
DM
3067 if (regmatch(A)) {
3068 next = B;
3069 bar = foo;
3070 break;
95b24440 3071 }
bf1f174e
DM
3072 ...; // do some housekeeping, then ...
3073 sayNO; // propagate the failure
95b24440 3074 }
bf1f174e
DM
3075
3076The topmost backtrack state, pointed to by st, is usually free. If you
3077want to claim it, populate any ST.foo fields in it with values you wish to
3078save, then do one of
3079
4d5016e5
DM
3080 PUSH_STATE_GOTO(resume_state, node, newinput);
3081 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3082
3083which sets that backtrack state's resume value to 'resume_state', pushes a
3084new free entry to the top of the backtrack stack, then goes to 'node'.
3085On backtracking, the free slot is popped, and the saved state becomes the
3086new free state. An ST.foo field in this new top state can be temporarily
3087accessed to retrieve values, but once the main loop is re-entered, it
3088becomes available for reuse.
3089
3090Note that the depth of the backtrack stack constantly increases during the
3091left-to-right execution of the pattern, rather than going up and down with
3092the pattern nesting. For example the stack is at its maximum at Z at the
3093end of the pattern, rather than at X in the following:
3094
3095 /(((X)+)+)+....(Y)+....Z/
3096
3097The only exceptions to this are lookahead/behind assertions and the cut,
3098(?>A), which pop all the backtrack states associated with A before
3099continuing.
3100
486ec47a 3101Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3102PL_regmatch_state and st always point to the currently active state,
3103and PL_regmatch_slab points to the slab currently containing
3104PL_regmatch_state. The first time regmatch() is called, the first slab is
3105allocated, and is never freed until interpreter destruction. When the slab
3106is full, a new one is allocated and chained to the end. At exit from
3107regmatch(), slabs allocated since entry are freed.
3108
3109*/
95b24440 3110
40a82448 3111
5bc10b2c 3112#define DEBUG_STATE_pp(pp) \
265c4333 3113 DEBUG_STATE_r({ \
f2ed9b32 3114 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3115 PerlIO_printf(Perl_debug_log, \
5d458dd8 3116 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3117 depth*2, "", \
13d6edb4 3118 PL_reg_name[st->resume_state], \
5d458dd8
YO
3119 ((st==yes_state||st==mark_state) ? "[" : ""), \
3120 ((st==yes_state) ? "Y" : ""), \
3121 ((st==mark_state) ? "M" : ""), \
3122 ((st==yes_state||st==mark_state) ? "]" : "") \
3123 ); \
265c4333 3124 });
5bc10b2c 3125
40a82448 3126
3dab1dad 3127#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3128
3df15adc 3129#ifdef DEBUGGING
5bc10b2c 3130
ab3bbdeb 3131STATIC void
f2ed9b32 3132S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3133 const char *start, const char *end, const char *blurb)
3134{
efd26800 3135 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3136
3137 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3138
ab3bbdeb
YO
3139 if (!PL_colorset)
3140 reginitcolors();
3141 {
3142 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3143 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3144
f2ed9b32 3145 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3146 start, end - start, 60);
3147
3148 PerlIO_printf(Perl_debug_log,
3149 "%s%s REx%s %s against %s\n",
3150 PL_colors[4], blurb, PL_colors[5], s0, s1);
3151
f2ed9b32 3152 if (utf8_target||utf8_pat)
1de06328
YO
3153 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3154 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3155 utf8_pat && utf8_target ? " and " : "",
3156 utf8_target ? "string" : ""
ab3bbdeb
YO
3157 );
3158 }
3159}
3df15adc
YO
3160
3161STATIC void
786e8c11
YO
3162S_dump_exec_pos(pTHX_ const char *locinput,
3163 const regnode *scan,
3164 const char *loc_regeol,
3165 const char *loc_bostr,
3166 const char *loc_reg_starttry,
f2ed9b32 3167 const bool utf8_target)
07be1b83 3168{
786e8c11 3169 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3170 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3171 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3172 /* The part of the string before starttry has one color
3173 (pref0_len chars), between starttry and current
3174 position another one (pref_len - pref0_len chars),
3175 after the current position the third one.
3176 We assume that pref0_len <= pref_len, otherwise we
3177 decrease pref0_len. */
786e8c11
YO
3178 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3179 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3180 int pref0_len;
3181
7918f24d
NC
3182 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3183
f2ed9b32 3184 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3185 pref_len++;
786e8c11
YO
3186 pref0_len = pref_len - (locinput - loc_reg_starttry);
3187 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3188 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3189 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3190 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3191 l--;
3192 if (pref0_len < 0)
3193 pref0_len = 0;
3194 if (pref0_len > pref_len)
3195 pref0_len = pref_len;
3196 {
f2ed9b32 3197 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3198
ab3bbdeb 3199 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3200 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3201
ab3bbdeb 3202 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3203 (locinput - pref_len + pref0_len),
1de06328 3204 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3205
ab3bbdeb 3206 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3207 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3208
1de06328 3209 const STRLEN tlen=len0+len1+len2;
3df15adc 3210 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3211 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3212 (IV)(locinput - loc_bostr),
07be1b83 3213 len0, s0,
07be1b83 3214 len1, s1,
07be1b83 3215 (docolor ? "" : "> <"),
07be1b83 3216 len2, s2,
f9f4320a 3217 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3218 "");
3219 }
3220}
3df15adc 3221
07be1b83
YO
3222#endif
3223
0a4db386
YO
3224/* reg_check_named_buff_matched()
3225 * Checks to see if a named buffer has matched. The data array of
3226 * buffer numbers corresponding to the buffer is expected to reside
3227 * in the regexp->data->data array in the slot stored in the ARG() of
3228 * node involved. Note that this routine doesn't actually care about the
3229 * name, that information is not preserved from compilation to execution.
3230 * Returns the index of the leftmost defined buffer with the given name
3231 * or 0 if non of the buffers matched.
3232 */
3233STATIC I32
7918f24d
NC
3234S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3235{
0a4db386 3236 I32 n;
f8fc2ecf 3237 RXi_GET_DECL(rex,rexi);
ad64d0ec 3238 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3239 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3240
3241 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3242
0a4db386 3243 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3244 if ((I32)rex->lastparen >= nums[n] &&
3245 rex->offs[nums[n]].end != -1)
0a4db386
YO
3246 {
3247 return nums[n];
3248 }
3249 }
3250 return 0;
3251}
3252
2f554ef7
DM
3253
3254/* free all slabs above current one - called during LEAVE_SCOPE */
3255
3256STATIC void
3257S_clear_backtrack_stack(pTHX_ void *p)
3258{
3259 regmatch_slab *s = PL_regmatch_slab->next;
3260 PERL_UNUSED_ARG(p);
3261
3262 if (!s)
3263 return;
3264 PL_regmatch_slab->next = NULL;
3265 while (s) {
3266 regmatch_slab * const osl = s;
3267 s = s->next;
3268 Safefree(osl);
3269 }
3270}
c74f6de9 3271static bool
79a2a0e8 3272S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
c74f6de9 3273{
79a2a0e8
KW
3274 /* This function determines if there are one or two characters that match
3275 * the first character of the passed-in EXACTish node <text_node>, and if
3276 * so, returns them in the passed-in pointers.
c74f6de9 3277 *
79a2a0e8
KW
3278 * If it determines that no possible character in the target string can
3279 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3280 * the first character in <text_node> requires UTF-8 to represent, and the
3281 * target string isn't in UTF-8.)
c74f6de9 3282 *
79a2a0e8
KW
3283 * If there are more than two characters that could match the beginning of
3284 * <text_node>, or if more context is required to determine a match or not,
3285 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3286 *
3287 * The motiviation behind this function is to allow the caller to set up
3288 * tight loops for matching. If <text_node> is of type EXACT, there is
3289 * only one possible character that can match its first character, and so
3290 * the situation is quite simple. But things get much more complicated if
3291 * folding is involved. It may be that the first character of an EXACTFish
3292 * node doesn't participate in any possible fold, e.g., punctuation, so it
3293 * can be matched only by itself. The vast majority of characters that are
3294 * in folds match just two things, their lower and upper-case equivalents.
3295 * But not all are like that; some have multiple possible matches, or match
3296 * sequences of more than one character. This function sorts all that out.
3297 *
3298 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3299 * loop of trying to match A*, we know we can't exit where the thing
3300 * following it isn't a B. And something can't be a B unless it is the
3301 * beginning of B. By putting a quick test for that beginning in a tight
3302 * loop, we can rule out things that can't possibly be B without having to
3303 * break out of the loop, thus avoiding work. Similarly, if A is a single
3304 * character, we can make a tight loop matching A*, using the outputs of
3305 * this function.
3306 *
3307 * If the target string to match isn't in UTF-8, and there aren't
3308 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3309 * the one or two possible octets (which are characters in this situation)
3310 * that can match. In all cases, if there is only one character that can
3311 * match, *<c1p> and *<c2p> will be identical.
3312 *
3313 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3314 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3315 * can match the beginning of <text_node>. They should be declared with at
3316 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3317 * undefined what these contain.) If one or both of the buffers are
3318 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3319 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3320 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3321 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3322 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3323
3324 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3325
ddb0d839
KW
3326 UV c1 = CHRTEST_NOT_A_CP_1;
3327 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3328 bool use_chrtest_void = FALSE;
3329
3330 /* Used when we have both utf8 input and utf8 output, to avoid converting
3331 * to/from code points */
3332 bool utf8_has_been_setup = FALSE;
3333
c74f6de9
KW
3334 dVAR;
3335
b4291290 3336 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3337
79a2a0e8
KW
3338 if (OP(text_node) == EXACT) {
3339
3340 /* In an exact node, only one thing can be matched, that first
3341 * character. If both the pat and the target are UTF-8, we can just
3342 * copy the input to the output, avoiding finding the code point of
3343 * that character */
3344 if (! UTF_PATTERN) {
3345 c2 = c1 = *pat;
3346 }
3347 else if (utf8_target) {
3348 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3349 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3350 utf8_has_been_setup = TRUE;
3351 }
3352 else {
3353 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3354 }
79a2a0e8
KW
3355 }
3356 else /* an EXACTFish node */
3357 if ((UTF_PATTERN
3358 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3359 pat + STR_LEN(text_node)))
3360 || (! UTF_PATTERN
3361 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3362 pat + STR_LEN(text_node))))
3363 {
3364 /* Multi-character folds require more context to sort out. Also
3365 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3366 * handled outside this routine */
3367 use_chrtest_void = TRUE;
3368 }
3369 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3370 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3371 if (c1 > 256) {
3372 /* Load the folds hash, if not already done */
3373 SV** listp;
3374 if (! PL_utf8_foldclosures) {
3375 if (! PL_utf8_tofold) {
3376 U8 dummy[UTF8_MAXBYTES+1];
3377
3378 /* Force loading this by folding an above-Latin1 char */
3379 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3380 assert(PL_utf8_tofold); /* Verify that worked */
3381 }
3382 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3383 }
3384
3385 /* The fold closures data structure is a hash with the keys being
3386 * the UTF-8 of every character that is folded to, like 'k', and
3387 * the values each an array of all code points that fold to its
3388 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3389 * not included */
3390 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3391 (char *) pat,
3392 UTF8SKIP(pat),
3393 FALSE))))
3394 {
3395 /* Not found in the hash, therefore there are no folds
3396 * containing it, so there is only a single character that
3397 * could match */
3398 c2 = c1;
3399 }
3400 else { /* Does participate in folds */
3401 AV* list = (AV*) *listp;
3402 if (av_len(list) != 1) {
3403
3404 /* If there aren't exactly two folds to this, it is outside
3405 * the scope of this function */
3406 use_chrtest_void = TRUE;
3407 }
3408 else { /* There are two. Get them */
3409 SV** c_p = av_fetch(list, 0, FALSE);
3410 if (c_p == NULL) {
3411 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3412 }
3413 c1 = SvUV(*c_p);
3414
3415 c_p = av_fetch(list, 1, FALSE);
3416 if (c_p == NULL) {
3417 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3418 }
3419 c2 = SvUV(*c_p);
3420
3421 /* Folds that cross the 255/256 boundary are forbidden if
3422 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3423 * pattern character is above 256, and its only other match
3424 * is below 256, the only legal match will be to itself.
3425 * We have thrown away the original, so have to compute
3426 * which is the one above 255 */
3427 if ((c1 < 256) != (c2 < 256)) {
3428 if (OP(text_node) == EXACTFL
3429 || (OP(text_node) == EXACTFA
3430 && (isASCII(c1) || isASCII(c2))))
3431 {
3432 if (c1 < 256) {
3433 c1 = c2;
3434 }
3435 else {
3436 c2 = c1;
3437 }
3438 }
3439 }
3440 }
3441 }
3442 }
3443 else /* Here, c1 is < 255 */
3444 if (utf8_target
3445 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3446 && OP(text_node) != EXACTFL
3447 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3448 {
3449 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3450 * folds to this character in the pattern. All such cases except
3451 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3452 * involved in their folds, so are outside the scope of this
3453 * function */
3454 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3455