This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: comments, white-space only
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
ef07e810 96#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 97#define RF_warned 2 /* warned about big count? */
faec1544 98
ab3bbdeb 99#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 100
f2ed9b32 101#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 102
c74f6de9
KW
103#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
104
a687059c
LW
105#ifndef STATIC
106#define STATIC static
107#endif
108
e0193e47 109/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
635cd5d4 112#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 113 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 114
c277df42
IZ
115/*
116 * Forwards.
117 */
118
f2ed9b32 119#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 120#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 121
3dab1dad
YO
122#define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
52657f30 124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
125 : (U8*)(pos + off))
126#define HOPBACKc(pos, off) \
07be1b83
YO
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
8e11feef 130 ? (U8*)pos - off \
3dab1dad 131 : NULL)
efb30f32 132
e7409c1b 133#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 134#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 135
7016d6eb
DM
136
137#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138#define NEXTCHR_IS_EOS (nextchr < 0)
139
140#define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
142
143#define SET_locinput(p) \
144 locinput = (p); \
145 SET_nextchr
146
147
20d0b1e9 148/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 149#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 150 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 151 bool ok; \
9c4fdda1 152 ENTER; save_re_context(); \
cf54f63a
JL
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
156/* Doesn't do an assert to verify that is correct */
157#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 158 if (!CAT2(PL_utf8_,class)) { \
cb65b687
DM
159 bool throw_away; \
160 PERL_UNUSED_VAR(throw_away); \
9c4fdda1
RB
161 ENTER; save_re_context(); \
162 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
c1976674 163 PERL_UNUSED_VAR(throw_away); \
9c4fdda1 164 LEAVE; } } STMT_END
37e2e78e 165
1a4fad37
AL
166#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
167#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
51371543 168
37e2e78e 169#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
170 /* No asserts are done for some of these, in case called on a */ \
171 /* Unicode version in which they map to nothing */ \
9566f14a 172 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 173 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 174
1dcf4a1b 175#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 176
ee9a90b8
KW
177/* The actual code for CCC_TRY, which uses several variables from the routine
178 * it's callable from. It is designed to be the bulk of a case statement.
179 * FUNC is the macro or function to call on non-utf8 targets that indicate if
180 * nextchr matches the class.
181 * UTF8_TEST is the whole test string to use for utf8 targets
182 * LOAD is what to use to test, and if not present to load in the swash for the
183 * class
184 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
185 * UTF8_TEST test.
186 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
187 * utf8 and a variant, load the swash if necessary and test using the utf8
188 * test. Advance to the next character if test is ok, otherwise fail; If not
189 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
190 * fails, or advance to the next character */
191
192#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
9566f14a 193 if (NEXTCHR_IS_EOS) { \
ee9a90b8
KW
194 sayNO; \
195 } \
196 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
197 LOAD_UTF8_CHARCLASS(CLASS, STR); \
198 if (POS_OR_NEG (UTF8_TEST)) { \
199 sayNO; \
200 } \
ee9a90b8 201 } \
28b98f76
DM
202 else if (POS_OR_NEG (FUNC(nextchr))) { \
203 sayNO; \
ee9a90b8 204 } \
28b98f76 205 goto increment_locinput;
980866de 206
ee9a90b8
KW
207/* Handle the non-locale cases for a character class and its complement. It
208 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
209 * This is because that code fails when the test succeeds, so we want to have
210 * the test fail so that the code succeeds. The swash is stored in a
211 * predictable PL_ place */
cfaf538b
KW
212#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
213 CLASS, STR) \
ee9a90b8
KW
214 case NAME: \
215 _CCC_TRY_CODE( !, FUNC, \
216 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
217 (U8*)locinput, TRUE)), \
218 CLASS, STR) \
219 case NNAME: \
1dcf4a1b 220 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
221 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
222 (U8*)locinput, TRUE)), \
9566f14a 223 CLASS, STR)
ee9a90b8
KW
224/* Generate the case statements for both locale and non-locale character
225 * classes in regmatch for classes that don't have special unicode semantics.
226 * Locales don't use an immediate swash, but an intermediary special locale
227 * function that is called on the pointer to the current place in the input
228 * string. That function will resolve to needing the same swash. One might
229 * think that because we don't know what the locale will match, we shouldn't
230 * check with the swash loading function that it loaded properly; ie, that we
231 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
232 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
233 * irrelevant here */
234#define CCC_TRY(NAME, NNAME, FUNC, \
235 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 236 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
237 CLASS, STR) \
238 case NAMEL: \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
241 case NNAMEL: \
242 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
243 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
244 CLASS, STR) \
cfaf538b 245 case NAMEA: \
7016d6eb 246 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
cfaf538b
KW
247 sayNO; \
248 } \
249 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
3640db6b 250 locinput++; \
cfaf538b
KW
251 break; \
252 case NNAMEA: \
7016d6eb 253 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
cfaf538b
KW
254 sayNO; \
255 } \
28b98f76 256 goto increment_locinput; \
ee9a90b8
KW
257 /* Generate the non-locale cases */ \
258 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
259
260/* This is like CCC_TRY, but has an extra set of parameters for generating case
261 * statements to handle separate Unicode semantics nodes */
262#define CCC_TRY_U(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEU, NNAMEU, FUNCU, \
cfaf538b 265 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 266 CLASS, STR) \
cfaf538b
KW
267 CCC_TRY(NAME, NNAME, FUNC, \
268 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
269 NAMEA, NNAMEA, FUNCA, \
270 CLASS, STR) \
ee9a90b8 271 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 272
3dab1dad
YO
273/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
274
5f80c4cf 275/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
276/* it would be nice to rework regcomp.sym to generate this stuff. sigh
277 *
278 * NOTE that *nothing* that affects backtracking should be in here, specifically
279 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
280 * node that is in between two EXACT like nodes when ascertaining what the required
281 * "follow" character is. This should probably be moved to regex compile time
282 * although it may be done at run time beause of the REF possibility - more
283 * investigation required. -- demerphq
284*/
3e901dc0
YO
285#define JUMPABLE(rn) ( \
286 OP(rn) == OPEN || \
287 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
288 OP(rn) == EVAL || \
cca55fe3
JP
289 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
290 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 291 OP(rn) == KEEPS || \
3dab1dad 292 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 293)
ee9b8eae 294#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 295
ee9b8eae
YO
296#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
297
298#if 0
299/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
300 we don't need this definition. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 302#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
303#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
304
305#else
306/* ... so we use this as its faster. */
307#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 308#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
309#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
310#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
311
312#endif
e2d8ce26 313
a84d97b6
HS
314/*
315 Search for mandatory following text node; for lookahead, the text must
316 follow but for lookbehind (rn->flags != 0) we skip to the next step.
317*/
cca55fe3 318#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
319 while (JUMPABLE(rn)) { \
320 const OPCODE type = OP(rn); \
321 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 322 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 323 else if (type == PLUS) \
cca55fe3 324 rn = NEXTOPER(rn); \
3dab1dad 325 else if (type == IFMATCH) \
a84d97b6 326 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 327 else rn += NEXT_OFF(rn); \
3dab1dad 328 } \
5f80c4cf 329} STMT_END
74750237 330
22913b96
KW
331/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
332 * These are for the pre-composed Hangul syllables, which are all in a
333 * contiguous block and arranged there in such a way so as to facilitate
334 * alorithmic determination of their characteristics. As such, they don't need
335 * a swash, but can be determined by simple arithmetic. Almost all are
336 * GCB=LVT, but every 28th one is a GCB=LV */
337#define SBASE 0xAC00 /* Start of block */
338#define SCount 11172 /* Length of block */
339#define TCount 28
c476f425 340
acfe0abc 341static void restore_pos(pTHX_ void *arg);
51371543 342
87c0511b 343#define REGCP_PAREN_ELEMS 3
f067efbf 344#define REGCP_OTHER_ELEMS 3
e0fa7e2b 345#define REGCP_FRAME_ELEMS 1
620d5b66
NC
346/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
347 * are needed for the regexp context stack bookkeeping. */
348
76e3520e 349STATIC CHECKPOINT
92da3157 350S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 351{
97aff369 352 dVAR;
a3b680e6 353 const int retval = PL_savestack_ix;
92da3157
DM
354 const int paren_elems_to_push =
355 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
356 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
357 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 358 I32 p;
40a82448 359 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 360
b93070ed
DM
361 PERL_ARGS_ASSERT_REGCPPUSH;
362
e49a9654 363 if (paren_elems_to_push < 0)
5637ef5b
NC
364 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
365 paren_elems_to_push);
e49a9654 366
e0fa7e2b
NC
367 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
368 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 369 " out of range (%lu-%ld)",
92da3157
DM
370 total_elems,
371 (unsigned long)maxopenparen,
372 (long)parenfloor);
e0fa7e2b 373
620d5b66 374 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 375
495f47a5 376 DEBUG_BUFFERS_r(
92da3157 377 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
378 PerlIO_printf(Perl_debug_log,
379 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
380 PTR2UV(rex),
381 PTR2UV(rex->offs)
382 );
383 );
92da3157 384 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 385/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
386 SSPUSHINT(rex->offs[p].end);
387 SSPUSHINT(rex->offs[p].start);
1ca2007e 388 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 389 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
390 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
391 (UV)p,
392 (IV)rex->offs[p].start,
393 (IV)rex->offs[p].start_tmp,
394 (IV)rex->offs[p].end
40a82448 395 ));
a0d0e21e 396 }
b1ce53c5 397/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 398 SSPUSHINT(maxopenparen);
b93070ed
DM
399 SSPUSHINT(rex->lastparen);
400 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 401 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 402
a0d0e21e
LW
403 return retval;
404}
405
c277df42 406/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
407#define REGCP_SET(cp) \
408 DEBUG_STATE_r( \
ab3bbdeb 409 PerlIO_printf(Perl_debug_log, \
e4f74956 410 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
411 (IV)PL_savestack_ix)); \
412 cp = PL_savestack_ix
c3464db5 413
ab3bbdeb 414#define REGCP_UNWIND(cp) \
e4f74956 415 DEBUG_STATE_r( \
ab3bbdeb 416 if (cp != PL_savestack_ix) \
e4f74956
YO
417 PerlIO_printf(Perl_debug_log, \
418 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
419 (IV)(cp), (IV)PL_savestack_ix)); \
420 regcpblow(cp)
c277df42 421
a8d1f4b4
DM
422#define UNWIND_PAREN(lp, lcp) \
423 for (n = rex->lastparen; n > lp; n--) \
424 rex->offs[n].end = -1; \
425 rex->lastparen = n; \
426 rex->lastcloseparen = lcp;
427
428
f067efbf 429STATIC void
92da3157 430S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 431{
97aff369 432 dVAR;
e0fa7e2b 433 UV i;
87c0511b 434 U32 paren;
a3621e74
YO
435 GET_RE_DEBUG_FLAGS_DECL;
436
7918f24d
NC
437 PERL_ARGS_ASSERT_REGCPPOP;
438
b1ce53c5 439 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 440 i = SSPOPUV;
e0fa7e2b
NC
441 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
442 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
443 rex->lastcloseparen = SSPOPINT;
444 rex->lastparen = SSPOPINT;
92da3157 445 *maxopenparen_p = SSPOPINT;
b1ce53c5 446
620d5b66 447 i -= REGCP_OTHER_ELEMS;
b1ce53c5 448 /* Now restore the parentheses context. */
495f47a5
DM
449 DEBUG_BUFFERS_r(
450 if (i || rex->lastparen + 1 <= rex->nparens)
451 PerlIO_printf(Perl_debug_log,
452 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
453 PTR2UV(rex),
454 PTR2UV(rex->offs)
455 );
456 );
92da3157 457 paren = *maxopenparen_p;
620d5b66 458 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 459 I32 tmps;
1ca2007e 460 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 461 rex->offs[paren].start = SSPOPINT;
cf93c79d 462 tmps = SSPOPINT;
b93070ed
DM
463 if (paren <= rex->lastparen)
464 rex->offs[paren].end = tmps;
495f47a5
DM
465 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
466 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
467 (UV)paren,
468 (IV)rex->offs[paren].start,
469 (IV)rex->offs[paren].start_tmp,
470 (IV)rex->offs[paren].end,
471 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 472 );
87c0511b 473 paren--;
a0d0e21e 474 }
daf18116 475#if 1
dafc8851
JH
476 /* It would seem that the similar code in regtry()
477 * already takes care of this, and in fact it is in
478 * a better location to since this code can #if 0-ed out
479 * but the code in regtry() is needed or otherwise tests
480 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
481 * (as of patchlevel 7877) will fail. Then again,
482 * this code seems to be necessary or otherwise
225593e1
DM
483 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
484 * --jhi updated by dapm */
b93070ed 485 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 486 if (i > *maxopenparen_p)
b93070ed
DM
487 rex->offs[i].start = -1;
488 rex->offs[i].end = -1;
495f47a5
DM
489 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
490 " \\%"UVuf": %s ..-1 undeffing\n",
491 (UV)i,
92da3157 492 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 493 ));
a0d0e21e 494 }
dafc8851 495#endif
a0d0e21e
LW
496}
497
74088413
DM
498/* restore the parens and associated vars at savestack position ix,
499 * but without popping the stack */
500
501STATIC void
92da3157 502S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
503{
504 I32 tmpix = PL_savestack_ix;
505 PL_savestack_ix = ix;
92da3157 506 regcppop(rex, maxopenparen_p);
74088413
DM
507 PL_savestack_ix = tmpix;
508}
509
02db2b7b 510#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 511
31c7f561
KW
512STATIC bool
513S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
514{
515 /* Returns a boolean as to whether or not 'character' is a member of the
516 * Posix character class given by 'classnum' that should be equivalent to a
517 * value in the typedef '_char_class_number'.
518 *
519 * Ideally this could be replaced by a just an array of function pointers
520 * to the C library functions that implement the macros this calls.
521 * However, to compile, the precise function signatures are required, and
522 * these may vary from platform to to platform. To avoid having to figure
523 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
524 * which adds an extra layer of function call overhead (unless the C
525 * optimizer strips it away). But we don't particularly care about
526 * performance with locales anyway. */
31c7f561
KW
527
528 switch ((_char_class_number) classnum) {
15861f94 529 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561
KW
530 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
531 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
532 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
533 case _CC_ENUM_LOWER: return isLOWER_LC(character);
534 case _CC_ENUM_PRINT: return isPRINT_LC(character);
535 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
536 case _CC_ENUM_UPPER: return isUPPER_LC(character);
537 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
538 case _CC_ENUM_SPACE: return isSPACE_LC(character);
539 case _CC_ENUM_BLANK: return isBLANK_LC(character);
540 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
541 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
542 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
543 case _CC_ENUM_ASCII: return isASCII_LC(character);
544 default: /* VERTSPACE should never occur in locales */
545 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
546 }
547
548 assert(0); /* NOTREACHED */
549 return FALSE;
550}
551
a687059c 552/*
e50aee73 553 * pregexec and friends
a687059c
LW
554 */
555
76234dfb 556#ifndef PERL_IN_XSUB_RE
a687059c 557/*
c277df42 558 - pregexec - match a regexp against a string
a687059c 559 */
c277df42 560I32
5aaab254 561Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 562 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
563/* stringarg: the point in the string at which to begin matching */
564/* strend: pointer to null at end of string */
565/* strbeg: real beginning of string */
566/* minend: end of match must be >= minend bytes after stringarg. */
567/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
568 * itself is accessed via the pointers above */
569/* nosave: For optimizations. */
c277df42 570{
7918f24d
NC
571 PERL_ARGS_ASSERT_PREGEXEC;
572
c277df42 573 return
9041c2e3 574 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
575 nosave ? 0 : REXEC_COPY_STR);
576}
76234dfb 577#endif
22e551b9 578
9041c2e3 579/*
cad2e5aa
JH
580 * Need to implement the following flags for reg_anch:
581 *
582 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
583 * USE_INTUIT_ML
584 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
585 * INTUIT_AUTORITATIVE_ML
586 * INTUIT_ONCE_NOML - Intuit can match in one location only.
587 * INTUIT_ONCE_ML
588 *
589 * Another flag for this function: SECOND_TIME (so that float substrs
590 * with giant delta may be not rechecked).
591 */
592
593/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
594
3f7c398e 595/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
596 Otherwise, only SvCUR(sv) is used to get strbeg. */
597
598/* XXXX We assume that strpos is strbeg unless sv. */
599
6eb5f6b9
JH
600/* XXXX Some places assume that there is a fixed substring.
601 An update may be needed if optimizer marks as "INTUITable"
602 RExen without fixed substrings. Similarly, it is assumed that
603 lengths of all the strings are no more than minlen, thus they
604 cannot come from lookahead.
40d049e4
YO
605 (Or minlen should take into account lookahead.)
606 NOTE: Some of this comment is not correct. minlen does now take account
607 of lookahead/behind. Further research is required. -- demerphq
608
609*/
6eb5f6b9 610
2c2d71f5
JH
611/* A failure to find a constant substring means that there is no need to make
612 an expensive call to REx engine, thus we celebrate a failure. Similarly,
613 finding a substring too deep into the string means that less calls to
30944b6d
IZ
614 regtry() should be needed.
615
616 REx compiler's optimizer found 4 possible hints:
617 a) Anchored substring;
618 b) Fixed substring;
619 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 620 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 621 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
622 string which does not contradict any of them.
623 */
2c2d71f5 624
6eb5f6b9
JH
625/* Most of decisions we do here should have been done at compile time.
626 The nodes of the REx which we used for the search should have been
627 deleted from the finite automaton. */
628
cad2e5aa 629char *
288b8c02 630Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 631 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 632{
97aff369 633 dVAR;
8d919b0a 634 struct regexp *const prog = ReANY(rx);
eb578fdb 635 I32 start_shift = 0;
cad2e5aa 636 /* Should be nonnegative! */
eb578fdb
KW
637 I32 end_shift = 0;
638 char *s;
639 SV *check;
a1933d95 640 char *strbeg;
cad2e5aa 641 char *t;
f2ed9b32 642 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 643 I32 ml_anch;
eb578fdb 644 char *other_last = NULL; /* other substr checked before this */
bd61b366 645 char *check_at = NULL; /* check substr found at this pos */
d8080198 646 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 647 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 648 RXi_GET_DECL(prog,progi);
30944b6d 649#ifdef DEBUGGING
890ce7af 650 const char * const i_strpos = strpos;
30944b6d 651#endif
a3621e74
YO
652 GET_RE_DEBUG_FLAGS_DECL;
653
7918f24d 654 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
655 PERL_UNUSED_ARG(flags);
656 PERL_UNUSED_ARG(data);
7918f24d 657
f2ed9b32 658 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 659
3c8556c3 660 if (RX_UTF8(rx)) {
b8d68ded
JH
661 PL_reg_flags |= RF_utf8;
662 }
ab3bbdeb 663 DEBUG_EXECUTE_r(
f2ed9b32 664 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
665 sv ? "Guessing start of match in sv for"
666 : "Guessing start of match in string for");
2a782b5b 667 );
cad2e5aa 668
c344f387
JH
669 /* CHR_DIST() would be more correct here but it makes things slow. */
670 if (prog->minlen > strend - strpos) {
a3621e74 671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 672 "String too short... [re_intuit_start]\n"));
cad2e5aa 673 goto fail;
2c2d71f5 674 }
1de06328 675
7016d6eb
DM
676 /* XXX we need to pass strbeg as a separate arg: the following is
677 * guesswork and can be wrong... */
678 if (sv && SvPOK(sv)) {
679 char * p = SvPVX(sv);
680 STRLEN cur = SvCUR(sv);
681 if (p <= strpos && strpos < p + cur) {
682 strbeg = p;
683 assert(p <= strend && strend <= p + cur);
684 }
685 else
686 strbeg = strend - cur;
687 }
688 else
689 strbeg = strpos;
690
1aa99e6b 691 PL_regeol = strend;
f2ed9b32 692 if (utf8_target) {
33b8afdf
JH
693 if (!prog->check_utf8 && prog->check_substr)
694 to_utf8_substr(prog);
695 check = prog->check_utf8;
696 } else {
7e0d5ad7
KW
697 if (!prog->check_substr && prog->check_utf8) {
698 if (! to_byte_substr(prog)) {
6b54ddc5 699 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
700 }
701 }
33b8afdf
JH
702 check = prog->check_substr;
703 }
bbe252da
YO
704 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
705 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
706 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 707 && !multiline ) ); /* Check after \n? */
cad2e5aa 708
7e25d62c 709 if (!ml_anch) {
bbe252da
YO
710 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
711 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 712 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
713 && sv && !SvROK(sv)
714 && (strpos != strbeg)) {
a3621e74 715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
716 goto fail;
717 }
d46b78c6
KW
718 if (prog->check_offset_min == prog->check_offset_max
719 && !(prog->extflags & RXf_CANY_SEEN)
720 && ! multiline) /* /m can cause \n's to match that aren't
721 accounted for in the string max length.
722 See [perl #115242] */
723 {
2c2d71f5 724 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
725 I32 slen;
726
1aa99e6b 727 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 728
653099ff
GS
729 if (SvTAIL(check)) {
730 slen = SvCUR(check); /* >= 1 */
cad2e5aa 731
9041c2e3 732 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 733 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 734 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 735 goto fail_finish;
cad2e5aa
JH
736 }
737 /* Now should match s[0..slen-2] */
738 slen--;
3f7c398e 739 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 740 || (slen > 1
3f7c398e 741 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 742 report_neq:
a3621e74 743 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
744 goto fail_finish;
745 }
cad2e5aa 746 }
3f7c398e 747 else if (*SvPVX_const(check) != *s
653099ff 748 || ((slen = SvCUR(check)) > 1
3f7c398e 749 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 750 goto report_neq;
c315bfe8 751 check_at = s;
2c2d71f5 752 goto success_at_start;
7e25d62c 753 }
cad2e5aa 754 }
2c2d71f5 755 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 756 s = strpos;
2c2d71f5 757 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
758 end_shift = prog->check_end_shift;
759
2c2d71f5 760 if (!ml_anch) {
a3b680e6 761 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 762 - (SvTAIL(check) != 0);
a3b680e6 763 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
764
765 if (end_shift < eshift)
766 end_shift = eshift;
767 }
cad2e5aa 768 }
2c2d71f5 769 else { /* Can match at random position */
cad2e5aa
JH
770 ml_anch = 0;
771 s = strpos;
1de06328
YO
772 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
773 end_shift = prog->check_end_shift;
774
775 /* end shift should be non negative here */
cad2e5aa
JH
776 }
777
bcdf7404 778#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 779 if (end_shift < 0)
1de06328 780 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 781 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
782#endif
783
2c2d71f5
JH
784 restart:
785 /* Find a possible match in the region s..strend by looking for
786 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
787
788 {
789 I32 srch_start_shift = start_shift;
790 I32 srch_end_shift = end_shift;
c33e64f0
FC
791 U8* start_point;
792 U8* end_point;
1de06328
YO
793 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
794 srch_end_shift -= ((strbeg - s) - srch_start_shift);
795 srch_start_shift = strbeg - s;
796 }
6bda09f9 797 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
798 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
799 (IV)prog->check_offset_min,
800 (IV)srch_start_shift,
801 (IV)srch_end_shift,
802 (IV)prog->check_end_shift);
803 });
804
bbe252da 805 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
806 start_point= (U8*)(s + srch_start_shift);
807 end_point= (U8*)(strend - srch_end_shift);
808 } else {
809 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
810 end_point= HOP3(strend, -srch_end_shift, strbeg);
811 }
6bda09f9 812 DEBUG_OPTIMISE_MORE_r({
56570a2c 813 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 814 (int)(end_point - start_point),
fc8cd66c 815 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
816 start_point);
817 });
818
819 s = fbm_instr( start_point, end_point,
7fba1cd6 820 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 821 }
cad2e5aa
JH
822 /* Update the count-of-usability, remove useless subpatterns,
823 unshift s. */
2c2d71f5 824
ab3bbdeb 825 DEBUG_EXECUTE_r({
f2ed9b32 826 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
827 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
828 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 829 (s ? "Found" : "Did not find"),
f2ed9b32 830 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
831 ? "anchored" : "floating"),
832 quoted,
833 RE_SV_TAIL(check),
834 (s ? " at offset " : "...\n") );
835 });
2c2d71f5
JH
836
837 if (!s)
838 goto fail_finish;
2c2d71f5 839 /* Finish the diagnostic message */
a3621e74 840 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 841
1de06328
YO
842 /* XXX dmq: first branch is for positive lookbehind...
843 Our check string is offset from the beginning of the pattern.
844 So we need to do any stclass tests offset forward from that
845 point. I think. :-(
846 */
847
848
849
850 check_at=s;
851
852
2c2d71f5
JH
853 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
854 Start with the other substr.
855 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 856 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
857 *always* match. Probably should be marked during compile...
858 Probably it is right to do no SCREAM here...
859 */
860
f2ed9b32 861 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
862 : (prog->float_substr && prog->anchored_substr))
863 {
30944b6d 864 /* Take into account the "other" substring. */
2c2d71f5
JH
865 /* XXXX May be hopelessly wrong for UTF... */
866 if (!other_last)
6eb5f6b9 867 other_last = strpos;
f2ed9b32 868 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
869 do_other_anchored:
870 {
890ce7af
AL
871 char * const last = HOP3c(s, -start_shift, strbeg);
872 char *last1, *last2;
be8e71aa 873 char * const saved_s = s;
33b8afdf 874 SV* must;
2c2d71f5 875
2c2d71f5
JH
876 t = s - prog->check_offset_max;
877 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 878 && (!utf8_target
0ce71af7 879 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 880 && t > strpos)))
6f207bd3 881 NOOP;
2c2d71f5
JH
882 else
883 t = strpos;
1aa99e6b 884 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
885 if (t < other_last) /* These positions already checked */
886 t = other_last;
1aa99e6b 887 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
888 if (last < last1)
889 last1 = last;
1de06328
YO
890 /* XXXX It is not documented what units *_offsets are in.
891 We assume bytes, but this is clearly wrong.
892 Meaning this code needs to be carefully reviewed for errors.
893 dmq.
894 */
895
2c2d71f5 896 /* On end-of-str: see comment below. */
f2ed9b32 897 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
898 if (must == &PL_sv_undef) {
899 s = (char*)NULL;
1de06328 900 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
901 }
902 else
903 s = fbm_instr(
904 (unsigned char*)t,
905 HOP3(HOP3(last1, prog->anchored_offset, strend)
906 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
907 must,
7fba1cd6 908 multiline ? FBMrf_MULTILINE : 0
33b8afdf 909 );
ab3bbdeb 910 DEBUG_EXECUTE_r({
f2ed9b32 911 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
912 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
913 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 914 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
915 quoted, RE_SV_TAIL(must));
916 });
917
918
2c2d71f5
JH
919 if (!s) {
920 if (last1 >= last2) {
a3621e74 921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
922 ", giving up...\n"));
923 goto fail_finish;
924 }
a3621e74 925 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 926 ", trying floating at offset %ld...\n",
be8e71aa 927 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
928 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
929 s = HOP3c(last, 1, strend);
2c2d71f5
JH
930 goto restart;
931 }
932 else {
a3621e74 933 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 934 (long)(s - i_strpos)));
1aa99e6b
IH
935 t = HOP3c(s, -prog->anchored_offset, strbeg);
936 other_last = HOP3c(s, 1, strend);
be8e71aa 937 s = saved_s;
2c2d71f5
JH
938 if (t == strpos)
939 goto try_at_start;
2c2d71f5
JH
940 goto try_at_offset;
941 }
30944b6d 942 }
2c2d71f5
JH
943 }
944 else { /* Take into account the floating substring. */
33b8afdf 945 char *last, *last1;
be8e71aa 946 char * const saved_s = s;
33b8afdf
JH
947 SV* must;
948
949 t = HOP3c(s, -start_shift, strbeg);
950 last1 = last =
951 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
952 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
953 last = HOP3c(t, prog->float_max_offset, strend);
954 s = HOP3c(t, prog->float_min_offset, strend);
955 if (s < other_last)
956 s = other_last;
2c2d71f5 957 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 958 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
959 /* fbm_instr() takes into account exact value of end-of-str
960 if the check is SvTAIL(ed). Since false positives are OK,
961 and end-of-str is not later than strend we are OK. */
962 if (must == &PL_sv_undef) {
963 s = (char*)NULL;
1de06328 964 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
965 }
966 else
2c2d71f5 967 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
968 (unsigned char*)last + SvCUR(must)
969 - (SvTAIL(must)!=0),
7fba1cd6 970 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 971 DEBUG_EXECUTE_r({
f2ed9b32 972 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
973 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
974 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 975 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
976 quoted, RE_SV_TAIL(must));
977 });
33b8afdf
JH
978 if (!s) {
979 if (last1 == last) {
a3621e74 980 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
981 ", giving up...\n"));
982 goto fail_finish;
2c2d71f5 983 }
a3621e74 984 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 985 ", trying anchored starting at offset %ld...\n",
be8e71aa 986 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
987 other_last = last;
988 s = HOP3c(t, 1, strend);
989 goto restart;
990 }
991 else {
a3621e74 992 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
993 (long)(s - i_strpos)));
994 other_last = s; /* Fix this later. --Hugo */
be8e71aa 995 s = saved_s;
33b8afdf
JH
996 if (t == strpos)
997 goto try_at_start;
998 goto try_at_offset;
999 }
2c2d71f5 1000 }
cad2e5aa 1001 }
2c2d71f5 1002
1de06328 1003
9ef43ace 1004 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 1005
6bda09f9 1006 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
1007 PerlIO_printf(Perl_debug_log,
1008 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
1009 (IV)prog->check_offset_min,
1010 (IV)prog->check_offset_max,
1011 (IV)(s-strpos),
1012 (IV)(t-strpos),
1013 (IV)(t-s),
1014 (IV)(strend-strpos)
1015 )
1016 );
1017
2c2d71f5 1018 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 1019 && (!utf8_target
9ef43ace 1020 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
1021 && t > strpos)))
1022 {
2c2d71f5
JH
1023 /* Fixed substring is found far enough so that the match
1024 cannot start at strpos. */
1025 try_at_offset:
cad2e5aa 1026 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
1027 /* Eventually fbm_*() should handle this, but often
1028 anchored_offset is not 0, so this check will not be wasted. */
1029 /* XXXX In the code below we prefer to look for "^" even in
1030 presence of anchored substrings. And we search even
1031 beyond the found float position. These pessimizations
1032 are historical artefacts only. */
1033 find_anchor:
2c2d71f5 1034 while (t < strend - prog->minlen) {
cad2e5aa 1035 if (*t == '\n') {
4ee3650e 1036 if (t < check_at - prog->check_offset_min) {
f2ed9b32 1037 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
1038 /* Since we moved from the found position,
1039 we definitely contradict the found anchored
30944b6d
IZ
1040 substr. Due to the above check we do not
1041 contradict "check" substr.
1042 Thus we can arrive here only if check substr
1043 is float. Redo checking for "other"=="fixed".
1044 */
9041c2e3 1045 strpos = t + 1;
a3621e74 1046 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1047 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1048 goto do_other_anchored;
1049 }
4ee3650e
GS
1050 /* We don't contradict the found floating substring. */
1051 /* XXXX Why not check for STCLASS? */
cad2e5aa 1052 s = t + 1;
a3621e74 1053 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1054 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1055 goto set_useful;
1056 }
4ee3650e
GS
1057 /* Position contradicts check-string */
1058 /* XXXX probably better to look for check-string
1059 than for "\n", so one should lower the limit for t? */
a3621e74 1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1061 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1062 other_last = strpos = s = t + 1;
cad2e5aa
JH
1063 goto restart;
1064 }
1065 t++;
1066 }
a3621e74 1067 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1068 PL_colors[0], PL_colors[1]));
2c2d71f5 1069 goto fail_finish;
cad2e5aa 1070 }
f5952150 1071 else {
a3621e74 1072 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1073 PL_colors[0], PL_colors[1]));
f5952150 1074 }
cad2e5aa
JH
1075 s = t;
1076 set_useful:
f2ed9b32 1077 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1078 }
1079 else {
f5952150 1080 /* The found string does not prohibit matching at strpos,
2c2d71f5 1081 - no optimization of calling REx engine can be performed,
f5952150
GS
1082 unless it was an MBOL and we are not after MBOL,
1083 or a future STCLASS check will fail this. */
2c2d71f5
JH
1084 try_at_start:
1085 /* Even in this situation we may use MBOL flag if strpos is offset
1086 wrt the start of the string. */
05b4157f 1087 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1088 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1089 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1090 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1091 {
cad2e5aa
JH
1092 t = strpos;
1093 goto find_anchor;
1094 }
a3621e74 1095 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1096 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1097 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1098 );
2c2d71f5 1099 success_at_start:
bbe252da 1100 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1101 && (utf8_target ? (
33b8afdf
JH
1102 prog->check_utf8 /* Could be deleted already */
1103 && --BmUSEFUL(prog->check_utf8) < 0
1104 && (prog->check_utf8 == prog->float_utf8)
1105 ) : (
1106 prog->check_substr /* Could be deleted already */
1107 && --BmUSEFUL(prog->check_substr) < 0
1108 && (prog->check_substr == prog->float_substr)
1109 )))
66e933ab 1110 {
cad2e5aa 1111 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1112 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1113 /* XXX Does the destruction order has to change with utf8_target? */
1114 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1115 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1116 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1117 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1118 check = NULL; /* abort */
cad2e5aa 1119 s = strpos;
486ec47a 1120 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1121 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1122 if (prog->intflags & PREGf_IMPLICIT)
1123 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1124 /* XXXX This is a remnant of the old implementation. It
1125 looks wasteful, since now INTUIT can use many
6eb5f6b9 1126 other heuristics. */
bbe252da 1127 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1128 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1129 }
1130 else
1131 s = strpos;
1132 }
1133
6eb5f6b9
JH
1134 /* Last resort... */
1135 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1136 /* trie stclasses are too expensive to use here, we are better off to
1137 leave it to regmatch itself */
f8fc2ecf 1138 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1139 /* minlen == 0 is possible if regstclass is \b or \B,
1140 and the fixed substr is ''$.
1141 Since minlen is already taken into account, s+1 is before strend;
1142 accidentally, minlen >= 1 guaranties no false positives at s + 1
1143 even for \b or \B. But (minlen? 1 : 0) below assumes that
1144 regstclass does not come from lookahead... */
1145 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1146 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1147 const U8* const str = (U8*)STRING(progi->regstclass);
1148 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1149 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1150 : 1);
1de06328
YO
1151 char * endpos;
1152 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1153 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1154 else if (prog->float_substr || prog->float_utf8)
1155 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1156 else
1157 endpos= strend;
1158
d8080198
YO
1159 if (checked_upto < s)
1160 checked_upto = s;
1161 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1162 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1163
6eb5f6b9 1164 t = s;
d8080198
YO
1165 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1166 if (s) {
1167 checked_upto = s;
1168 } else {
6eb5f6b9 1169#ifdef DEBUGGING
cbbf8932 1170 const char *what = NULL;
6eb5f6b9
JH
1171#endif
1172 if (endpos == strend) {
a3621e74 1173 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1174 "Could not match STCLASS...\n") );
1175 goto fail;
1176 }
a3621e74 1177 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1178 "This position contradicts STCLASS...\n") );
bbe252da 1179 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1180 goto fail;
d8080198
YO
1181 checked_upto = HOPBACKc(endpos, start_shift);
1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1183 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1184 /* Contradict one of substrings */
33b8afdf 1185 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1186 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1187 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1188 hop_and_restart:
1aa99e6b 1189 s = HOP3c(t, 1, strend);
66e933ab
GS
1190 if (s + start_shift + end_shift > strend) {
1191 /* XXXX Should be taken into account earlier? */
a3621e74 1192 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1193 "Could not match STCLASS...\n") );
1194 goto fail;
1195 }
5e39e1e5
HS
1196 if (!check)
1197 goto giveup;
a3621e74 1198 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1199 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1200 what, (long)(s + start_shift - i_strpos)) );
1201 goto restart;
1202 }
66e933ab 1203 /* Have both, check_string is floating */
6eb5f6b9
JH
1204 if (t + start_shift >= check_at) /* Contradicts floating=check */
1205 goto retry_floating_check;
1206 /* Recheck anchored substring, but not floating... */
9041c2e3 1207 s = check_at;
5e39e1e5
HS
1208 if (!check)
1209 goto giveup;
a3621e74 1210 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1211 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1212 (long)(other_last - i_strpos)) );
1213 goto do_other_anchored;
1214 }
60e71179
GS
1215 /* Another way we could have checked stclass at the
1216 current position only: */
1217 if (ml_anch) {
1218 s = t = t + 1;
5e39e1e5
HS
1219 if (!check)
1220 goto giveup;
a3621e74 1221 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1222 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1223 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1224 goto try_at_offset;
66e933ab 1225 }
f2ed9b32 1226 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1227 goto fail;
486ec47a 1228 /* Check is floating substring. */
6eb5f6b9
JH
1229 retry_floating_check:
1230 t = check_at - start_shift;
a3621e74 1231 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1232 goto hop_and_restart;
1233 }
b7953727 1234 if (t != s) {
a3621e74 1235 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1236 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1237 (long)(t - i_strpos), (long)(s - i_strpos))
1238 );
1239 }
1240 else {
a3621e74 1241 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1242 "Does not contradict STCLASS...\n");
1243 );
1244 }
6eb5f6b9 1245 }
5e39e1e5 1246 giveup:
a3621e74 1247 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1248 PL_colors[4], (check ? "Guessed" : "Giving up"),
1249 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1250 return s;
2c2d71f5
JH
1251
1252 fail_finish: /* Substring not found */
33b8afdf 1253 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1254 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1255 fail:
a3621e74 1256 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1257 PL_colors[4], PL_colors[5]));
bd61b366 1258 return NULL;
cad2e5aa 1259}
9661b544 1260
a0a388a1
YO
1261#define DECL_TRIE_TYPE(scan) \
1262 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1263 trie_type = ((scan->flags == EXACT) \
1264 ? (utf8_target ? trie_utf8 : trie_plain) \
1265 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1266
1267#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1268uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1269 STRLEN skiplen; \
1270 switch (trie_type) { \
1271 case trie_utf8_fold: \
1272 if ( foldlen>0 ) { \
1273 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1274 foldlen -= len; \
1275 uscan += len; \
1276 len=0; \
1277 } else { \
1278 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1279 len = UTF8SKIP(uc); \
1280 skiplen = UNISKIP( uvc ); \
1281 foldlen -= skiplen; \
1282 uscan = foldbuf + skiplen; \
1283 } \
1284 break; \
1285 case trie_latin_utf8_fold: \
1286 if ( foldlen>0 ) { \
1287 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1288 foldlen -= len; \
1289 uscan += len; \
1290 len=0; \
1291 } else { \
1292 len = 1; \
1293 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1294 skiplen = UNISKIP( uvc ); \
1295 foldlen -= skiplen; \
1296 uscan = foldbuf + skiplen; \
1297 } \
1298 break; \
1299 case trie_utf8: \
1300 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1301 break; \
1302 case trie_plain: \
1303 uvc = (UV)*uc; \
1304 len = 1; \
1305 } \
1306 if (uvc < 256) { \
1307 charid = trie->charmap[ uvc ]; \
1308 } \
1309 else { \
1310 charid = 0; \
1311 if (widecharmap) { \
1312 SV** const svpp = hv_fetch(widecharmap, \
1313 (char*)&uvc, sizeof(UV), 0); \
1314 if (svpp) \
1315 charid = (U16)SvIV(*svpp); \
1316 } \
1317 } \
4cadc6a9
YO
1318} STMT_END
1319
4cadc6a9
YO
1320#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1321STMT_START { \
1322 while (s <= e) { \
1323 if ( (CoNd) \
fac1af77 1324 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1325 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1326 goto got_it; \
1327 s++; \
1328 } \
1329} STMT_END
1330
1331#define REXEC_FBC_UTF8_SCAN(CoDe) \
1332STMT_START { \
9a902117 1333 while (s < strend) { \
4cadc6a9 1334 CoDe \
9a902117 1335 s += UTF8SKIP(s); \
4cadc6a9
YO
1336 } \
1337} STMT_END
1338
1339#define REXEC_FBC_SCAN(CoDe) \
1340STMT_START { \
1341 while (s < strend) { \
1342 CoDe \
1343 s++; \
1344 } \
1345} STMT_END
1346
1347#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1348REXEC_FBC_UTF8_SCAN( \
1349 if (CoNd) { \
7aee35ff 1350 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1351 goto got_it; \
1352 else \
1353 tmp = doevery; \
1354 } \
1355 else \
1356 tmp = 1; \
1357)
1358
1359#define REXEC_FBC_CLASS_SCAN(CoNd) \
1360REXEC_FBC_SCAN( \
1361 if (CoNd) { \
24b23f37 1362 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1363 goto got_it; \
1364 else \
1365 tmp = doevery; \
1366 } \
1367 else \
1368 tmp = 1; \
1369)
1370
1371#define REXEC_FBC_TRYIT \
24b23f37 1372if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1373 goto got_it
1374
e1d1eefb 1375#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1376 if (utf8_target) { \
e1d1eefb
YO
1377 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1378 } \
1379 else { \
1380 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1381 }
e1d1eefb 1382
4cadc6a9 1383#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1384 if (utf8_target) { \
4cadc6a9
YO
1385 UtFpReLoAd; \
1386 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1387 } \
1388 else { \
1389 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1390 }
4cadc6a9
YO
1391
1392#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1393 PL_reg_flags |= RF_tainted; \
f2ed9b32 1394 if (utf8_target) { \
4cadc6a9
YO
1395 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1396 } \
1397 else { \
1398 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1399 }
4cadc6a9 1400
786e8c11
YO
1401#define DUMP_EXEC_POS(li,s,doutf8) \
1402 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1403
cfaf538b
KW
1404
1405#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1406 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1407 tmp = TEST_NON_UTF8(tmp); \
1408 REXEC_FBC_UTF8_SCAN( \
1409 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1410 tmp = !tmp; \
1411 IF_SUCCESS; \
1412 } \
1413 else { \
1414 IF_FAIL; \
1415 } \
1416 ); \
1417
1418#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1419 if (s == PL_bostr) { \
1420 tmp = '\n'; \
1421 } \
1422 else { \
1423 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1424 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1425 } \
1426 tmp = TeSt1_UtF8; \
1427 LOAD_UTF8_CHARCLASS_ALNUM(); \
1428 REXEC_FBC_UTF8_SCAN( \
1429 if (tmp == ! (TeSt2_UtF8)) { \
1430 tmp = !tmp; \
1431 IF_SUCCESS; \
1432 } \
1433 else { \
1434 IF_FAIL; \
1435 } \
1436 ); \
1437
63ac0dad
KW
1438/* The only difference between the BOUND and NBOUND cases is that
1439 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1440 * NBOUND. This is accomplished by passing it in either the if or else clause,
1441 * with the other one being empty */
1442#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1443 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1444
1445#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1446 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1447
1448#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1449 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1450
1451#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1452 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1453
63ac0dad
KW
1454
1455/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1456 * be passed in completely with the variable name being tested, which isn't
1457 * such a clean interface, but this is easier to read than it was before. We
1458 * are looking for the boundary (or non-boundary between a word and non-word
1459 * character. The utf8 and non-utf8 cases have the same logic, but the details
1460 * must be different. Find the "wordness" of the character just prior to this
1461 * one, and compare it with the wordness of this one. If they differ, we have
1462 * a boundary. At the beginning of the string, pretend that the previous
1463 * character was a new-line */
cfaf538b 1464#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1465 if (utf8_target) { \
cfaf538b 1466 UTF8_CODE \
63ac0dad
KW
1467 } \
1468 else { /* Not utf8 */ \
1469 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1470 tmp = TEST_NON_UTF8(tmp); \
1471 REXEC_FBC_SCAN( \
1472 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1473 tmp = !tmp; \
1474 IF_SUCCESS; \
1475 } \
1476 else { \
1477 IF_FAIL; \
1478 } \
1479 ); \
1480 } \
1481 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1482 goto got_it;
1483
786e8c11
YO
1484/* We know what class REx starts with. Try to find this position... */
1485/* if reginfo is NULL, its a dryrun */
1486/* annoyingly all the vars in this routine have different names from their counterparts
1487 in regmatch. /grrr */
1488
3c3eec57 1489STATIC char *
07be1b83 1490S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1491 const char *strend, regmatch_info *reginfo)
a687059c 1492{
73104a1b
KW
1493 dVAR;
1494 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1495 char *pat_string; /* The pattern's exactish string */
1496 char *pat_end; /* ptr to end char of pat_string */
1497 re_fold_t folder; /* Function for computing non-utf8 folds */
1498 const U8 *fold_array; /* array for folding ords < 256 */
1499 STRLEN ln;
1500 STRLEN lnc;
1501 STRLEN uskip;
1502 U8 c1;
1503 U8 c2;
1504 char *e;
1505 I32 tmp = 1; /* Scratch variable? */
1506 const bool utf8_target = PL_reg_match_utf8;
1507 UV utf8_fold_flags = 0;
1508 RXi_GET_DECL(prog,progi);
2f7f8cb1 1509
73104a1b 1510 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1511
73104a1b
KW
1512 /* We know what class it must start with. */
1513 switch (OP(c)) {
1514 case ANYOF:
1515 if (utf8_target) {
1516 REXEC_FBC_UTF8_CLASS_SCAN(
1517 reginclass(prog, c, (U8*)s, utf8_target));
1518 }
1519 else {
1520 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1521 }
1522 break;
1523 case CANY:
1524 REXEC_FBC_SCAN(
1525 if (tmp && (!reginfo || regtry(reginfo, &s)))
1526 goto got_it;
1527 else
1528 tmp = doevery;
1529 );
1530 break;
1531
1532 case EXACTFA:
1533 if (UTF_PATTERN || utf8_target) {
1534 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1535 goto do_exactf_utf8;
1536 }
1537 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1538 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1539 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1540
73104a1b
KW
1541 case EXACTF:
1542 if (utf8_target) {
16d951b7 1543
73104a1b
KW
1544 /* regcomp.c already folded this if pattern is in UTF-8 */
1545 utf8_fold_flags = 0;
1546 goto do_exactf_utf8;
1547 }
1548 fold_array = PL_fold;
1549 folder = foldEQ;
1550 goto do_exactf_non_utf8;
1551
1552 case EXACTFL:
1553 if (UTF_PATTERN || utf8_target) {
1554 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1555 goto do_exactf_utf8;
1556 }
1557 fold_array = PL_fold_locale;
1558 folder = foldEQ_locale;
1559 goto do_exactf_non_utf8;
3c760661 1560
73104a1b
KW
1561 case EXACTFU_SS:
1562 if (UTF_PATTERN) {
1563 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1564 }
1565 goto do_exactf_utf8;
16d951b7 1566
73104a1b
KW
1567 case EXACTFU_TRICKYFOLD:
1568 case EXACTFU:
1569 if (UTF_PATTERN || utf8_target) {
1570 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1571 goto do_exactf_utf8;
1572 }
fac1af77 1573
73104a1b
KW
1574 /* Any 'ss' in the pattern should have been replaced by regcomp,
1575 * so we don't have to worry here about this single special case
1576 * in the Latin1 range */
1577 fold_array = PL_fold_latin1;
1578 folder = foldEQ_latin1;
1579
1580 /* FALL THROUGH */
1581
1582 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1583 are no glitches with fold-length differences
1584 between the target string and pattern */
1585
1586 /* The idea in the non-utf8 EXACTF* cases is to first find the
1587 * first character of the EXACTF* node and then, if necessary,
1588 * case-insensitively compare the full text of the node. c1 is the
1589 * first character. c2 is its fold. This logic will not work for
1590 * Unicode semantics and the german sharp ss, which hence should
1591 * not be compiled into a node that gets here. */
1592 pat_string = STRING(c);
1593 ln = STR_LEN(c); /* length to match in octets/bytes */
1594
1595 /* We know that we have to match at least 'ln' bytes (which is the
1596 * same as characters, since not utf8). If we have to match 3
1597 * characters, and there are only 2 availabe, we know without
1598 * trying that it will fail; so don't start a match past the
1599 * required minimum number from the far end */
1600 e = HOP3c(strend, -((I32)ln), s);
1601
1602 if (!reginfo && e < s) {
1603 e = s; /* Due to minlen logic of intuit() */
1604 }
fac1af77 1605
73104a1b
KW
1606 c1 = *pat_string;
1607 c2 = fold_array[c1];
1608 if (c1 == c2) { /* If char and fold are the same */
1609 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1610 }
1611 else {
1612 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1613 }
1614 break;
fac1af77 1615
73104a1b
KW
1616 do_exactf_utf8:
1617 {
1618 unsigned expansion;
1619
1620 /* If one of the operands is in utf8, we can't use the simpler folding
1621 * above, due to the fact that many different characters can have the
1622 * same fold, or portion of a fold, or different- length fold */
1623 pat_string = STRING(c);
1624 ln = STR_LEN(c); /* length to match in octets/bytes */
1625 pat_end = pat_string + ln;
1626 lnc = (UTF_PATTERN) /* length to match in characters */
1627 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1628 : ln;
1629
1630 /* We have 'lnc' characters to match in the pattern, but because of
1631 * multi-character folding, each character in the target can match
1632 * up to 3 characters (Unicode guarantees it will never exceed
1633 * this) if it is utf8-encoded; and up to 2 if not (based on the
1634 * fact that the Latin 1 folds are already determined, and the
1635 * only multi-char fold in that range is the sharp-s folding to
1636 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1637 * string character. Adjust lnc accordingly, rounding up, so that
1638 * if we need to match at least 4+1/3 chars, that really is 5. */
1639 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1640 lnc = (lnc + expansion - 1) / expansion;
1641
1642 /* As in the non-UTF8 case, if we have to match 3 characters, and
1643 * only 2 are left, it's guaranteed to fail, so don't start a
1644 * match that would require us to go beyond the end of the string
1645 */
1646 e = HOP3c(strend, -((I32)lnc), s);
1647
1648 if (!reginfo && e < s) {
1649 e = s; /* Due to minlen logic of intuit() */
1650 }
0658cdde 1651
73104a1b
KW
1652 /* XXX Note that we could recalculate e to stop the loop earlier,
1653 * as the worst case expansion above will rarely be met, and as we
1654 * go along we would usually find that e moves further to the left.
1655 * This would happen only after we reached the point in the loop
1656 * where if there were no expansion we should fail. Unclear if
1657 * worth the expense */
1658
1659 while (s <= e) {
1660 char *my_strend= (char *)strend;
1661 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1662 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1663 && (!reginfo || regtry(reginfo, &s)) )
1664 {
1665 goto got_it;
1666 }
1667 s += (utf8_target) ? UTF8SKIP(s) : 1;
1668 }
1669 break;
1670 }
1671 case BOUNDL:
1672 PL_reg_flags |= RF_tainted;
1673 FBC_BOUND(isALNUM_LC,
1674 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1675 isALNUM_LC_utf8((U8*)s));
1676 break;
1677 case NBOUNDL:
1678 PL_reg_flags |= RF_tainted;
1679 FBC_NBOUND(isALNUM_LC,
1680 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1681 isALNUM_LC_utf8((U8*)s));
1682 break;
1683 case BOUND:
1684 FBC_BOUND(isWORDCHAR,
1685 isALNUM_uni(tmp),
1686 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1687 break;
1688 case BOUNDA:
1689 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1690 isWORDCHAR_A(tmp),
1691 isWORDCHAR_A((U8*)s));
1692 break;
1693 case NBOUND:
1694 FBC_NBOUND(isWORDCHAR,
1695 isALNUM_uni(tmp),
1696 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1697 break;
1698 case NBOUNDA:
1699 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1700 isWORDCHAR_A(tmp),
1701 isWORDCHAR_A((U8*)s));
1702 break;
1703 case BOUNDU:
1704 FBC_BOUND(isWORDCHAR_L1,
1705 isALNUM_uni(tmp),
1706 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1707 break;
1708 case NBOUNDU:
1709 FBC_NBOUND(isWORDCHAR_L1,
1710 isALNUM_uni(tmp),
1711 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1712 break;
1713 case ALNUML:
1714 REXEC_FBC_CSCAN_TAINT(
1715 isALNUM_LC_utf8((U8*)s),
1716 isALNUM_LC(*s)
1717 );
1718 break;
1719 case ALNUMU:
1720 REXEC_FBC_CSCAN_PRELOAD(
1721 LOAD_UTF8_CHARCLASS_ALNUM(),
1722 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1723 isWORDCHAR_L1((U8) *s)
1724 );
1725 break;
1726 case ALNUM:
1727 REXEC_FBC_CSCAN_PRELOAD(
1728 LOAD_UTF8_CHARCLASS_ALNUM(),
1729 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1730 isWORDCHAR((U8) *s)
1731 );
1732 break;
1733 case ALNUMA:
1734 /* Don't need to worry about utf8, as it can match only a single
1735 * byte invariant character */
1736 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1737 break;
1738 case NALNUMU:
1739 REXEC_FBC_CSCAN_PRELOAD(
1740 LOAD_UTF8_CHARCLASS_ALNUM(),
1741 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1742 ! isWORDCHAR_L1((U8) *s)
1743 );
1744 break;
1745 case NALNUM:
1746 REXEC_FBC_CSCAN_PRELOAD(
1747 LOAD_UTF8_CHARCLASS_ALNUM(),
1748 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1749 ! isALNUM(*s)
1750 );
1751 break;
1752 case NALNUMA:
1753 REXEC_FBC_CSCAN(
1754 !isWORDCHAR_A(*s),
1755 !isWORDCHAR_A(*s)
1756 );
1757 break;
1758 case NALNUML:
1759 REXEC_FBC_CSCAN_TAINT(
1760 !isALNUM_LC_utf8((U8*)s),
1761 !isALNUM_LC(*s)
1762 );
1763 break;
1764 case SPACEU:
1765 REXEC_FBC_CSCAN(
1766 is_XPERLSPACE_utf8(s),
1767 isSPACE_L1((U8) *s)
1768 );
1769 break;
1770 case SPACE:
1771 REXEC_FBC_CSCAN(
1772 is_XPERLSPACE_utf8(s),
1773 isSPACE((U8) *s)
1774 );
1775 break;
1776 case SPACEA:
1777 /* Don't need to worry about utf8, as it can match only a single
1778 * byte invariant character */
1779 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1780 break;
1781 case SPACEL:
1782 REXEC_FBC_CSCAN_TAINT(
1783 isSPACE_LC_utf8((U8*)s),
1784 isSPACE_LC(*s)
1785 );
1786 break;
1787 case NSPACEU:
1788 REXEC_FBC_CSCAN(
1789 ! is_XPERLSPACE_utf8(s),
1790 ! isSPACE_L1((U8) *s)
1791 );
1792 break;
1793 case NSPACE:
1794 REXEC_FBC_CSCAN(
1795 ! is_XPERLSPACE_utf8(s),
1796 ! isSPACE((U8) *s)
1797 );
1798 break;
1799 case NSPACEA:
1800 REXEC_FBC_CSCAN(
1801 !isSPACE_A(*s),
1802 !isSPACE_A(*s)
1803 );
1804 break;
1805 case NSPACEL:
1806 REXEC_FBC_CSCAN_TAINT(
1807 !isSPACE_LC_utf8((U8*)s),
1808 !isSPACE_LC(*s)
1809 );
1810 break;
1811 case DIGIT:
1812 REXEC_FBC_CSCAN_PRELOAD(
1813 LOAD_UTF8_CHARCLASS_DIGIT(),
1814 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1815 isDIGIT(*s)
1816 );
1817 break;
1818 case DIGITA:
1819 /* Don't need to worry about utf8, as it can match only a single
1820 * byte invariant character */
1821 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1822 break;
1823 case DIGITL:
1824 REXEC_FBC_CSCAN_TAINT(
1825 isDIGIT_LC_utf8((U8*)s),
1826 isDIGIT_LC(*s)
1827 );
1828 break;
1829 case NDIGIT:
1830 REXEC_FBC_CSCAN_PRELOAD(
1831 LOAD_UTF8_CHARCLASS_DIGIT(),
1832 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1833 !isDIGIT(*s)
1834 );
1835 break;
1836 case NDIGITA:
1837 REXEC_FBC_CSCAN(
1838 !isDIGIT_A(*s),
1839 !isDIGIT_A(*s)
1840 );
1841 break;
1842 case NDIGITL:
1843 REXEC_FBC_CSCAN_TAINT(
1844 !isDIGIT_LC_utf8((U8*)s),
1845 !isDIGIT_LC(*s)
1846 );
1847 break;
1848 case LNBREAK:
1849 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1850 is_LNBREAK_latin1_safe(s, strend)
1851 );
1852 break;
1853 case VERTWS:
1854 REXEC_FBC_CSCAN(
1855 is_VERTWS_utf8_safe(s, strend),
1856 is_VERTWS_latin1_safe(s, strend)
1857 );
1858 break;
1859 case NVERTWS:
1860 REXEC_FBC_CSCAN(
1861 !is_VERTWS_utf8_safe(s, strend),
1862 !is_VERTWS_latin1_safe(s, strend)
1863 );
1864 break;
1865 case HORIZWS:
1866 REXEC_FBC_CSCAN(
1867 is_HORIZWS_utf8_safe(s, strend),
1868 is_HORIZWS_latin1_safe(s, strend)
1869 );
1870 break;
1871 case NHORIZWS:
1872 REXEC_FBC_CSCAN(
1873 !is_HORIZWS_utf8_safe(s, strend),
1874 !is_HORIZWS_latin1_safe(s, strend)
1875 );
1876 break;
1877 case POSIXA:
1878 /* Don't need to worry about utf8, as it can match only a single
1879 * byte invariant character. The flag in this node type is the
1880 * class number to pass to _generic_isCC() to build a mask for
1881 * searching in PL_charclass[] */
1882 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1883 break;
1884 case NPOSIXA:
1885 REXEC_FBC_CSCAN(
1886 !_generic_isCC_A(*s, FLAGS(c)),
1887 !_generic_isCC_A(*s, FLAGS(c))
1888 );
1889 break;
1890
1891 case AHOCORASICKC:
1892 case AHOCORASICK:
1893 {
1894 DECL_TRIE_TYPE(c);
1895 /* what trie are we using right now */
1896 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1897 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1898 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1899
1900 const char *last_start = strend - trie->minlen;
6148ee25 1901#ifdef DEBUGGING
73104a1b 1902 const char *real_start = s;
6148ee25 1903#endif
73104a1b
KW
1904 STRLEN maxlen = trie->maxlen;
1905 SV *sv_points;
1906 U8 **points; /* map of where we were in the input string
1907 when reading a given char. For ASCII this
1908 is unnecessary overhead as the relationship
1909 is always 1:1, but for Unicode, especially
1910 case folded Unicode this is not true. */
1911 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1912 U8 *bitmap=NULL;
1913
1914
1915 GET_RE_DEBUG_FLAGS_DECL;
1916
1917 /* We can't just allocate points here. We need to wrap it in
1918 * an SV so it gets freed properly if there is a croak while
1919 * running the match */
1920 ENTER;
1921 SAVETMPS;
1922 sv_points=newSV(maxlen * sizeof(U8 *));
1923 SvCUR_set(sv_points,
1924 maxlen * sizeof(U8 *));
1925 SvPOK_on(sv_points);
1926 sv_2mortal(sv_points);
1927 points=(U8**)SvPV_nolen(sv_points );
1928 if ( trie_type != trie_utf8_fold
1929 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1930 {
1931 if (trie->bitmap)
1932 bitmap=(U8*)trie->bitmap;
1933 else
1934 bitmap=(U8*)ANYOF_BITMAP(c);
1935 }
1936 /* this is the Aho-Corasick algorithm modified a touch
1937 to include special handling for long "unknown char" sequences.
1938 The basic idea being that we use AC as long as we are dealing
1939 with a possible matching char, when we encounter an unknown char
1940 (and we have not encountered an accepting state) we scan forward
1941 until we find a legal starting char.
1942 AC matching is basically that of trie matching, except that when
1943 we encounter a failing transition, we fall back to the current
1944 states "fail state", and try the current char again, a process
1945 we repeat until we reach the root state, state 1, or a legal
1946 transition. If we fail on the root state then we can either
1947 terminate if we have reached an accepting state previously, or
1948 restart the entire process from the beginning if we have not.
1949
1950 */
1951 while (s <= last_start) {
1952 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1953 U8 *uc = (U8*)s;
1954 U16 charid = 0;
1955 U32 base = 1;
1956 U32 state = 1;
1957 UV uvc = 0;
1958 STRLEN len = 0;
1959 STRLEN foldlen = 0;
1960 U8 *uscan = (U8*)NULL;
1961 U8 *leftmost = NULL;
1962#ifdef DEBUGGING
1963 U32 accepted_word= 0;
786e8c11 1964#endif
73104a1b
KW
1965 U32 pointpos = 0;
1966
1967 while ( state && uc <= (U8*)strend ) {
1968 int failed=0;
1969 U32 word = aho->states[ state ].wordnum;
1970
1971 if( state==1 ) {
1972 if ( bitmap ) {
1973 DEBUG_TRIE_EXECUTE_r(
1974 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1975 dump_exec_pos( (char *)uc, c, strend, real_start,
1976 (char *)uc, utf8_target );
1977 PerlIO_printf( Perl_debug_log,
1978 " Scanning for legal start char...\n");
1979 }
1980 );
1981 if (utf8_target) {
1982 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1983 uc += UTF8SKIP(uc);
1984 }
1985 } else {
1986 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1987 uc++;
1988 }
786e8c11 1989 }
73104a1b 1990 s= (char *)uc;
07be1b83 1991 }
73104a1b
KW
1992 if (uc >(U8*)last_start) break;
1993 }
1994
1995 if ( word ) {
1996 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1997 if (!leftmost || lpos < leftmost) {
1998 DEBUG_r(accepted_word=word);
1999 leftmost= lpos;
7016d6eb 2000 }
73104a1b 2001 if (base==0) break;
7016d6eb 2002
73104a1b
KW
2003 }
2004 points[pointpos++ % maxlen]= uc;
2005 if (foldlen || uc < (U8*)strend) {
2006 REXEC_TRIE_READ_CHAR(trie_type, trie,
2007 widecharmap, uc,
2008 uscan, len, uvc, charid, foldlen,
2009 foldbuf, uniflags);
2010 DEBUG_TRIE_EXECUTE_r({
2011 dump_exec_pos( (char *)uc, c, strend,
2012 real_start, s, utf8_target);
2013 PerlIO_printf(Perl_debug_log,
2014 " Charid:%3u CP:%4"UVxf" ",
2015 charid, uvc);
2016 });
2017 }
2018 else {
2019 len = 0;
2020 charid = 0;
2021 }
07be1b83 2022
73104a1b
KW
2023
2024 do {
6148ee25 2025#ifdef DEBUGGING
73104a1b 2026 word = aho->states[ state ].wordnum;
6148ee25 2027#endif
73104a1b
KW
2028 base = aho->states[ state ].trans.base;
2029
2030 DEBUG_TRIE_EXECUTE_r({
2031 if (failed)
2032 dump_exec_pos( (char *)uc, c, strend, real_start,
2033 s, utf8_target );
2034 PerlIO_printf( Perl_debug_log,
2035 "%sState: %4"UVxf", word=%"UVxf,
2036 failed ? " Fail transition to " : "",
2037 (UV)state, (UV)word);
2038 });
2039 if ( base ) {
2040 U32 tmp;
2041 I32 offset;
2042 if (charid &&
2043 ( ((offset = base + charid
2044 - 1 - trie->uniquecharcount)) >= 0)
2045 && ((U32)offset < trie->lasttrans)
2046 && trie->trans[offset].check == state
2047 && (tmp=trie->trans[offset].next))
2048 {
2049 DEBUG_TRIE_EXECUTE_r(
2050 PerlIO_printf( Perl_debug_log," - legal\n"));
2051 state = tmp;
2052 break;
07be1b83
YO
2053 }
2054 else {
786e8c11 2055 DEBUG_TRIE_EXECUTE_r(
73104a1b 2056 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2057 failed = 1;
73104a1b 2058 state = aho->fail[state];
07be1b83 2059 }
07be1b83 2060 }
73104a1b
KW
2061 else {
2062 /* we must be accepting here */
2063 DEBUG_TRIE_EXECUTE_r(
2064 PerlIO_printf( Perl_debug_log," - accepting\n"));
2065 failed = 1;
2066 break;
786e8c11 2067 }
73104a1b
KW
2068 } while(state);
2069 uc += len;
2070 if (failed) {
2071 if (leftmost)
2072 break;
2073 if (!state) state = 1;
07be1b83 2074 }
73104a1b
KW
2075 }
2076 if ( aho->states[ state ].wordnum ) {
2077 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2078 if (!leftmost || lpos < leftmost) {
2079 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2080 leftmost = lpos;
07be1b83
YO
2081 }
2082 }
73104a1b
KW
2083 if (leftmost) {
2084 s = (char*)leftmost;
2085 DEBUG_TRIE_EXECUTE_r({
2086 PerlIO_printf(
2087 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2088 (UV)accepted_word, (IV)(s - real_start)
2089 );
2090 });
2091 if (!reginfo || regtry(reginfo, &s)) {
2092 FREETMPS;
2093 LEAVE;
2094 goto got_it;
2095 }
2096 s = HOPc(s,1);
2097 DEBUG_TRIE_EXECUTE_r({
2098 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2099 });
2100 } else {
2101 DEBUG_TRIE_EXECUTE_r(
2102 PerlIO_printf( Perl_debug_log,"No match.\n"));
2103 break;
2104 }
2105 }
2106 FREETMPS;
2107 LEAVE;
2108 }
2109 break;
2110 default:
2111 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2112 break;
2113 }
2114 return 0;
2115 got_it:
2116 return s;
6eb5f6b9
JH
2117}
2118
fae667d5 2119
6eb5f6b9
JH
2120/*
2121 - regexec_flags - match a regexp against a string
2122 */
2123I32
5aaab254 2124Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2125 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2126/* stringarg: the point in the string at which to begin matching */
2127/* strend: pointer to null at end of string */
2128/* strbeg: real beginning of string */
2129/* minend: end of match must be >= minend bytes after stringarg. */
2130/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2131 * itself is accessed via the pointers above */
2132/* data: May be used for some additional optimizations.
2133 Currently its only used, with a U32 cast, for transmitting
2134 the ganch offset when doing a /g match. This will change */
2135/* nosave: For optimizations. */
2136
6eb5f6b9 2137{
97aff369 2138 dVAR;
8d919b0a 2139 struct regexp *const prog = ReANY(rx);
5aaab254 2140 char *s;
eb578fdb 2141 regnode *c;
5aaab254 2142 char *startpos = stringarg;
6eb5f6b9
JH
2143 I32 minlen; /* must match at least this many chars */
2144 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2145 I32 end_shift = 0; /* Same for the end. */ /* CC */
2146 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2147 char *scream_olds = NULL;
f2ed9b32 2148 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2149 I32 multiline;
f8fc2ecf 2150 RXi_GET_DECL(prog,progi);
3b0527fe 2151 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2152 regexp_paren_pair *swap = NULL;
a3621e74
YO
2153 GET_RE_DEBUG_FLAGS_DECL;
2154
7918f24d 2155 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2156 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2157
2158 /* Be paranoid... */
2159 if (prog == NULL || startpos == NULL) {
2160 Perl_croak(aTHX_ "NULL regexp parameter");
2161 return 0;
2162 }
2163
bbe252da 2164 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2165 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2166
f2ed9b32 2167 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2168 DEBUG_EXECUTE_r(
f2ed9b32 2169 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2170 "Matching");
2171 );
bac06658 2172
6eb5f6b9 2173 minlen = prog->minlen;
1de06328
YO
2174
2175 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2177 "String too short [regexec_flags]...\n"));
2178 goto phooey;
1aa99e6b 2179 }
6eb5f6b9 2180
1de06328 2181
6eb5f6b9 2182 /* Check validity of program. */
f8fc2ecf 2183 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2184 Perl_croak(aTHX_ "corrupted regexp program");
2185 }
2186
2187 PL_reg_flags = 0;
ed301438 2188 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2189 PL_reg_maxiter = 0;
2190
3c8556c3 2191 if (RX_UTF8(rx))
6eb5f6b9
JH
2192 PL_reg_flags |= RF_utf8;
2193
2194 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2195 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2196 PL_bostr = strbeg;
3b0527fe 2197 reginfo.sv = sv;
6eb5f6b9
JH
2198
2199 /* Mark end of line for $ (and such) */
2200 PL_regeol = strend;
2201
2202 /* see how far we have to get to not match where we matched before */
3b0527fe 2203 reginfo.till = startpos+minend;
6eb5f6b9 2204
6eb5f6b9
JH
2205 /* If there is a "must appear" string, look for it. */
2206 s = startpos;
2207
bbe252da 2208 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2209 MAGIC *mg;
2c296965 2210 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2211 reginfo.ganch = startpos + prog->gofs;
2c296965 2212 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2213 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2214 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2215 && SvMAGIC(sv)
14befaf4
DM
2216 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2217 && mg->mg_len >= 0) {
3b0527fe 2218 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2219 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2220 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2221
bbe252da 2222 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2223 if (s > reginfo.ganch)
6eb5f6b9 2224 goto phooey;
58e23c8d 2225 s = reginfo.ganch - prog->gofs;
2c296965 2226 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2227 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2228 if (s < strbeg)
2229 goto phooey;
6eb5f6b9
JH
2230 }
2231 }
58e23c8d 2232 else if (data) {
70685ca0 2233 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2234 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2235 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2236
2237 } else { /* pos() not defined */
3b0527fe 2238 reginfo.ganch = strbeg;
2c296965
YO
2239 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2240 "GPOS: reginfo.ganch = strbeg\n"));
2241 }
6eb5f6b9 2242 }
288b8c02 2243 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2244 /* We have to be careful. If the previous successful match
2245 was from this regex we don't want a subsequent partially
2246 successful match to clobber the old results.
2247 So when we detect this possibility we add a swap buffer
2248 to the re, and switch the buffer each match. If we fail
2249 we switch it back, otherwise we leave it swapped.
2250 */
2251 swap = prog->offs;
2252 /* do we need a save destructor here for eval dies? */
2253 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2254 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2255 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2256 PTR2UV(prog),
2257 PTR2UV(swap),
2258 PTR2UV(prog->offs)
2259 ));
c74340f9 2260 }
a0714e2c 2261 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2262 re_scream_pos_data d;
2263
2264 d.scream_olds = &scream_olds;
2265 d.scream_pos = &scream_pos;
288b8c02 2266 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2267 if (!s) {
a3621e74 2268 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2269 goto phooey; /* not present */
3fa9c3d7 2270 }
6eb5f6b9
JH
2271 }
2272
1de06328 2273
6eb5f6b9
JH
2274
2275 /* Simplest case: anchored match need be tried only once. */
2276 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2277 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2278 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2279 goto got_it;
bbe252da
YO
2280 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2281 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2282 {
2283 char *end;
2284
2285 if (minlen)
2286 dontbother = minlen - 1;
1aa99e6b 2287 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2288 /* for multiline we only have to try after newlines */
33b8afdf 2289 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2290 /* because of the goto we can not easily reuse the macros for bifurcating the
2291 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2292 if (utf8_target) {
2293 if (s == startpos)
2294 goto after_try_utf8;
2295 while (1) {
2296 if (regtry(&reginfo, &s)) {
2297 goto got_it;
2298 }
2299 after_try_utf8:
2300 if (s > end) {
2301 goto phooey;
2302 }
2303 if (prog->extflags & RXf_USE_INTUIT) {
2304 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2305 if (!s) {
2306 goto phooey;
2307 }
2308 }
2309 else {
2310 s += UTF8SKIP(s);
2311 }
2312 }
2313 } /* end search for check string in unicode */
2314 else {
2315 if (s == startpos) {
2316 goto after_try_latin;
2317 }
2318 while (1) {
2319 if (regtry(&reginfo, &s)) {
2320 goto got_it;
2321 }
2322 after_try_latin:
2323 if (s > end) {
2324 goto phooey;
2325 }
2326 if (prog->extflags & RXf_USE_INTUIT) {
2327 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2328 if (!s) {
2329 goto phooey;
2330 }
2331 }
2332 else {
2333 s++;
2334 }
2335 }
2336 } /* end search for check string in latin*/
2337 } /* end search for check string */
2338 else { /* search for newline */
2339 if (s > startpos) {
2340 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2341 s--;
92f3d482 2342 }
21eede78
YO
2343 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2344 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2345 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2346 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2347 goto got_it;
2348 }
92f3d482
YO
2349 }
2350 } /* end search for newline */
2351 } /* end anchored/multiline check string search */
6eb5f6b9 2352 goto phooey;
bbe252da 2353 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2354 {
486ec47a 2355 /* the warning about reginfo.ganch being used without initialization
bbe252da 2356 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2357 and we only enter this block when the same bit is set. */
58e23c8d 2358 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2359
2360 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2361 goto got_it;
2362 goto phooey;
2363 }
2364
2365 /* Messy cases: unanchored match. */
bbe252da 2366 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2367 /* we have /x+whatever/ */
f2ed9b32 2368 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2369 char ch;
bf93d4cc
GS
2370#ifdef DEBUGGING
2371 int did_match = 0;
2372#endif
f2ed9b32 2373 if (utf8_target) {
7e0d5ad7
KW
2374 if (! prog->anchored_utf8) {
2375 to_utf8_substr(prog);
2376 }
2377 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2378 REXEC_FBC_SCAN(
6eb5f6b9 2379 if (*s == ch) {
a3621e74 2380 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2381 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2382 s += UTF8SKIP(s);
2383 while (s < strend && *s == ch)
2384 s += UTF8SKIP(s);
2385 }
4cadc6a9 2386 );
7e0d5ad7 2387
6eb5f6b9
JH
2388 }
2389 else {
7e0d5ad7
KW
2390 if (! prog->anchored_substr) {
2391 if (! to_byte_substr(prog)) {
6b54ddc5 2392 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2393 }
2394 }
2395 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2396 REXEC_FBC_SCAN(
6eb5f6b9 2397 if (*s == ch) {
a3621e74 2398 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2399 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2400 s++;
2401 while (s < strend && *s == ch)
2402 s++;
2403 }
4cadc6a9 2404 );
6eb5f6b9 2405 }
a3621e74 2406 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2407 PerlIO_printf(Perl_debug_log,
b7953727
JH
2408 "Did not find anchored character...\n")
2409 );
6eb5f6b9 2410 }
a0714e2c
SS
2411 else if (prog->anchored_substr != NULL
2412 || prog->anchored_utf8 != NULL
2413 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2414 && prog->float_max_offset < strend - s)) {
2415 SV *must;
2416 I32 back_max;
2417 I32 back_min;
2418 char *last;
6eb5f6b9 2419 char *last1; /* Last position checked before */
bf93d4cc
GS
2420#ifdef DEBUGGING
2421 int did_match = 0;
2422#endif
33b8afdf 2423 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2424 if (utf8_target) {
2425 if (! prog->anchored_utf8) {
2426 to_utf8_substr(prog);
2427 }
2428 must = prog->anchored_utf8;
2429 }
2430 else {
2431 if (! prog->anchored_substr) {
2432 if (! to_byte_substr(prog)) {
6b54ddc5 2433 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2434 }
2435 }
2436 must = prog->anchored_substr;
2437 }
33b8afdf
JH
2438 back_max = back_min = prog->anchored_offset;
2439 } else {
7e0d5ad7
KW
2440 if (utf8_target) {
2441 if (! prog->float_utf8) {
2442 to_utf8_substr(prog);
2443 }
2444 must = prog->float_utf8;
2445 }
2446 else {
2447 if (! prog->float_substr) {
2448 if (! to_byte_substr(prog)) {
6b54ddc5 2449 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2450 }
2451 }
2452 must = prog->float_substr;
2453 }
33b8afdf
JH
2454 back_max = prog->float_max_offset;
2455 back_min = prog->float_min_offset;
2456 }
1de06328 2457
1de06328
YO
2458 if (back_min<0) {
2459 last = strend;
2460 } else {
2461 last = HOP3c(strend, /* Cannot start after this */
2462 -(I32)(CHR_SVLEN(must)
2463 - (SvTAIL(must) != 0) + back_min), strbeg);
2464 }
6eb5f6b9
JH
2465 if (s > PL_bostr)
2466 last1 = HOPc(s, -1);
2467 else
2468 last1 = s - 1; /* bogus */
2469
a0288114 2470 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2471 check_substr==must. */
2472 scream_pos = -1;
2473 dontbother = end_shift;
2474 strend = HOPc(strend, -dontbother);
2475 while ( (s <= last) &&
c33e64f0 2476 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2477 (unsigned char*)strend, must,
c33e64f0 2478 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2479 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2480 if (HOPc(s, -back_max) > last1) {
2481 last1 = HOPc(s, -back_min);
2482 s = HOPc(s, -back_max);
2483 }
2484 else {
52657f30 2485 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2486
2487 last1 = HOPc(s, -back_min);
52657f30 2488 s = t;
6eb5f6b9 2489 }
f2ed9b32 2490 if (utf8_target) {
6eb5f6b9 2491 while (s <= last1) {
24b23f37 2492 if (regtry(&reginfo, &s))
6eb5f6b9 2493 goto got_it;
7016d6eb
DM
2494 if (s >= last1) {
2495 s++; /* to break out of outer loop */
2496 break;
2497 }
2498 s += UTF8SKIP(s);
6eb5f6b9
JH
2499 }
2500 }
2501 else {
2502 while (s <= last1) {
24b23f37 2503 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2504 goto got_it;
2505 s++;
2506 }
2507 }
2508 }
ab3bbdeb 2509 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2510 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2511 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2512 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2513 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2514 ? "anchored" : "floating"),
ab3bbdeb
YO
2515 quoted, RE_SV_TAIL(must));
2516 });
6eb5f6b9
JH
2517 goto phooey;
2518 }
f8fc2ecf 2519 else if ( (c = progi->regstclass) ) {
f14c76ed 2520 if (minlen) {
f8fc2ecf 2521 const OPCODE op = OP(progi->regstclass);
66e933ab 2522 /* don't bother with what can't match */
786e8c11 2523 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2524 strend = HOPc(strend, -(minlen - 1));
2525 }
a3621e74 2526 DEBUG_EXECUTE_r({
be8e71aa 2527 SV * const prop = sv_newmortal();
32fc9b6a 2528 regprop(prog, prop, c);
0df25f3d 2529 {
f2ed9b32 2530 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2531 s,strend-s,60);
0df25f3d 2532 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2533 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2534 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2535 quoted, (int)(strend - s));
0df25f3d 2536 }
ffc61ed2 2537 });
3b0527fe 2538 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2539 goto got_it;
07be1b83 2540 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2541 }
2542 else {
2543 dontbother = 0;
a0714e2c 2544 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2545 /* Trim the end. */
6af40bd7 2546 char *last= NULL;
33b8afdf 2547 SV* float_real;
c33e64f0
FC
2548 STRLEN len;
2549 const char *little;
33b8afdf 2550
7e0d5ad7
KW
2551 if (utf8_target) {
2552 if (! prog->float_utf8) {
2553 to_utf8_substr(prog);
2554 }
2555 float_real = prog->float_utf8;
2556 }
2557 else {
2558 if (! prog->float_substr) {
2559 if (! to_byte_substr(prog)) {
6b54ddc5 2560 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2561 }
2562 }
2563 float_real = prog->float_substr;
2564 }
d6a28714 2565
c33e64f0
FC
2566 little = SvPV_const(float_real, len);
2567 if (SvTAIL(float_real)) {
7f18ad16
KW
2568 /* This means that float_real contains an artificial \n on
2569 * the end due to the presence of something like this:
2570 * /foo$/ where we can match both "foo" and "foo\n" at the
2571 * end of the string. So we have to compare the end of the
2572 * string first against the float_real without the \n and
2573 * then against the full float_real with the string. We
2574 * have to watch out for cases where the string might be
2575 * smaller than the float_real or the float_real without
2576 * the \n. */
1a13b075
YO
2577 char *checkpos= strend - len;
2578 DEBUG_OPTIMISE_r(
2579 PerlIO_printf(Perl_debug_log,
2580 "%sChecking for float_real.%s\n",
2581 PL_colors[4], PL_colors[5]));
2582 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2583 /* can't match, even if we remove the trailing \n
2584 * string is too short to match */
1a13b075
YO
2585 DEBUG_EXECUTE_r(
2586 PerlIO_printf(Perl_debug_log,
2587 "%sString shorter than required trailing substring, cannot match.%s\n",
2588 PL_colors[4], PL_colors[5]));
2589 goto phooey;
2590 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2591 /* can match, the end of the string matches without the
2592 * "\n" */
1a13b075
YO
2593 last = checkpos + 1;
2594 } else if (checkpos < strbeg) {
7f18ad16
KW
2595 /* cant match, string is too short when the "\n" is
2596 * included */
1a13b075
YO
2597 DEBUG_EXECUTE_r(
2598 PerlIO_printf(Perl_debug_log,
2599 "%sString does not contain required trailing substring, cannot match.%s\n",
2600 PL_colors[4], PL_colors[5]));
2601 goto phooey;
2602 } else if (!multiline) {
7f18ad16
KW
2603 /* non multiline match, so compare with the "\n" at the
2604 * end of the string */
1a13b075
YO
2605 if (memEQ(checkpos, little, len)) {
2606 last= checkpos;
2607 } else {
2608 DEBUG_EXECUTE_r(
2609 PerlIO_printf(Perl_debug_log,
2610 "%sString does not contain required trailing substring, cannot match.%s\n",
2611 PL_colors[4], PL_colors[5]));
2612 goto phooey;
2613 }
2614 } else {
7f18ad16
KW
2615 /* multiline match, so we have to search for a place
2616 * where the full string is located */
d6a28714 2617 goto find_last;
1a13b075 2618 }
c33e64f0 2619 } else {
d6a28714 2620 find_last:
9041c2e3 2621 if (len)
d6a28714 2622 last = rninstr(s, strend, little, little + len);
b8c5462f 2623 else
a0288114 2624 last = strend; /* matching "$" */
b8c5462f 2625 }
6af40bd7 2626 if (!last) {
7f18ad16
KW
2627 /* at one point this block contained a comment which was
2628 * probably incorrect, which said that this was a "should not
2629 * happen" case. Even if it was true when it was written I am
2630 * pretty sure it is not anymore, so I have removed the comment
2631 * and replaced it with this one. Yves */
6bda09f9
YO
2632 DEBUG_EXECUTE_r(
2633 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2634 "String does not contain required substring, cannot match.\n"
2635 ));
2636 goto phooey;
bf93d4cc 2637 }
d6a28714
JH
2638 dontbother = strend - last + prog->float_min_offset;
2639 }
2640 if (minlen && (dontbother < minlen))
2641 dontbother = minlen - 1;
2642 strend -= dontbother; /* this one's always in bytes! */
2643 /* We don't know much -- general case. */
f2ed9b32 2644 if (utf8_target) {
d6a28714 2645 for (;;) {
24b23f37 2646 if (regtry(&reginfo, &s))
d6a28714
JH
2647 goto got_it;
2648 if (s >= strend)
2649 break;
b8c5462f 2650 s += UTF8SKIP(s);
d6a28714
JH
2651 };
2652 }
2653 else {
2654 do {
24b23f37 2655 if (regtry(&reginfo, &s))
d6a28714
JH
2656 goto got_it;
2657 } while (s++ < strend);
2658 }
2659 }
2660
2661 /* Failure. */
2662 goto phooey;
2663
2664got_it:
495f47a5
DM
2665 DEBUG_BUFFERS_r(
2666 if (swap)
2667 PerlIO_printf(Perl_debug_log,
2668 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2669 PTR2UV(prog),
2670 PTR2UV(swap)
2671 );
2672 );
e9105d30 2673 Safefree(swap);
288b8c02 2674 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2675
ed301438 2676 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2677 restore_pos(aTHX_ prog);
5daac39c
NC
2678 if (RXp_PAREN_NAMES(prog))
2679 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2680
2681 /* make sure $`, $&, $', and $digit will work later */
2682 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2683 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2684#ifdef PERL_ANY_COW
2685 if (SvCANCOW(sv)) {
ed252734
NC
2686 if (DEBUG_C_TEST) {
2687 PerlIO_printf(Perl_debug_log,
2688 "Copy on write: regexp capture, type %d\n",
2689 (int) SvTYPE(sv));
2690 }
77f8f7c1 2691 RX_MATCH_COPY_FREE(rx);
ed252734 2692 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2693 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2694 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2695 prog->sublen = PL_regeol - strbeg;
2696 prog->suboffset = 0;
2697 prog->subcoffset = 0;
ed252734
NC
2698 } else
2699#endif
2700 {
6502e081
DM
2701 I32 min = 0;
2702 I32 max = PL_regeol - strbeg;
2703 I32 sublen;
2704
2705 if ( (flags & REXEC_COPY_SKIP_POST)
2706 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2707 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2708 ) { /* don't copy $' part of string */
3de645a8 2709 U32 n = 0;
6502e081
DM
2710 max = -1;
2711 /* calculate the right-most part of the string covered
2712 * by a capture. Due to look-ahead, this may be to
2713 * the right of $&, so we have to scan all captures */
2714 while (n <= prog->lastparen) {
2715 if (prog->offs[n].end > max)
2716 max = prog->offs[n].end;
2717 n++;
2718 }
2719 if (max == -1)
2720 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2721 ? prog->offs[0].start
2722 : 0;
2723 assert(max >= 0 && max <= PL_regeol - strbeg);
2724 }
2725
2726 if ( (flags & REXEC_COPY_SKIP_PRE)
2727 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2728 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2729 ) { /* don't copy $` part of string */
3de645a8 2730 U32 n = 0;
6502e081
DM
2731 min = max;
2732 /* calculate the left-most part of the string covered
2733 * by a capture. Due to look-behind, this may be to
2734 * the left of $&, so we have to scan all captures */
2735 while (min && n <= prog->lastparen) {
2736 if ( prog->offs[n].start != -1
2737 && prog->offs[n].start < min)
2738 {
2739 min = prog->offs[n].start;
2740 }
2741 n++;
2742 }
2743 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2744 && min > prog->offs[0].end
2745 )
2746 min = prog->offs[0].end;
2747
2748 }
2749
2750 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2751 sublen = max - min;
2752
2753 if (RX_MATCH_COPIED(rx)) {
2754 if (sublen > prog->sublen)
2755 prog->subbeg =
2756 (char*)saferealloc(prog->subbeg, sublen+1);
2757 }
2758 else
2759 prog->subbeg = (char*)safemalloc(sublen+1);
2760 Copy(strbeg + min, prog->subbeg, sublen, char);
2761 prog->subbeg[sublen] = '\0';
2762 prog->suboffset = min;
2763 prog->sublen = sublen;
77f8f7c1 2764 RX_MATCH_COPIED_on(rx);
6502e081 2765 }
6502e081
DM
2766 prog->subcoffset = prog->suboffset;
2767 if (prog->suboffset && utf8_target) {
2768 /* Convert byte offset to chars.
2769 * XXX ideally should only compute this if @-/@+
2770 * has been seen, a la PL_sawampersand ??? */
2771
2772 /* If there's a direct correspondence between the
2773 * string which we're matching and the original SV,
2774 * then we can use the utf8 len cache associated with
2775 * the SV. In particular, it means that under //g,
2776 * sv_pos_b2u() will use the previously cached
2777 * position to speed up working out the new length of
2778 * subcoffset, rather than counting from the start of
2779 * the string each time. This stops
2780 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2781 * from going quadratic */
2782 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2783 sv_pos_b2u(sv, &(prog->subcoffset));
2784 else
2785 prog->subcoffset = utf8_length((U8*)strbeg,
2786 (U8*)(strbeg+prog->suboffset));
2787 }
d6a28714
JH
2788 }
2789 else {
6502e081 2790 RX_MATCH_COPY_FREE(rx);
d6a28714 2791 prog->subbeg = strbeg;
6502e081
DM
2792 prog->suboffset = 0;
2793 prog->subcoffset = 0;
d6a28714
JH
2794 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2795 }
2796 }
9041c2e3 2797
d6a28714
JH
2798 return 1;
2799
2800phooey:
a3621e74 2801 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2802 PL_colors[4], PL_colors[5]));
ed301438 2803 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2804 restore_pos(aTHX_ prog);
e9105d30 2805 if (swap) {
c74340f9 2806 /* we failed :-( roll it back */
495f47a5
DM
2807 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2808 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2809 PTR2UV(prog),
2810 PTR2UV(prog->offs),
2811 PTR2UV(swap)
2812 ));
e9105d30
GG
2813 Safefree(prog->offs);
2814 prog->offs = swap;
2815 }
d6a28714
JH
2816 return 0;
2817}
2818
6bda09f9 2819
ec43f78b
DM
2820/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2821 * Do inc before dec, in case old and new rex are the same */
2822#define SET_reg_curpm(Re2) \
2823 if (PL_reg_state.re_state_eval_setup_done) { \
2824 (void)ReREFCNT_inc(Re2); \
2825 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2826 PM_SETRE((PL_reg_curpm), (Re2)); \
2827 }
2828
2829
d6a28714
JH
2830/*
2831 - regtry - try match at specific point
2832 */
2833STATIC I32 /* 0 failure, 1 success */
f73aaa43 2834S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2835{
97aff369 2836 dVAR;
d6a28714 2837 CHECKPOINT lastcp;
288b8c02 2838 REGEXP *const rx = reginfo->prog;
8d919b0a 2839 regexp *const prog = ReANY(rx);
f73aaa43 2840 I32 result;
f8fc2ecf 2841 RXi_GET_DECL(prog,progi);
a3621e74 2842 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2843
2844 PERL_ARGS_ASSERT_REGTRY;
2845
24b23f37 2846 reginfo->cutpoint=NULL;
d6a28714 2847
ed301438
DM
2848 if ((prog->extflags & RXf_EVAL_SEEN)
2849 && !PL_reg_state.re_state_eval_setup_done)
2850 {
d6a28714
JH
2851 MAGIC *mg;
2852
ed301438 2853 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2854 if (reginfo->sv) {
d6a28714 2855 /* Make $_ available to executed code. */
3b0527fe 2856 if (reginfo->sv != DEFSV) {
59f00321 2857 SAVE_DEFSV;
414bf5ae 2858 DEFSV_set(reginfo->sv);
b8c5462f 2859 }
d6a28714 2860
3b0527fe
DM
2861 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2862 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2863 /* prepare for quick setting of pos */
d300d9fa 2864#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2865 if (SvIsCOW(reginfo->sv))
2866 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2867#endif
3dab1dad 2868 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2869 &PL_vtbl_mglob, NULL, 0);
d6a28714 2870 mg->mg_len = -1;
b8c5462f 2871 }
d6a28714
JH
2872 PL_reg_magic = mg;
2873 PL_reg_oldpos = mg->mg_len;
4f639d21 2874 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2875 }
09687e5a 2876 if (!PL_reg_curpm) {
a02a5408 2877 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2878#ifdef USE_ITHREADS
2879 {
14a49a24 2880 SV* const repointer = &PL_sv_undef;
92313705
NC
2881 /* this regexp is also owned by the new PL_reg_curpm, which
2882 will try to free it. */
d2ece331 2883 av_push(PL_regex_padav, repointer);
09687e5a
AB
2884 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2885 PL_regex_pad = AvARRAY(PL_regex_padav);
2886 }
2887#endif
2888 }
ec43f78b 2889 SET_reg_curpm(rx);
d6a28714
JH
2890 PL_reg_oldcurpm = PL_curpm;
2891 PL_curpm = PL_reg_curpm;
07bc277f 2892 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2893 /* Here is a serious problem: we cannot rewrite subbeg,
2894 since it may be needed if this match fails. Thus
2895 $` inside (?{}) could fail... */
2896 PL_reg_oldsaved = prog->subbeg;
2897 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2898 PL_reg_oldsavedoffset = prog->suboffset;
2899 PL_reg_oldsavedcoffset = prog->suboffset;
db2c6cb3 2900#ifdef PERL_ANY_COW
ed252734
NC
2901 PL_nrs = prog->saved_copy;
2902#endif
07bc277f 2903 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2904 }
2905 else
bd61b366 2906 PL_reg_oldsaved = NULL;
d6a28714 2907 prog->subbeg = PL_bostr;
6502e081
DM
2908 prog->suboffset = 0;
2909 prog->subcoffset = 0;
d6a28714
JH
2910 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2911 }
97ca13b7 2912#ifdef DEBUGGING
f73aaa43 2913 PL_reg_starttry = *startposp;
97ca13b7 2914#endif
f73aaa43 2915 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2916 prog->lastparen = 0;
03994de8 2917 prog->lastcloseparen = 0;
d6a28714
JH
2918
2919 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2920 to do this again and again, prog->lastparen should take care of
3dd2943c 2921 this! --ilya*/
dafc8851
JH
2922
2923 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2924 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2925 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2926 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2927 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2928 * Meanwhile, this code *is* needed for the
daf18116
JH
2929 * above-mentioned test suite tests to succeed. The common theme
2930 * on those tests seems to be returning null fields from matches.
225593e1 2931 * --jhi updated by dapm */
dafc8851 2932#if 1
d6a28714 2933 if (prog->nparens) {
b93070ed 2934 regexp_paren_pair *pp = prog->offs;
eb578fdb 2935 I32 i;
b93070ed 2936 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2937 ++pp;
2938 pp->start = -1;
2939 pp->end = -1;
d6a28714
JH
2940 }
2941 }
dafc8851 2942#endif
02db2b7b 2943 REGCP_SET(lastcp);
f73aaa43
DM
2944 result = regmatch(reginfo, *startposp, progi->program + 1);
2945 if (result != -1) {
2946 prog->offs[0].end = result;
d6a28714
JH
2947 return 1;
2948 }
24b23f37 2949 if (reginfo->cutpoint)
f73aaa43 2950 *startposp= reginfo->cutpoint;
02db2b7b 2951 REGCP_UNWIND(lastcp);
d6a28714
JH
2952 return 0;
2953}
2954
02db2b7b 2955
8ba1375e
MJD
2956#define sayYES goto yes
2957#define sayNO goto no
262b90c4 2958#define sayNO_SILENT goto no_silent
8ba1375e 2959
f9f4320a
YO
2960/* we dont use STMT_START/END here because it leads to
2961 "unreachable code" warnings, which are bogus, but distracting. */
2962#define CACHEsayNO \
c476f425
DM
2963 if (ST.cache_mask) \
2964 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2965 sayNO
3298f257 2966
a3621e74 2967/* this is used to determine how far from the left messages like
265c4333
YO
2968 'failed...' are printed. It should be set such that messages
2969 are inline with the regop output that created them.
a3621e74 2970*/
265c4333 2971#define REPORT_CODE_OFF 32
a3621e74
YO
2972
2973
40a82448
DM
2974#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2975#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2976#define CHRTEST_NOT_A_CP_1 -999
2977#define CHRTEST_NOT_A_CP_2 -998
9e137952 2978
86545054
DM
2979#define SLAB_FIRST(s) (&(s)->states[0])
2980#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2981
5d9a96ca
DM
2982/* grab a new slab and return the first slot in it */
2983
2984STATIC regmatch_state *
2985S_push_slab(pTHX)
2986{
a35a87e7 2987#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2988 dMY_CXT;
2989#endif
5d9a96ca
DM
2990 regmatch_slab *s = PL_regmatch_slab->next;
2991 if (!s) {
2992 Newx(s, 1, regmatch_slab);
2993 s->prev = PL_regmatch_slab;
2994 s->next = NULL;
2995 PL_regmatch_slab->next = s;
2996 }
2997 PL_regmatch_slab = s;
86545054 2998 return SLAB_FIRST(s);
5d9a96ca 2999}
5b47454d 3000
95b24440 3001
40a82448
DM
3002/* push a new state then goto it */
3003
4d5016e5
DM
3004#define PUSH_STATE_GOTO(state, node, input) \
3005 pushinput = input; \
40a82448
DM
3006 scan = node; \
3007 st->resume_state = state; \
3008 goto push_state;
3009
3010/* push a new state with success backtracking, then goto it */
3011
4d5016e5
DM
3012#define PUSH_YES_STATE_GOTO(state, node, input) \
3013 pushinput = input; \
40a82448
DM
3014 scan = node; \
3015 st->resume_state = state; \
3016 goto push_yes_state;
3017
aa283a38 3018
aa283a38 3019
4d5016e5 3020
d6a28714 3021/*
95b24440 3022
bf1f174e
DM
3023regmatch() - main matching routine
3024
3025This is basically one big switch statement in a loop. We execute an op,
3026set 'next' to point the next op, and continue. If we come to a point which
3027we may need to backtrack to on failure such as (A|B|C), we push a
3028backtrack state onto the backtrack stack. On failure, we pop the top
3029state, and re-enter the loop at the state indicated. If there are no more
3030states to pop, we return failure.
3031
3032Sometimes we also need to backtrack on success; for example /A+/, where
3033after successfully matching one A, we need to go back and try to
3034match another one; similarly for lookahead assertions: if the assertion
3035completes successfully, we backtrack to the state just before the assertion
3036and then carry on. In these cases, the pushed state is marked as
3037'backtrack on success too'. This marking is in fact done by a chain of
3038pointers, each pointing to the previous 'yes' state. On success, we pop to
3039the nearest yes state, discarding any intermediate failure-only states.
3040Sometimes a yes state is pushed just to force some cleanup code to be
3041called at the end of a successful match or submatch; e.g. (??{$re}) uses
3042it to free the inner regex.
3043
3044Note that failure backtracking rewinds the cursor position, while
3045success backtracking leaves it alone.
3046
3047A pattern is complete when the END op is executed, while a subpattern
3048such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3049ops trigger the "pop to last yes state if any, otherwise return true"
3050behaviour.
3051
3052A common convention in this function is to use A and B to refer to the two
3053subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3054the subpattern to be matched possibly multiple times, while B is the entire
3055rest of the pattern. Variable and state names reflect this convention.
3056
3057The states in the main switch are the union of ops and failure/success of
3058substates associated with with that op. For example, IFMATCH is the op
3059that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3060'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3061successfully matched A and IFMATCH_A_fail is a state saying that we have
3062just failed to match A. Resume states always come in pairs. The backtrack
3063state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3064at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3065on success or failure.
3066
3067The struct that holds a backtracking state is actually a big union, with
3068one variant for each major type of op. The variable st points to the
3069top-most backtrack struct. To make the code clearer, within each
3070block of code we #define ST to alias the relevant union.
3071
3072Here's a concrete example of a (vastly oversimplified) IFMATCH
3073implementation:
3074
3075 switch (state) {
3076 ....
3077
3078#define ST st->u.ifmatch
3079
3080 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3081 ST.foo = ...; // some state we wish to save
95b24440 3082 ...
bf1f174e
DM
3083 // push a yes backtrack state with a resume value of
3084 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3085 // first node of A:
4d5016e5 3086 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3087 // NOTREACHED
3088
3089 case IFMATCH_A: // we have successfully executed A; now continue with B
3090 next = B;
3091 bar = ST.foo; // do something with the preserved value
3092 break;
3093
3094 case IFMATCH_A_fail: // A failed, so the assertion failed
3095 ...; // do some housekeeping, then ...
3096 sayNO; // propagate the failure
3097
3098#undef ST
95b24440 3099
bf1f174e
DM
3100 ...
3101 }
95b24440 3102
bf1f174e
DM
3103For any old-timers reading this who are familiar with the old recursive
3104approach, the code above is equivalent to:
95b24440 3105
bf1f174e
DM
3106 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3107 {
3108 int foo = ...
95b24440 3109 ...
bf1f174e
DM
3110 if (regmatch(A)) {
3111 next = B;
3112 bar = foo;
3113 break;
95b24440 3114 }
bf1f174e
DM
3115 ...; // do some housekeeping, then ...
3116 sayNO; // propagate the failure
95b24440 3117 }
bf1f174e
DM
3118
3119The topmost backtrack state, pointed to by st, is usually free. If you
3120want to claim it, populate any ST.foo fields in it with values you wish to
3121save, then do one of
3122
4d5016e5
DM
3123 PUSH_STATE_GOTO(resume_state, node, newinput);
3124 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3125
3126which sets that backtrack state's resume value to 'resume_state', pushes a
3127new free entry to the top of the backtrack stack, then goes to 'node'.
3128On backtracking, the free slot is popped, and the saved state becomes the
3129new free state. An ST.foo field in this new top state can be temporarily
3130accessed to retrieve values, but once the main loop is re-entered, it
3131becomes available for reuse.
3132
3133Note that the depth of the backtrack stack constantly increases during the
3134left-to-right execution of the pattern, rather than going up and down with
3135the pattern nesting. For example the stack is at its maximum at Z at the
3136end of the pattern, rather than at X in the following:
3137
3138 /(((X)+)+)+....(Y)+....Z/
3139
3140The only exceptions to this are lookahead/behind assertions and the cut,
3141(?>A), which pop all the backtrack states associated with A before
3142continuing.
3143
486ec47a 3144Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3145PL_regmatch_state and st always point to the currently active state,
3146and PL_regmatch_slab points to the slab currently containing
3147PL_regmatch_state. The first time regmatch() is called, the first slab is
3148allocated, and is never freed until interpreter destruction. When the slab
3149is full, a new one is allocated and chained to the end. At exit from
3150regmatch(), slabs allocated since entry are freed.
3151
3152*/
95b24440 3153
40a82448 3154
5bc10b2c 3155#define DEBUG_STATE_pp(pp) \
265c4333 3156 DEBUG_STATE_r({ \
f2ed9b32 3157 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3158 PerlIO_printf(Perl_debug_log, \
5d458dd8 3159 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3160 depth*2, "", \
13d6edb4 3161 PL_reg_name[st->resume_state], \
5d458dd8
YO
3162 ((st==yes_state||st==mark_state) ? "[" : ""), \
3163 ((st==yes_state) ? "Y" : ""), \
3164 ((st==mark_state) ? "M" : ""), \
3165 ((st==yes_state||st==mark_state) ? "]" : "") \
3166 ); \
265c4333 3167 });
5bc10b2c 3168
40a82448 3169
3dab1dad 3170#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3171
3df15adc 3172#ifdef DEBUGGING
5bc10b2c 3173
ab3bbdeb 3174STATIC void
f2ed9b32 3175S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3176 const char *start, const char *end, const char *blurb)
3177{
efd26800 3178 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3179
3180 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3181
ab3bbdeb
YO
3182 if (!PL_colorset)
3183 reginitcolors();
3184 {
3185 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3186 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3187
f2ed9b32 3188 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3189 start, end - start, 60);
3190
3191 PerlIO_printf(Perl_debug_log,
3192 "%s%s REx%s %s against %s\n",
3193 PL_colors[4], blurb, PL_colors[5], s0, s1);
3194
f2ed9b32 3195 if (utf8_target||utf8_pat)
1de06328
YO
3196 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3197 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3198 utf8_pat && utf8_target ? " and " : "",
3199 utf8_target ? "string" : ""
ab3bbdeb
YO
3200 );
3201 }
3202}
3df15adc
YO
3203
3204STATIC void
786e8c11
YO
3205S_dump_exec_pos(pTHX_ const char *locinput,
3206 const regnode *scan,
3207 const char *loc_regeol,
3208 const char *loc_bostr,
3209 const char *loc_reg_starttry,
f2ed9b32 3210 const bool utf8_target)
07be1b83 3211{
786e8c11 3212 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3213 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3214 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3215 /* The part of the string before starttry has one color
3216 (pref0_len chars), between starttry and current
3217 position another one (pref_len - pref0_len chars),
3218 after the current position the third one.
3219 We assume that pref0_len <= pref_len, otherwise we
3220 decrease pref0_len. */
786e8c11
YO
3221 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3222 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3223 int pref0_len;
3224
7918f24d
NC
3225 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3226
f2ed9b32 3227 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3228 pref_len++;
786e8c11
YO
3229 pref0_len = pref_len - (locinput - loc_reg_starttry);
3230 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3231 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3232 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3233 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3234 l--;
3235 if (pref0_len < 0)
3236 pref0_len = 0;
3237 if (pref0_len > pref_len)
3238 pref0_len = pref_len;
3239 {
f2ed9b32 3240 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3241
ab3bbdeb 3242 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3243 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3244
ab3bbdeb 3245 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3246 (locinput - pref_len + pref0_len),
1de06328 3247 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3248
ab3bbdeb 3249 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3250 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3251
1de06328 3252 const STRLEN tlen=len0+len1+len2;
3df15adc 3253 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3254 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3255 (IV)(locinput - loc_bostr),
07be1b83 3256 len0, s0,
07be1b83 3257 len1, s1,
07be1b83 3258 (docolor ? "" : "> <"),
07be1b83 3259 len2, s2,
f9f4320a 3260 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3261 "");
3262 }
3263}
3df15adc 3264
07be1b83
YO
3265#endif
3266
0a4db386
YO
3267/* reg_check_named_buff_matched()
3268 * Checks to see if a named buffer has matched. The data array of
3269 * buffer numbers corresponding to the buffer is expected to reside
3270 * in the regexp->data->data array in the slot stored in the ARG() of
3271 * node involved. Note that this routine doesn't actually care about the
3272 * name, that information is not preserved from compilation to execution.
3273 * Returns the index of the leftmost defined buffer with the given name
3274 * or 0 if non of the buffers matched.
3275 */
3276STATIC I32
7918f24d
NC
3277S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3278{
0a4db386 3279 I32 n;
f8fc2ecf 3280 RXi_GET_DECL(rex,rexi);
ad64d0ec 3281 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3282 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3283
3284 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3285
0a4db386 3286 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3287 if ((I32)rex->lastparen >= nums[n] &&
3288 rex->offs[nums[n]].end != -1)
0a4db386
YO
3289 {
3290 return nums[n];
3291 }
3292 }
3293 return 0;
3294}
3295
2f554ef7
DM
3296
3297/* free all slabs above current one - called during LEAVE_SCOPE */
3298
3299STATIC void
3300S_clear_backtrack_stack(pTHX_ void *p)
3301{
3302 regmatch_slab *s = PL_regmatch_slab->next;
3303 PERL_UNUSED_ARG(p);
3304
3305 if (!s)
3306 return;
3307 PL_regmatch_slab->next = NULL;
3308 while (s) {
3309 regmatch_slab * const osl = s;
3310 s = s->next;
3311 Safefree(osl);
3312 }
3313}
c74f6de9 3314static bool
79a2a0e8 3315S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
c74f6de9 3316{
79a2a0e8
KW
3317 /* This function determines if there are one or two characters that match
3318 * the first character of the passed-in EXACTish node <text_node>, and if
3319 * so, returns them in the passed-in pointers.
c74f6de9 3320 *
79a2a0e8
KW
3321 * If it determines that no possible character in the target string can
3322 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3323 * the first character in <text_node> requires UTF-8 to represent, and the
3324 * target string isn't in UTF-8.)
c74f6de9 3325 *
79a2a0e8
KW
3326 * If there are more than two characters that could match the beginning of
3327 * <text_node>, or if more context is required to determine a match or not,
3328 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3329 *
3330 * The motiviation behind this function is to allow the caller to set up
3331 * tight loops for matching. If <text_node> is of type EXACT, there is
3332 * only one possible character that can match its first character, and so
3333 * the situation is quite simple. But things get much more complicated if
3334 * folding is involved. It may be that the first character of an EXACTFish
3335 * node doesn't participate in any possible fold, e.g., punctuation, so it
3336 * can be matched only by itself. The vast majority of characters that are
3337 * in folds match just two things, their lower and upper-case equivalents.
3338 * But not all are like that; some have multiple possible matches, or match
3339 * sequences of more than one character. This function sorts all that out.
3340 *
3341 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3342 * loop of trying to match A*, we know we can't exit where the thing
3343 * following it isn't a B. And something can't be a B unless it is the
3344 * beginning of B. By putting a quick test for that beginning in a tight
3345 * loop, we can rule out things that can't possibly be B without having to
3346 * break out of the loop, thus avoiding work. Similarly, if A is a single
3347 * character, we can make a tight loop matching A*, using the outputs of
3348 * this function.
3349 *
3350 * If the target string to match isn't in UTF-8, and there aren't
3351 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3352 * the one or two possible octets (which are characters in this situation)
3353 * that can match. In all cases, if there is only one character that can
3354 * match, *<c1p> and *<c2p> will be identical.
3355 *
3356 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3357 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3358 * can match the beginning of <text_node>. They should be declared with at
3359 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3360 * undefined what these contain.) If one or both of the buffers are
3361 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3362 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3363 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3364 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3365 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3366
3367 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3368
ddb0d839
KW
3369 UV c1 = CHRTEST_NOT_A_CP_1;
3370 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3371 bool use_chrtest_void = FALSE;
3372
3373 /* Used when we have both utf8 input and utf8 output, to avoid converting
3374 * to/from code points */
3375 bool utf8_has_been_setup = FALSE;
3376
c74f6de9
KW
3377 dVAR;
3378
b4291290 3379 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3380
79a2a0e8
KW
3381 if (OP(text_node) == EXACT) {
3382
3383 /* In an exact node, only one thing can be matched, that first
3384 * character. If both the pat and the target are UTF-8, we can just
3385 * copy the input to the output, avoiding finding the code point of
3386 * that character */
3387 if (! UTF_PATTERN) {
3388 c2 = c1 = *pat;
3389 }
3390 else if (utf8_target) {
3391 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3392 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3393 utf8_has_been_setup = TRUE;
3394 }
3395 else {
3396 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3397 }
79a2a0e8
KW
3398 }
3399 else /* an EXACTFish node */
3400 if ((UTF_PATTERN
3401 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3402 pat + STR_LEN(text_node)))
3403 || (! UTF_PATTERN
3404 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3405 pat + STR_LEN(text_node))))
3406 {
3407 /* Multi-character folds require more context to sort out. Also
3408 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3409 * handled outside this routine */
3410 use_chrtest_void = TRUE;
3411 }
3412 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3413 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3414 if (c1 > 256) {
3415 /* Load the folds hash, if not already done */
3416 SV** listp;
3417 if (! PL_utf8_foldclosures) {
3418 if (! PL_utf8_tofold) {
3419 U8 dummy[UTF8_MAXBYTES+1];
3420
3421 /* Force loading this by folding an above-Latin1 char */
3422 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3423 assert(PL_utf8_tofold); /* Verify that worked */
3424 }
3425 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3426 }
3427
3428 /* The fold closures data structure is a hash with the keys being
3429 * the UTF-8 of every character that is folded to, like 'k', and
3430 * the values each an array of all code points that fold to its
3431 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3432 * not included */
3433 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3434 (char *) pat,
3435 UTF8SKIP(pat),
3436 FALSE))))
3437 {
3438 /* Not found in the hash, therefore there are no folds
3439 * containing it, so there is only a single character that
3440 * could match */
3441 c2 = c1;
3442 }
3443 else { /* Does participate in folds */
3444 AV* list = (AV*) *listp;
3445 if (av_len(list) != 1) {
3446
3447 /* If there aren't exactly two folds to this, it is outside
3448 * the scope of this function */
3449 use_chrtest_void = TRUE;
3450 }
3451 else { /* There are two. Get them */
3452 SV** c_p = av_fetch(list, 0, FALSE);
3453 if (c_p == NULL) {
3454 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3455 }
3456 c1 = SvUV(*c_p);
3457
3458 c_p = av_fetch(list, 1, FALSE);
3459 if (c_p == NULL) {
3460 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3461 }
3462 c2 = SvUV(*c_p);
3463
3464 /* Folds that cross the 255/256 boundary are forbidden if
3465 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3466 * pattern character is above 256, and its only other match
3467 * is below 256, the only legal match will be to itself.
3468 * We have thrown away the original, so have to compute
3469 * which is the one above 255 */
3470 if ((c1 < 256) != (c2 < 256)) {
3471 if (OP(text_node) == EXACTFL
3472 || (OP(text_node) == EXACTFA
3473 && (isASCII(c1) || isASCII(c2))))
3474 {
3475 if (c1 < 256) {
3476 c1 = c2;
3477 }
3478 else {
3479 c2 = c1;
3480 }
3481 }
3482 }
3483 }
3484 }
3485 }
3486 else /* Here, c1 is < 255 */
3487 if (utf8_target
3488 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3489 && OP(text_node) != EXACTFL
3490 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3491 {
3492 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3493 * folds to this character in the pattern. All such cases except
3494 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3495 * involved in their folds, so are outside the scope of this
3496 * function */
3497 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3498 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3499 }
3500 else {
3501 use_chrtest_void = TRUE;
3502 }
c74f6de9
KW
3503 }
3504 else { /* Here nothing above Latin1 can fold to the pattern character */
3505 switch (OP(text_node)) {
3506
3507 case EXACTFL: /* /l rules */
79a2a0e8 3508 c2 = PL_fold_locale[c1];
c74f6de9
KW
3509 break;
3510
3511 case EXACTF:
3512 if (! utf8_target) { /* /d rules */
79a2a0e8 3513 c2 = PL_fold[c1];
c74f6de9
KW
3514 break;
3515 }
3516 /* FALLTHROUGH */
3517 /* /u rules for all these. This happens to work for
79a2a0e8 3518 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3519 case EXACTFA:
3520 case EXACTFU_TRICKYFOLD:
79a2a0e8 3521 case EXACTFU_SS:
c74f6de9 3522 case EXACTFU:
79a2a0e8 3523 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3524 break;
3525
878531d3
KW
3526 default:
3527 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3528 assert(0); /* NOTREACHED */
c74f6de9
KW
3529 }
3530 }
3531 }
79a2a0e8
KW
3532
3533 /* Here have figured things out. Set up the returns */
3534 if (use_chrtest_void) {
3535 *c2p = *c1p = CHRTEST_VOID;
3536 }
3537 else if (utf8_target) {
3538 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3539 uvchr_to_utf8(c1_utf8, c1);
3540 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3541 }
c74f6de9 3542
79a2a0e8
KW
3543 /* Invariants are stored in both the utf8 and byte outputs; Use
3544 * negative numbers otherwise for the byte ones. Make sure that the
3545 * byte ones are the same iff the utf8 ones are the same */
3546 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3547 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3548 ? *c2_utf8
3549 : (c1 == c2)
3550 ? CHRTEST_NOT_A_CP_1
3551 : CHRTEST_NOT_A_CP_2;
3552 }
3553 else if (c1 > 255) {
3554 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3555 can represent */
3556 return FALSE;
3557 }
c74f6de9 3558
79a2a0e8
KW
3559 *c1p = *c2p = c2; /* c2 is the only representable value */
3560 }
3561 else { /* c1 is representable; see about c2 */
3562 *c1p = c1;
3563 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3564 }
2f554ef7 3565
c74f6de9
KW
3566 return TRUE;
3567}
2f554ef7 3568
f73aaa43
DM
3569/* returns -1 on failure, $+[0] on success */
3570STATIC I32
3571S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3572{
a35a87e7 3573#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3574 dMY_CXT;
3575#endif
27da23d5 3576 dVAR;
eb578fdb 3577 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3578 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3579 REGEXP *rex_sv = reginfo->prog;