This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [perl #40302] sigtrap(3pm) man page is vague
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
ce862d02 80#define RF_evaled 4 /* Did an EVAL with setting? */
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
53c4c00c 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
198#define REGCP_SET(cp) \
199 DEBUG_STATE_r( \
ab3bbdeb 200 PerlIO_printf(Perl_debug_log, \
e4f74956 201 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
202 (IV)PL_savestack_ix)); \
203 cp = PL_savestack_ix
c3464db5 204
ab3bbdeb 205#define REGCP_UNWIND(cp) \
e4f74956 206 DEBUG_STATE_r( \
ab3bbdeb 207 if (cp != PL_savestack_ix) \
e4f74956
YO
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
210 (IV)(cp), (IV)PL_savestack_ix)); \
211 regcpblow(cp)
c277df42 212
76e3520e 213STATIC char *
097eb12c 214S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 215{
97aff369 216 dVAR;
b1ce53c5 217 I32 i;
a0d0e21e 218 char *input;
b1ce53c5 219
a3621e74
YO
220 GET_RE_DEBUG_FLAGS_DECL;
221
b1ce53c5 222 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 223 i = SSPOPINT;
b1ce53c5
JH
224 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
225 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 226 input = (char *) SSPOPPTR;
a01268b5 227 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
228 *PL_reglastparen = SSPOPINT;
229 PL_regsize = SSPOPINT;
b1ce53c5
JH
230
231 /* Now restore the parentheses context. */
41123dfd
JH
232 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
233 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 234 I32 tmps;
097eb12c 235 U32 paren = (U32)SSPOPINT;
3280af22 236 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
237 PL_regstartp[paren] = SSPOPINT;
238 tmps = SSPOPINT;
3280af22
NIS
239 if (paren <= *PL_reglastparen)
240 PL_regendp[paren] = tmps;
a3621e74 241 DEBUG_EXECUTE_r(
c3464db5 242 PerlIO_printf(Perl_debug_log,
b900a521 243 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 244 (UV)paren, (IV)PL_regstartp[paren],
b900a521 245 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 246 (IV)PL_regendp[paren],
3280af22 247 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 248 );
a0d0e21e 249 }
a3621e74 250 DEBUG_EXECUTE_r(
bb7a0f54 251 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 252 PerlIO_printf(Perl_debug_log,
faccc32b 253 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 254 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
255 }
256 );
daf18116 257#if 1
dafc8851
JH
258 /* It would seem that the similar code in regtry()
259 * already takes care of this, and in fact it is in
260 * a better location to since this code can #if 0-ed out
261 * but the code in regtry() is needed or otherwise tests
262 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
263 * (as of patchlevel 7877) will fail. Then again,
264 * this code seems to be necessary or otherwise
265 * building DynaLoader will fail:
266 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
267 * --jhi */
bb7a0f54 268 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
269 if (i > PL_regsize)
270 PL_regstartp[i] = -1;
271 PL_regendp[i] = -1;
a0d0e21e 272 }
dafc8851 273#endif
a0d0e21e
LW
274 return input;
275}
276
02db2b7b 277#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 278
a687059c 279/*
e50aee73 280 * pregexec and friends
a687059c
LW
281 */
282
76234dfb 283#ifndef PERL_IN_XSUB_RE
a687059c 284/*
c277df42 285 - pregexec - match a regexp against a string
a687059c 286 */
c277df42 287I32
864dbfa3 288Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 289 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
290/* strend: pointer to null at end of string */
291/* strbeg: real beginning of string */
292/* minend: end of match must be >=minend after stringarg. */
293/* nosave: For optimizations. */
294{
295 return
9041c2e3 296 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
297 nosave ? 0 : REXEC_COPY_STR);
298}
76234dfb 299#endif
22e551b9 300
9041c2e3 301/*
cad2e5aa
JH
302 * Need to implement the following flags for reg_anch:
303 *
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * USE_INTUIT_ML
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
309 * INTUIT_ONCE_ML
310 *
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
313 */
314
315/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316
3f7c398e 317/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
319
320/* XXXX We assume that strpos is strbeg unless sv. */
321
6eb5f6b9
JH
322/* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
328
2c2d71f5
JH
329/* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
30944b6d
IZ
332 regtry() should be needed.
333
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
336 b) Fixed substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 339 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
340 string which does not contradict any of them.
341 */
2c2d71f5 342
6eb5f6b9
JH
343/* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
346
cad2e5aa
JH
347char *
348Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
350{
97aff369 351 dVAR;
b7953727 352 register I32 start_shift = 0;
cad2e5aa 353 /* Should be nonnegative! */
b7953727 354 register I32 end_shift = 0;
2c2d71f5
JH
355 register char *s;
356 register SV *check;
a1933d95 357 char *strbeg;
cad2e5aa 358 char *t;
a3b680e6 359 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 360 I32 ml_anch;
bd61b366
SS
361 register char *other_last = NULL; /* other substr checked before this */
362 char *check_at = NULL; /* check substr found at this pos */
1df70142 363 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 364#ifdef DEBUGGING
890ce7af 365 const char * const i_strpos = strpos;
30944b6d 366#endif
a3621e74
YO
367
368 GET_RE_DEBUG_FLAGS_DECL;
369
a30b2f1f 370 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 371
b8d68ded 372 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
373 PL_reg_flags |= RF_utf8;
374 }
ab3bbdeb
YO
375 DEBUG_EXECUTE_r(
376 debug_start_match(prog, do_utf8, strpos, strend,
377 "Guessing start of match for");
2a782b5b 378 );
cad2e5aa 379
c344f387
JH
380 /* CHR_DIST() would be more correct here but it makes things slow. */
381 if (prog->minlen > strend - strpos) {
a3621e74 382 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 383 "String too short... [re_intuit_start]\n"));
cad2e5aa 384 goto fail;
2c2d71f5 385 }
a1933d95 386 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 387 PL_regeol = strend;
33b8afdf
JH
388 if (do_utf8) {
389 if (!prog->check_utf8 && prog->check_substr)
390 to_utf8_substr(prog);
391 check = prog->check_utf8;
392 } else {
393 if (!prog->check_substr && prog->check_utf8)
394 to_byte_substr(prog);
395 check = prog->check_substr;
396 }
397 if (check == &PL_sv_undef) {
a3621e74 398 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
399 "Non-utf string cannot match utf check string\n"));
400 goto fail;
401 }
2c2d71f5 402 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
403 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
404 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 405 && !multiline ) ); /* Check after \n? */
cad2e5aa 406
7e25d62c
JH
407 if (!ml_anch) {
408 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
409 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 410 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
411 && sv && !SvROK(sv)
412 && (strpos != strbeg)) {
a3621e74 413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
414 goto fail;
415 }
416 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 417 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 418 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
419 I32 slen;
420
1aa99e6b 421 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
422 if (SvTAIL(check)) {
423 slen = SvCUR(check); /* >= 1 */
cad2e5aa 424
9041c2e3 425 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 426 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 428 goto fail_finish;
cad2e5aa
JH
429 }
430 /* Now should match s[0..slen-2] */
431 slen--;
3f7c398e 432 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 433 || (slen > 1
3f7c398e 434 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 435 report_neq:
a3621e74 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
437 goto fail_finish;
438 }
cad2e5aa 439 }
3f7c398e 440 else if (*SvPVX_const(check) != *s
653099ff 441 || ((slen = SvCUR(check)) > 1
3f7c398e 442 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 443 goto report_neq;
c315bfe8 444 check_at = s;
2c2d71f5 445 goto success_at_start;
7e25d62c 446 }
cad2e5aa 447 }
2c2d71f5 448 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 449 s = strpos;
2c2d71f5 450 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 451 end_shift = prog->minlen - start_shift -
653099ff 452 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 453 if (!ml_anch) {
a3b680e6 454 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 455 - (SvTAIL(check) != 0);
a3b680e6 456 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
457
458 if (end_shift < eshift)
459 end_shift = eshift;
460 }
cad2e5aa 461 }
2c2d71f5 462 else { /* Can match at random position */
cad2e5aa
JH
463 ml_anch = 0;
464 s = strpos;
2c2d71f5
JH
465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
466 /* Should be nonnegative! */
467 end_shift = prog->minlen - start_shift -
653099ff 468 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
469 }
470
2c2d71f5 471#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 472 if (end_shift < 0)
6bbae5e6 473 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
474#endif
475
2c2d71f5
JH
476 restart:
477 /* Find a possible match in the region s..strend by looking for
478 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 479 if (flags & REXEC_SCREAM) {
cad2e5aa 480 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 481 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 482
2c2d71f5
JH
483 if (PL_screamfirst[BmRARE(check)] >= 0
484 || ( BmRARE(check) == '\n'
485 && (BmPREVIOUS(check) == SvCUR(check) - 1)
486 && SvTAIL(check) ))
9041c2e3 487 s = screaminstr(sv, check,
2c2d71f5 488 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 489 else
2c2d71f5 490 goto fail_finish;
4addbd3b
HS
491 /* we may be pointing at the wrong string */
492 if (s && RX_MATCH_COPIED(prog))
3f7c398e 493 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
494 if (data)
495 *data->scream_olds = s;
496 }
f33976b4 497 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
498 s = fbm_instr((U8*)(s + start_shift),
499 (U8*)(strend - end_shift),
7fba1cd6 500 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 501 else
1aa99e6b
IH
502 s = fbm_instr(HOP3(s, start_shift, strend),
503 HOP3(strend, -end_shift, strbeg),
7fba1cd6 504 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
505
506 /* Update the count-of-usability, remove useless subpatterns,
507 unshift s. */
2c2d71f5 508
ab3bbdeb
YO
509 DEBUG_EXECUTE_r({
510 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
511 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
512 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 513 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
514 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
515 ? "anchored" : "floating"),
516 quoted,
517 RE_SV_TAIL(check),
518 (s ? " at offset " : "...\n") );
519 });
2c2d71f5
JH
520
521 if (!s)
522 goto fail_finish;
523
6eb5f6b9
JH
524 check_at = s;
525
2c2d71f5 526 /* Finish the diagnostic message */
a3621e74 527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
528
529 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
530 Start with the other substr.
531 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 532 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
533 *always* match. Probably should be marked during compile...
534 Probably it is right to do no SCREAM here...
535 */
536
33b8afdf 537 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 538 /* Take into account the "other" substring. */
2c2d71f5
JH
539 /* XXXX May be hopelessly wrong for UTF... */
540 if (!other_last)
6eb5f6b9 541 other_last = strpos;
33b8afdf 542 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
543 do_other_anchored:
544 {
890ce7af
AL
545 char * const last = HOP3c(s, -start_shift, strbeg);
546 char *last1, *last2;
be8e71aa 547 char * const saved_s = s;
33b8afdf 548 SV* must;
2c2d71f5 549
2c2d71f5
JH
550 t = s - prog->check_offset_max;
551 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 552 && (!do_utf8
0ce71af7 553 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 554 && t > strpos)))
6f207bd3 555 NOOP;
2c2d71f5
JH
556 else
557 t = strpos;
1aa99e6b 558 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
559 if (t < other_last) /* These positions already checked */
560 t = other_last;
1aa99e6b 561 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
562 if (last < last1)
563 last1 = last;
564 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
565 /* On end-of-str: see comment below. */
33b8afdf
JH
566 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
567 if (must == &PL_sv_undef) {
568 s = (char*)NULL;
a3621e74 569 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
570 }
571 else
572 s = fbm_instr(
573 (unsigned char*)t,
574 HOP3(HOP3(last1, prog->anchored_offset, strend)
575 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
576 must,
7fba1cd6 577 multiline ? FBMrf_MULTILINE : 0
33b8afdf 578 );
ab3bbdeb
YO
579 DEBUG_EXECUTE_r({
580 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
581 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
582 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 583 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
584 quoted, RE_SV_TAIL(must));
585 });
586
587
2c2d71f5
JH
588 if (!s) {
589 if (last1 >= last2) {
a3621e74 590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
591 ", giving up...\n"));
592 goto fail_finish;
593 }
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 595 ", trying floating at offset %ld...\n",
be8e71aa 596 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
597 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
598 s = HOP3c(last, 1, strend);
2c2d71f5
JH
599 goto restart;
600 }
601 else {
a3621e74 602 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 603 (long)(s - i_strpos)));
1aa99e6b
IH
604 t = HOP3c(s, -prog->anchored_offset, strbeg);
605 other_last = HOP3c(s, 1, strend);
be8e71aa 606 s = saved_s;
2c2d71f5
JH
607 if (t == strpos)
608 goto try_at_start;
2c2d71f5
JH
609 goto try_at_offset;
610 }
30944b6d 611 }
2c2d71f5
JH
612 }
613 else { /* Take into account the floating substring. */
33b8afdf 614 char *last, *last1;
be8e71aa 615 char * const saved_s = s;
33b8afdf
JH
616 SV* must;
617
618 t = HOP3c(s, -start_shift, strbeg);
619 last1 = last =
620 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
621 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
622 last = HOP3c(t, prog->float_max_offset, strend);
623 s = HOP3c(t, prog->float_min_offset, strend);
624 if (s < other_last)
625 s = other_last;
2c2d71f5 626 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
627 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
628 /* fbm_instr() takes into account exact value of end-of-str
629 if the check is SvTAIL(ed). Since false positives are OK,
630 and end-of-str is not later than strend we are OK. */
631 if (must == &PL_sv_undef) {
632 s = (char*)NULL;
a3621e74 633 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
634 }
635 else
2c2d71f5 636 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
637 (unsigned char*)last + SvCUR(must)
638 - (SvTAIL(must)!=0),
7fba1cd6 639 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
640 DEBUG_EXECUTE_r({
641 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
642 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
643 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 644 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
645 quoted, RE_SV_TAIL(must));
646 });
33b8afdf
JH
647 if (!s) {
648 if (last1 == last) {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
650 ", giving up...\n"));
651 goto fail_finish;
2c2d71f5 652 }
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 654 ", trying anchored starting at offset %ld...\n",
be8e71aa 655 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
656 other_last = last;
657 s = HOP3c(t, 1, strend);
658 goto restart;
659 }
660 else {
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
662 (long)(s - i_strpos)));
663 other_last = s; /* Fix this later. --Hugo */
be8e71aa 664 s = saved_s;
33b8afdf
JH
665 if (t == strpos)
666 goto try_at_start;
667 goto try_at_offset;
668 }
2c2d71f5 669 }
cad2e5aa 670 }
2c2d71f5
JH
671
672 t = s - prog->check_offset_max;
2c2d71f5 673 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 674 && (!do_utf8
0ce71af7 675 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 676 && t > strpos))) {
2c2d71f5
JH
677 /* Fixed substring is found far enough so that the match
678 cannot start at strpos. */
679 try_at_offset:
cad2e5aa 680 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
681 /* Eventually fbm_*() should handle this, but often
682 anchored_offset is not 0, so this check will not be wasted. */
683 /* XXXX In the code below we prefer to look for "^" even in
684 presence of anchored substrings. And we search even
685 beyond the found float position. These pessimizations
686 are historical artefacts only. */
687 find_anchor:
2c2d71f5 688 while (t < strend - prog->minlen) {
cad2e5aa 689 if (*t == '\n') {
4ee3650e 690 if (t < check_at - prog->check_offset_min) {
33b8afdf 691 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
692 /* Since we moved from the found position,
693 we definitely contradict the found anchored
30944b6d
IZ
694 substr. Due to the above check we do not
695 contradict "check" substr.
696 Thus we can arrive here only if check substr
697 is float. Redo checking for "other"=="fixed".
698 */
9041c2e3 699 strpos = t + 1;
a3621e74 700 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 701 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
702 goto do_other_anchored;
703 }
4ee3650e
GS
704 /* We don't contradict the found floating substring. */
705 /* XXXX Why not check for STCLASS? */
cad2e5aa 706 s = t + 1;
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 708 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
709 goto set_useful;
710 }
4ee3650e
GS
711 /* Position contradicts check-string */
712 /* XXXX probably better to look for check-string
713 than for "\n", so one should lower the limit for t? */
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 715 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 716 other_last = strpos = s = t + 1;
cad2e5aa
JH
717 goto restart;
718 }
719 t++;
720 }
a3621e74 721 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 722 PL_colors[0], PL_colors[1]));
2c2d71f5 723 goto fail_finish;
cad2e5aa 724 }
f5952150 725 else {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 727 PL_colors[0], PL_colors[1]));
f5952150 728 }
cad2e5aa
JH
729 s = t;
730 set_useful:
33b8afdf 731 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
732 }
733 else {
f5952150 734 /* The found string does not prohibit matching at strpos,
2c2d71f5 735 - no optimization of calling REx engine can be performed,
f5952150
GS
736 unless it was an MBOL and we are not after MBOL,
737 or a future STCLASS check will fail this. */
2c2d71f5
JH
738 try_at_start:
739 /* Even in this situation we may use MBOL flag if strpos is offset
740 wrt the start of the string. */
05b4157f 741 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 742 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
743 /* May be due to an implicit anchor of m{.*foo} */
744 && !(prog->reganch & ROPT_IMPLICIT))
745 {
cad2e5aa
JH
746 t = strpos;
747 goto find_anchor;
748 }
a3621e74 749 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 750 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 751 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 752 );
2c2d71f5 753 success_at_start:
30944b6d 754 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
755 && (do_utf8 ? (
756 prog->check_utf8 /* Could be deleted already */
757 && --BmUSEFUL(prog->check_utf8) < 0
758 && (prog->check_utf8 == prog->float_utf8)
759 ) : (
760 prog->check_substr /* Could be deleted already */
761 && --BmUSEFUL(prog->check_substr) < 0
762 && (prog->check_substr == prog->float_substr)
763 )))
66e933ab 764 {
cad2e5aa 765 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
767 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
768 if (do_utf8 ? prog->check_substr : prog->check_utf8)
769 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
770 prog->check_substr = prog->check_utf8 = NULL; /* disable */
771 prog->float_substr = prog->float_utf8 = NULL; /* clear */
772 check = NULL; /* abort */
cad2e5aa 773 s = strpos;
3cf5c195
IZ
774 /* XXXX This is a remnant of the old implementation. It
775 looks wasteful, since now INTUIT can use many
6eb5f6b9 776 other heuristics. */
cad2e5aa
JH
777 prog->reganch &= ~RE_USE_INTUIT;
778 }
779 else
780 s = strpos;
781 }
782
6eb5f6b9
JH
783 /* Last resort... */
784 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
786e8c11 785 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
6eb5f6b9
JH
786 /* minlen == 0 is possible if regstclass is \b or \B,
787 and the fixed substr is ''$.
788 Since minlen is already taken into account, s+1 is before strend;
789 accidentally, minlen >= 1 guaranties no false positives at s + 1
790 even for \b or \B. But (minlen? 1 : 0) below assumes that
791 regstclass does not come from lookahead... */
792 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
793 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 794 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 795 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 796 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 797 : 1);
07be1b83 798 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 799 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 800 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
801 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
802 cl_l, strend)
803 : strend);
07be1b83
YO
804 /*if (OP(prog->regstclass) == TRIE)
805 endpos++;*/
6eb5f6b9 806 t = s;
3b0527fe 807 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
808 if (!s) {
809#ifdef DEBUGGING
cbbf8932 810 const char *what = NULL;
6eb5f6b9
JH
811#endif
812 if (endpos == strend) {
a3621e74 813 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
814 "Could not match STCLASS...\n") );
815 goto fail;
816 }
a3621e74 817 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 818 "This position contradicts STCLASS...\n") );
653099ff
GS
819 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
820 goto fail;
6eb5f6b9 821 /* Contradict one of substrings */
33b8afdf
JH
822 if (prog->anchored_substr || prog->anchored_utf8) {
823 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 824 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 825 hop_and_restart:
1aa99e6b 826 s = HOP3c(t, 1, strend);
66e933ab
GS
827 if (s + start_shift + end_shift > strend) {
828 /* XXXX Should be taken into account earlier? */
a3621e74 829 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
830 "Could not match STCLASS...\n") );
831 goto fail;
832 }
5e39e1e5
HS
833 if (!check)
834 goto giveup;
a3621e74 835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 836 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
837 what, (long)(s + start_shift - i_strpos)) );
838 goto restart;
839 }
66e933ab 840 /* Have both, check_string is floating */
6eb5f6b9
JH
841 if (t + start_shift >= check_at) /* Contradicts floating=check */
842 goto retry_floating_check;
843 /* Recheck anchored substring, but not floating... */
9041c2e3 844 s = check_at;
5e39e1e5
HS
845 if (!check)
846 goto giveup;
a3621e74 847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 848 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
849 (long)(other_last - i_strpos)) );
850 goto do_other_anchored;
851 }
60e71179
GS
852 /* Another way we could have checked stclass at the
853 current position only: */
854 if (ml_anch) {
855 s = t = t + 1;
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 860 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 861 goto try_at_offset;
66e933ab 862 }
33b8afdf 863 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 864 goto fail;
6eb5f6b9
JH
865 /* Check is floating subtring. */
866 retry_floating_check:
867 t = check_at - start_shift;
a3621e74 868 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
869 goto hop_and_restart;
870 }
b7953727 871 if (t != s) {
a3621e74 872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 873 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
874 (long)(t - i_strpos), (long)(s - i_strpos))
875 );
876 }
877 else {
a3621e74 878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
879 "Does not contradict STCLASS...\n");
880 );
881 }
6eb5f6b9 882 }
5e39e1e5 883 giveup:
a3621e74 884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
885 PL_colors[4], (check ? "Guessed" : "Giving up"),
886 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 887 return s;
2c2d71f5
JH
888
889 fail_finish: /* Substring not found */
33b8afdf
JH
890 if (prog->check_substr || prog->check_utf8) /* could be removed already */
891 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 892 fail:
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 894 PL_colors[4], PL_colors[5]));
bd61b366 895 return NULL;
cad2e5aa 896}
9661b544 897
786e8c11 898
3b0527fe 899
4cadc6a9
YO
900#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
901foldlen, foldbuf, uniflags) STMT_START { \
902 switch (trie_type) { \
903 case trie_utf8_fold: \
904 if ( foldlen>0 ) { \
905 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
906 foldlen -= len; \
907 uscan += len; \
908 len=0; \
909 } else { \
910 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
911 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
912 foldlen -= UNISKIP( uvc ); \
913 uscan = foldbuf + UNISKIP( uvc ); \
914 } \
915 break; \
916 case trie_utf8: \
917 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
918 break; \
919 case trie_plain: \
920 uvc = (UV)*uc; \
921 len = 1; \
922 } \
923 \
924 if (uvc < 256) { \
925 charid = trie->charmap[ uvc ]; \
926 } \
927 else { \
928 charid = 0; \
929 if (trie->widecharmap) { \
930 SV** const svpp = hv_fetch(trie->widecharmap, \
931 (char*)&uvc, sizeof(UV), 0); \
932 if (svpp) \
933 charid = (U16)SvIV(*svpp); \
934 } \
935 } \
936} STMT_END
937
938#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
939 if ( (CoNd) \
940 && (ln == len || \
941 ibcmp_utf8(s, NULL, 0, do_utf8, \
942 m, NULL, ln, (bool)UTF)) \
943 && (!reginfo || regtry(reginfo, s)) ) \
944 goto got_it; \
945 else { \
946 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
947 uvchr_to_utf8(tmpbuf, c); \
948 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
949 if ( f != c \
950 && (f == c1 || f == c2) \
951 && (ln == foldlen || \
952 !ibcmp_utf8((char *) foldbuf, \
953 NULL, foldlen, do_utf8, \
954 m, \
955 NULL, ln, (bool)UTF)) \
956 && (!reginfo || regtry(reginfo, s)) ) \
957 goto got_it; \
958 } \
959 s += len
960
961#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
962STMT_START { \
963 while (s <= e) { \
964 if ( (CoNd) \
965 && (ln == 1 || !(OP(c) == EXACTF \
966 ? ibcmp(s, m, ln) \
967 : ibcmp_locale(s, m, ln))) \
968 && (!reginfo || regtry(reginfo, s)) ) \
969 goto got_it; \
970 s++; \
971 } \
972} STMT_END
973
974#define REXEC_FBC_UTF8_SCAN(CoDe) \
975STMT_START { \
976 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
977 CoDe \
978 s += uskip; \
979 } \
980} STMT_END
981
982#define REXEC_FBC_SCAN(CoDe) \
983STMT_START { \
984 while (s < strend) { \
985 CoDe \
986 s++; \
987 } \
988} STMT_END
989
990#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
991REXEC_FBC_UTF8_SCAN( \
992 if (CoNd) { \
993 if (tmp && (!reginfo || regtry(reginfo, s))) \
994 goto got_it; \
995 else \
996 tmp = doevery; \
997 } \
998 else \
999 tmp = 1; \
1000)
1001
1002#define REXEC_FBC_CLASS_SCAN(CoNd) \
1003REXEC_FBC_SCAN( \
1004 if (CoNd) { \
1005 if (tmp && (!reginfo || regtry(reginfo, s))) \
1006 goto got_it; \
1007 else \
1008 tmp = doevery; \
1009 } \
1010 else \
1011 tmp = 1; \
1012)
1013
1014#define REXEC_FBC_TRYIT \
1015if ((!reginfo || regtry(reginfo, s))) \
1016 goto got_it
1017
1018#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1019 if (do_utf8) { \
1020 UtFpReLoAd; \
1021 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1022 } \
1023 else { \
1024 REXEC_FBC_CLASS_SCAN(CoNd); \
1025 } \
1026 break
1027
1028#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1029 PL_reg_flags |= RF_tainted; \
1030 if (do_utf8) { \
1031 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1032 } \
1033 else { \
1034 REXEC_FBC_CLASS_SCAN(CoNd); \
1035 } \
1036 break
1037
786e8c11
YO
1038#define DUMP_EXEC_POS(li,s,doutf8) \
1039 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1040
1041/* We know what class REx starts with. Try to find this position... */
1042/* if reginfo is NULL, its a dryrun */
1043/* annoyingly all the vars in this routine have different names from their counterparts
1044 in regmatch. /grrr */
1045
3c3eec57 1046STATIC char *
07be1b83
YO
1047S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1048 const char *strend, const regmatch_info *reginfo)
a687059c 1049{
27da23d5 1050 dVAR;
1df70142 1051 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1052 char *m;
d8093b23 1053 STRLEN ln;
5dab1207 1054 STRLEN lnc;
078c425b 1055 register STRLEN uskip;
d8093b23
G
1056 unsigned int c1;
1057 unsigned int c2;
6eb5f6b9
JH
1058 char *e;
1059 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1060 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1061
6eb5f6b9
JH
1062 /* We know what class it must start with. */
1063 switch (OP(c)) {
6eb5f6b9 1064 case ANYOF:
388cc4de 1065 if (do_utf8) {
4cadc6a9 1066 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1067 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1068 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1069 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1070 }
1071 else {
1072 while (s < strend) {
1073 STRLEN skip = 1;
1074
32fc9b6a 1075 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1076 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1077 /* The assignment of 2 is intentional:
1078 * for the folded sharp s, the skip is 2. */
1079 (skip = SHARP_S_SKIP))) {
3b0527fe 1080 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1081 goto got_it;
1082 else
1083 tmp = doevery;
1084 }
1085 else
1086 tmp = 1;
1087 s += skip;
1088 }
a0d0e21e 1089 }
6eb5f6b9 1090 break;
f33976b4 1091 case CANY:
4cadc6a9 1092 REXEC_FBC_SCAN(
3b0527fe 1093 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1094 goto got_it;
1095 else
1096 tmp = doevery;
4cadc6a9 1097 );
f33976b4 1098 break;
6eb5f6b9 1099 case EXACTF:
5dab1207
NIS
1100 m = STRING(c);
1101 ln = STR_LEN(c); /* length to match in octets/bytes */
1102 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1103 if (UTF) {
a2a2844f 1104 STRLEN ulen1, ulen2;
5dab1207 1105 U8 *sm = (U8 *) m;
89ebb4a3
JH
1106 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1107 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1108 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1109
1110 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1111 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1112
89ebb4a3 1113 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1114 0, uniflags);
89ebb4a3 1115 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1116 0, uniflags);
5dab1207
NIS
1117 lnc = 0;
1118 while (sm < ((U8 *) m + ln)) {
1119 lnc++;
1120 sm += UTF8SKIP(sm);
1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 c1 = *(U8*)m;
1125 c2 = PL_fold[c1];
1126 }
6eb5f6b9
JH
1127 goto do_exactf;
1128 case EXACTFL:
5dab1207
NIS
1129 m = STRING(c);
1130 ln = STR_LEN(c);
1131 lnc = (I32) ln;
d8093b23 1132 c1 = *(U8*)m;
6eb5f6b9
JH
1133 c2 = PL_fold_locale[c1];
1134 do_exactf:
db12adc6 1135 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1136
3b0527fe 1137 if (!reginfo && e < s)
6eb5f6b9 1138 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1139
60a8b682
JH
1140 /* The idea in the EXACTF* cases is to first find the
1141 * first character of the EXACTF* node and then, if
1142 * necessary, case-insensitively compare the full
1143 * text of the node. The c1 and c2 are the first
1144 * characters (though in Unicode it gets a bit
1145 * more complicated because there are more cases
7f16dd3d
JH
1146 * than just upper and lower: one needs to use
1147 * the so-called folding case for case-insensitive
1148 * matching (called "loose matching" in Unicode).
1149 * ibcmp_utf8() will do just that. */
60a8b682 1150
1aa99e6b 1151 if (do_utf8) {
575cac57 1152 UV c, f;
89ebb4a3 1153 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1154 STRLEN len, foldlen;
4ad0818d 1155 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1156 if (c1 == c2) {
5dab1207
NIS
1157 /* Upper and lower of 1st char are equal -
1158 * probably not a "letter". */
1aa99e6b 1159 while (s <= e) {
89ebb4a3 1160 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1161 uniflags);
4cadc6a9 1162 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1163 }
09091399
JH
1164 }
1165 else {
1aa99e6b 1166 while (s <= e) {
89ebb4a3 1167 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1168 uniflags);
80aecb99 1169
60a8b682 1170 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1171 * Note that not all the possible combinations
1172 * are handled here: some of them are handled
1173 * by the standard folding rules, and some of
1174 * them (the character class or ANYOF cases)
1175 * are handled during compiletime in
1176 * regexec.c:S_regclass(). */
880bd946
JH
1177 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1178 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1179 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1180
4cadc6a9 1181 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1182 }
09091399 1183 }
1aa99e6b
IH
1184 }
1185 else {
1186 if (c1 == c2)
4cadc6a9 1187 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1188 else
4cadc6a9 1189 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1190 }
1191 break;
bbce6d69 1192 case BOUNDL:
3280af22 1193 PL_reg_flags |= RF_tainted;
bbce6d69 1194 /* FALL THROUGH */
a0d0e21e 1195 case BOUND:
ffc61ed2 1196 if (do_utf8) {
12d33761 1197 if (s == PL_bostr)
ffc61ed2
JH
1198 tmp = '\n';
1199 else {
6136c704 1200 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1201 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1202 }
1203 tmp = ((OP(c) == BOUND ?
9041c2e3 1204 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1205 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1206 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1207 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1208 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1209 isALNUM_LC_utf8((U8*)s)))
1210 {
1211 tmp = !tmp;
4cadc6a9 1212 REXEC_FBC_TRYIT;
a687059c 1213 }
4cadc6a9 1214 );
a0d0e21e 1215 }
667bb95a 1216 else {
12d33761 1217 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1218 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1219 REXEC_FBC_SCAN(
ffc61ed2
JH
1220 if (tmp ==
1221 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1222 tmp = !tmp;
4cadc6a9 1223 REXEC_FBC_TRYIT;
a0ed51b3 1224 }
4cadc6a9 1225 );
a0ed51b3 1226 }
3b0527fe 1227 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1228 goto got_it;
1229 break;
bbce6d69 1230 case NBOUNDL:
3280af22 1231 PL_reg_flags |= RF_tainted;
bbce6d69 1232 /* FALL THROUGH */
a0d0e21e 1233 case NBOUND:
ffc61ed2 1234 if (do_utf8) {
12d33761 1235 if (s == PL_bostr)
ffc61ed2
JH
1236 tmp = '\n';
1237 else {
6136c704 1238 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1239 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1240 }
1241 tmp = ((OP(c) == NBOUND ?
9041c2e3 1242 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1243 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1244 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1245 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1246 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1247 isALNUM_LC_utf8((U8*)s)))
1248 tmp = !tmp;
4cadc6a9
YO
1249 else REXEC_FBC_TRYIT;
1250 );
a0d0e21e 1251 }
667bb95a 1252 else {
12d33761 1253 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1254 tmp = ((OP(c) == NBOUND ?
1255 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1256 REXEC_FBC_SCAN(
ffc61ed2
JH
1257 if (tmp ==
1258 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1259 tmp = !tmp;
4cadc6a9
YO
1260 else REXEC_FBC_TRYIT;
1261 );
a0ed51b3 1262 }
3b0527fe 1263 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1264 goto got_it;
1265 break;
a0d0e21e 1266 case ALNUM:
4cadc6a9
YO
1267 REXEC_FBC_CSCAN_PRELOAD(
1268 LOAD_UTF8_CHARCLASS_ALNUM(),
1269 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1270 isALNUM(*s)
1271 );
bbce6d69 1272 case ALNUML:
4cadc6a9
YO
1273 REXEC_FBC_CSCAN_TAINT(
1274 isALNUM_LC_utf8((U8*)s),
1275 isALNUM_LC(*s)
1276 );
a0d0e21e 1277 case NALNUM:
4cadc6a9
YO
1278 REXEC_FBC_CSCAN_PRELOAD(
1279 LOAD_UTF8_CHARCLASS_ALNUM(),
1280 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1281 !isALNUM(*s)
1282 );
bbce6d69 1283 case NALNUML:
4cadc6a9
YO
1284 REXEC_FBC_CSCAN_TAINT(
1285 !isALNUM_LC_utf8((U8*)s),
1286 !isALNUM_LC(*s)
1287 );
a0d0e21e 1288 case SPACE:
4cadc6a9
YO
1289 REXEC_FBC_CSCAN_PRELOAD(
1290 LOAD_UTF8_CHARCLASS_SPACE(),
1291 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1292 isSPACE(*s)
1293 );
bbce6d69 1294 case SPACEL:
4cadc6a9
YO
1295 REXEC_FBC_CSCAN_TAINT(
1296 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1297 isSPACE_LC(*s)
1298 );
a0d0e21e 1299 case NSPACE:
4cadc6a9
YO
1300 REXEC_FBC_CSCAN_PRELOAD(
1301 LOAD_UTF8_CHARCLASS_SPACE(),
1302 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1303 !isSPACE(*s)
1304 );
bbce6d69 1305 case NSPACEL:
4cadc6a9
YO
1306 REXEC_FBC_CSCAN_TAINT(
1307 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1308 !isSPACE_LC(*s)
1309 );
a0d0e21e 1310 case DIGIT:
4cadc6a9
YO
1311 REXEC_FBC_CSCAN_PRELOAD(
1312 LOAD_UTF8_CHARCLASS_DIGIT(),
1313 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1314 isDIGIT(*s)
1315 );
b8c5462f 1316 case DIGITL:
4cadc6a9
YO
1317 REXEC_FBC_CSCAN_TAINT(
1318 isDIGIT_LC_utf8((U8*)s),
1319 isDIGIT_LC(*s)
1320 );
a0d0e21e 1321 case NDIGIT:
4cadc6a9
YO
1322 REXEC_FBC_CSCAN_PRELOAD(
1323 LOAD_UTF8_CHARCLASS_DIGIT(),
1324 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1325 !isDIGIT(*s)
1326 );
b8c5462f 1327 case NDIGITL:
4cadc6a9
YO
1328 REXEC_FBC_CSCAN_TAINT(
1329 !isDIGIT_LC_utf8((U8*)s),
1330 !isDIGIT_LC(*s)
1331 );
786e8c11 1332 case TRIEC:
07be1b83 1333 case TRIE:
07be1b83
YO
1334 {
1335 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1336 trie_type = do_utf8 ?
1337 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1338 : trie_plain;
1339 /* what trie are we using right now */
1340 reg_ac_data *aho
1341 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1342 reg_trie_data *trie=aho->trie;
1343
1344 const char *last_start = strend - trie->minlen;
6148ee25 1345#ifdef DEBUGGING
07be1b83 1346 const char *real_start = s;
6148ee25 1347#endif
07be1b83 1348 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1349 SV *sv_points;
1350 U8 **points; /* map of where we were in the input string
786e8c11 1351 when reading a given char. For ASCII this
be8e71aa
YO
1352 is unnecessary overhead as the relationship
1353 is always 1:1, but for unicode, especially
1354 case folded unicode this is not true. */
f9e705e8 1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1356 U8 *bitmap=NULL;
1357
07be1b83
YO
1358
1359 GET_RE_DEBUG_FLAGS_DECL;
1360
be8e71aa
YO
1361 /* We can't just allocate points here. We need to wrap it in
1362 * an SV so it gets freed properly if there is a croak while
1363 * running the match */
1364 ENTER;
1365 SAVETMPS;
1366 sv_points=newSV(maxlen * sizeof(U8 *));
1367 SvCUR_set(sv_points,
1368 maxlen * sizeof(U8 *));
1369 SvPOK_on(sv_points);
1370 sv_2mortal(sv_points);
1371 points=(U8**)SvPV_nolen(sv_points );
786e8c11
YO
1372 if ( trie_type != trie_utf8_fold && (trie->bitmap || OP(c)==TRIEC) ) {
1373 if (trie->bitmap)
1374 bitmap=(U8*)trie->bitmap;
1375 else
1376 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1377 }
786e8c11
YO
1378 /* this is the Aho-Corasick algorithm modified a touch
1379 to include special handling for long "unknown char"
1380 sequences. The basic idea being that we use AC as long
1381 as we are dealing with a possible matching char, when
1382 we encounter an unknown char (and we have not encountered
1383 an accepting state) we scan forward until we find a legal
1384 starting char.
1385 AC matching is basically that of trie matching, except
1386 that when we encounter a failing transition, we fall back
1387 to the current states "fail state", and try the current char
1388 again, a process we repeat until we reach the root state,
1389 state 1, or a legal transition. If we fail on the root state
1390 then we can either terminate if we have reached an accepting
1391 state previously, or restart the entire process from the beginning
1392 if we have not.
1393
1394 */
07be1b83
YO
1395 while (s <= last_start) {
1396 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1397 U8 *uc = (U8*)s;
1398 U16 charid = 0;
1399 U32 base = 1;
1400 U32 state = 1;
1401 UV uvc = 0;
1402 STRLEN len = 0;
1403 STRLEN foldlen = 0;
1404 U8 *uscan = (U8*)NULL;
1405 U8 *leftmost = NULL;
786e8c11
YO
1406#ifdef DEBUGGING
1407 U32 accepted_word= 0;
1408#endif
07be1b83
YO
1409 U32 pointpos = 0;
1410
1411 while ( state && uc <= (U8*)strend ) {
1412 int failed=0;
786e8c11
YO
1413 U32 word = aho->states[ state ].wordnum;
1414
1415 if( state==1 && bitmap ) {
1416 DEBUG_TRIE_EXECUTE_r(
1417 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1418 dump_exec_pos( (char *)uc, c, strend, real_start,
85c3142d 1419 (char*)uc, do_utf8 );
786e8c11
YO
1420 PerlIO_printf( Perl_debug_log,
1421 " Scanning for legal start char...\n");
1422 }
1423 );
1424 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1425 uc++;
1426 }
1427 s= (char *)uc;
1428 if (uc >(U8*)last_start) break;
1429 }
1430
1431 if ( word ) {
1432 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1433 if (!leftmost || lpos < leftmost) {
1434 DEBUG_r(accepted_word=word);
07be1b83 1435 leftmost= lpos;
786e8c11 1436 }
07be1b83 1437 if (base==0) break;
786e8c11 1438
07be1b83
YO
1439 }
1440 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1441 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1442 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1443 DEBUG_TRIE_EXECUTE_r({
1444 dump_exec_pos( (char *)uc, c, strend, real_start,
1445 s, do_utf8 );
07be1b83 1446 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1447 " Charid:%3u CP:%4"UVxf" ",
1448 charid, uvc);
1449 });
07be1b83
YO
1450
1451 do {
6148ee25 1452#ifdef DEBUGGING
786e8c11 1453 word = aho->states[ state ].wordnum;
6148ee25 1454#endif
07be1b83
YO
1455 base = aho->states[ state ].trans.base;
1456
786e8c11
YO
1457 DEBUG_TRIE_EXECUTE_r({
1458 if (failed)
1459 dump_exec_pos( (char *)uc, c, strend, real_start,
1460 s, do_utf8 );
07be1b83 1461 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1462 "%sState: %4"UVxf", word=%"UVxf,
1463 failed ? " Fail transition to " : "",
1464 (UV)state, (UV)word);
1465 });
07be1b83
YO
1466 if ( base ) {
1467 U32 tmp;
1468 if (charid &&
1469 (base + charid > trie->uniquecharcount )
1470 && (base + charid - 1 - trie->uniquecharcount
1471 < trie->lasttrans)
1472 && trie->trans[base + charid - 1 -
1473 trie->uniquecharcount].check == state
1474 && (tmp=trie->trans[base + charid - 1 -
1475 trie->uniquecharcount ].next))
1476 {
786e8c11
YO
1477 DEBUG_TRIE_EXECUTE_r(
1478 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1479 state = tmp;
1480 break;
1481 }
1482 else {
786e8c11
YO
1483 DEBUG_TRIE_EXECUTE_r(
1484 PerlIO_printf( Perl_debug_log," - fail\n"));
1485 failed = 1;
1486 state = aho->fail[state];
07be1b83
YO
1487 }
1488 }
1489 else {
1490 /* we must be accepting here */
786e8c11
YO
1491 DEBUG_TRIE_EXECUTE_r(
1492 PerlIO_printf( Perl_debug_log," - accepting\n"));
1493 failed = 1;
07be1b83
YO
1494 break;
1495 }
1496 } while(state);
786e8c11 1497 uc += len;
07be1b83
YO
1498 if (failed) {
1499 if (leftmost)
1500 break;
786e8c11 1501 if (!state) state = 1;
07be1b83
YO
1502 }
1503 }
1504 if ( aho->states[ state ].wordnum ) {
1505 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1506 if (!leftmost || lpos < leftmost) {
1507 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1508 leftmost = lpos;
786e8c11 1509 }
07be1b83 1510 }
07be1b83
YO
1511 if (leftmost) {
1512 s = (char*)leftmost;
786e8c11
YO
1513 DEBUG_TRIE_EXECUTE_r({
1514 PerlIO_printf(
1515 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1516 (UV)accepted_word, s - real_start
1517 );
1518 });
be8e71aa
YO
1519 if (!reginfo || regtry(reginfo, s)) {
1520 FREETMPS;
1521 LEAVE;
07be1b83 1522 goto got_it;
be8e71aa 1523 }
07be1b83 1524 s = HOPc(s,1);
786e8c11
YO
1525 DEBUG_TRIE_EXECUTE_r({
1526 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1527 });
07be1b83 1528 } else {
786e8c11
YO
1529 DEBUG_TRIE_EXECUTE_r(
1530 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1531 break;
1532 }
1533 }
be8e71aa
YO
1534 FREETMPS;
1535 LEAVE;
07be1b83
YO
1536 }
1537 break;
b3c9acc1 1538 default:
3c3eec57
GS
1539 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1540 break;
d6a28714 1541 }
6eb5f6b9
JH
1542 return 0;
1543 got_it:
1544 return s;
1545}
1546
1547/*
1548 - regexec_flags - match a regexp against a string
1549 */
1550I32
1551Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1552 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1553/* strend: pointer to null at end of string */
1554/* strbeg: real beginning of string */
1555/* minend: end of match must be >=minend after stringarg. */
1556/* data: May be used for some additional optimizations. */
1557/* nosave: For optimizations. */
1558{
97aff369 1559 dVAR;
6eb5f6b9
JH
1560 register char *s;
1561 register regnode *c;
1562 register char *startpos = stringarg;
6eb5f6b9
JH
1563 I32 minlen; /* must match at least this many chars */
1564 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1565 I32 end_shift = 0; /* Same for the end. */ /* CC */
1566 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1567 char *scream_olds = NULL;
3dab1dad 1568 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1569 const bool do_utf8 = DO_UTF8(sv);
2757e526 1570 I32 multiline;
0df25f3d 1571
3b0527fe 1572 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1573
1574 GET_RE_DEBUG_FLAGS_DECL;
1575
9d4ba2ae 1576 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1577
1578 /* Be paranoid... */
1579 if (prog == NULL || startpos == NULL) {
1580 Perl_croak(aTHX_ "NULL regexp parameter");
1581 return 0;
1582 }
1583
2757e526 1584 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1585 reginfo.prog = prog;
2757e526 1586
bac06658
JH
1587 RX_MATCH_UTF8_set(prog, do_utf8);
1588
6eb5f6b9 1589 minlen = prog->minlen;
61a36c01 1590 if (strend - startpos < minlen) {
a3621e74 1591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1592 "String too short [regexec_flags]...\n"));
1593 goto phooey;
1aa99e6b 1594 }
6eb5f6b9 1595
6eb5f6b9
JH
1596 /* Check validity of program. */
1597 if (UCHARAT(prog->program) != REG_MAGIC) {
1598 Perl_croak(aTHX_ "corrupted regexp program");
1599 }
1600
1601 PL_reg_flags = 0;
1602 PL_reg_eval_set = 0;
1603 PL_reg_maxiter = 0;
1604
1605 if (prog->reganch & ROPT_UTF8)
1606 PL_reg_flags |= RF_utf8;
1607
1608 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1609 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1610 PL_bostr = strbeg;
3b0527fe 1611 reginfo.sv = sv;
6eb5f6b9
JH
1612
1613 /* Mark end of line for $ (and such) */
1614 PL_regeol = strend;
1615
1616 /* see how far we have to get to not match where we matched before */
3b0527fe 1617 reginfo.till = startpos+minend;
6eb5f6b9 1618
6eb5f6b9
JH
1619 /* If there is a "must appear" string, look for it. */
1620 s = startpos;
1621
3b0527fe 1622 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1623 MAGIC *mg;
1624
1625 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1626 reginfo.ganch = startpos;
6eb5f6b9
JH
1627 else if (sv && SvTYPE(sv) >= SVt_PVMG
1628 && SvMAGIC(sv)
14befaf4
DM
1629 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1630 && mg->mg_len >= 0) {
3b0527fe 1631 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1632 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1633 if (s > reginfo.ganch)
6eb5f6b9 1634 goto phooey;
3b0527fe 1635 s = reginfo.ganch;
6eb5f6b9
JH
1636 }
1637 }
1638 else /* pos() not defined */
3b0527fe 1639 reginfo.ganch = strbeg;
6eb5f6b9
JH
1640 }
1641
a0714e2c 1642 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1643 re_scream_pos_data d;
1644
1645 d.scream_olds = &scream_olds;
1646 d.scream_pos = &scream_pos;
1647 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1648 if (!s) {
a3621e74 1649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1650 goto phooey; /* not present */
3fa9c3d7 1651 }
6eb5f6b9
JH
1652 }
1653
ab3bbdeb
YO
1654 DEBUG_EXECUTE_r(
1655 debug_start_match(prog, do_utf8, startpos, strend,
1656 "Matching");
2a782b5b 1657 );
6eb5f6b9
JH
1658
1659 /* Simplest case: anchored match need be tried only once. */
1660 /* [unless only anchor is BOL and multiline is set] */
1661 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1662 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1663 goto got_it;
7fba1cd6 1664 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1665 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1666 {
1667 char *end;
1668
1669 if (minlen)
1670 dontbother = minlen - 1;
1aa99e6b 1671 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1672 /* for multiline we only have to try after newlines */
33b8afdf 1673 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1674 if (s == startpos)
1675 goto after_try;
1676 while (1) {
3b0527fe 1677 if (regtry(&reginfo, s))
6eb5f6b9
JH
1678 goto got_it;
1679 after_try:
1680 if (s >= end)
1681 goto phooey;
1682 if (prog->reganch & RE_USE_INTUIT) {
1683 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1684 if (!s)
1685 goto phooey;
1686 }
1687 else
1688 s++;
1689 }
1690 } else {
1691 if (s > startpos)
1692 s--;
1693 while (s < end) {
1694 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1695 if (regtry(&reginfo, s))
6eb5f6b9
JH
1696 goto got_it;
1697 }
1698 }
1699 }
1700 }
1701 goto phooey;
1702 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1703 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1704 goto got_it;
1705 goto phooey;
1706 }
1707
1708 /* Messy cases: unanchored match. */
33b8afdf 1709 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1710 /* we have /x+whatever/ */
1711 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1712 char ch;
bf93d4cc
GS
1713#ifdef DEBUGGING
1714 int did_match = 0;
1715#endif
33b8afdf
JH
1716 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1717 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1718 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1719
1aa99e6b 1720 if (do_utf8) {
4cadc6a9 1721 REXEC_FBC_SCAN(
6eb5f6b9 1722 if (*s == ch) {
a3621e74 1723 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1724 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1725 s += UTF8SKIP(s);
1726 while (s < strend && *s == ch)
1727 s += UTF8SKIP(s);
1728 }
4cadc6a9 1729 );
6eb5f6b9
JH
1730 }
1731 else {
4cadc6a9 1732 REXEC_FBC_SCAN(
6eb5f6b9 1733 if (*s == ch) {
a3621e74 1734 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1735 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1736 s++;
1737 while (s < strend && *s == ch)
1738 s++;
1739 }
4cadc6a9 1740 );
6eb5f6b9 1741 }
a3621e74 1742 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1743 PerlIO_printf(Perl_debug_log,
b7953727
JH
1744 "Did not find anchored character...\n")
1745 );
6eb5f6b9 1746 }
a0714e2c
SS
1747 else if (prog->anchored_substr != NULL
1748 || prog->anchored_utf8 != NULL
1749 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1750 && prog->float_max_offset < strend - s)) {
1751 SV *must;
1752 I32 back_max;
1753 I32 back_min;
1754 char *last;
6eb5f6b9 1755 char *last1; /* Last position checked before */
bf93d4cc
GS
1756#ifdef DEBUGGING
1757 int did_match = 0;
1758#endif
33b8afdf
JH
1759 if (prog->anchored_substr || prog->anchored_utf8) {
1760 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1761 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1762 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1763 back_max = back_min = prog->anchored_offset;
1764 } else {
1765 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1766 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1767 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1768 back_max = prog->float_max_offset;
1769 back_min = prog->float_min_offset;
1770 }
1771 if (must == &PL_sv_undef)
1772 /* could not downgrade utf8 check substring, so must fail */
1773 goto phooey;
1774
1775 last = HOP3c(strend, /* Cannot start after this */
1776 -(I32)(CHR_SVLEN(must)
1777 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1778
1779 if (s > PL_bostr)
1780 last1 = HOPc(s, -1);
1781 else
1782 last1 = s - 1; /* bogus */
1783
a0288114 1784 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1785 check_substr==must. */
1786 scream_pos = -1;
1787 dontbother = end_shift;
1788 strend = HOPc(strend, -dontbother);
1789 while ( (s <= last) &&
9041c2e3 1790 ((flags & REXEC_SCREAM)
1aa99e6b 1791 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1792 end_shift, &scream_pos, 0))
1aa99e6b 1793 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1794 (unsigned char*)strend, must,
7fba1cd6 1795 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1796 /* we may be pointing at the wrong string */
1797 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1798 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1799 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1800 if (HOPc(s, -back_max) > last1) {
1801 last1 = HOPc(s, -back_min);
1802 s = HOPc(s, -back_max);
1803 }
1804 else {
52657f30 1805 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1806
1807 last1 = HOPc(s, -back_min);
52657f30 1808 s = t;
6eb5f6b9 1809 }
1aa99e6b 1810 if (do_utf8) {
6eb5f6b9 1811 while (s <= last1) {
3b0527fe 1812 if (regtry(&reginfo, s))
6eb5f6b9
JH
1813 goto got_it;
1814 s += UTF8SKIP(s);
1815 }
1816 }
1817 else {
1818 while (s <= last1) {
3b0527fe 1819 if (regtry(&reginfo, s))
6eb5f6b9
JH
1820 goto got_it;
1821 s++;
1822 }
1823 }
1824 }
ab3bbdeb
YO
1825 DEBUG_EXECUTE_r(if (!did_match) {
1826 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1827 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1828 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1829 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1830 ? "anchored" : "floating"),
ab3bbdeb
YO
1831 quoted, RE_SV_TAIL(must));
1832 });
6eb5f6b9
JH
1833 goto phooey;
1834 }
786e8c11 1835 else if ( (c = prog->regstclass) ) {
f14c76ed 1836 if (minlen) {
be8e71aa 1837 const OPCODE op = OP(prog->regstclass);
66e933ab 1838 /* don't bother with what can't match */
786e8c11 1839 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1840 strend = HOPc(strend, -(minlen - 1));
1841 }
a3621e74 1842 DEBUG_EXECUTE_r({
be8e71aa 1843 SV * const prop = sv_newmortal();
32fc9b6a 1844 regprop(prog, prop, c);
0df25f3d 1845 {
ab3bbdeb
YO
1846 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1847 s,strend-s,60);
0df25f3d 1848 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1849 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1850 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1851 quoted, (int)(strend - s));
0df25f3d 1852 }
ffc61ed2 1853 });
3b0527fe 1854 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1855 goto got_it;
07be1b83 1856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1857 }
1858 else {
1859 dontbother = 0;
a0714e2c 1860 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1861 /* Trim the end. */
d6a28714 1862 char *last;
33b8afdf
JH
1863 SV* float_real;
1864
1865 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1868
1869 if (flags & REXEC_SCREAM) {
33b8afdf 1870 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1871 end_shift, &scream_pos, 1); /* last one */
1872 if (!last)
ffc61ed2 1873 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1874 /* we may be pointing at the wrong string */
1875 else if (RX_MATCH_COPIED(prog))
3f7c398e 1876 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1877 }
d6a28714
JH
1878 else {
1879 STRLEN len;
cfd0369c 1880 const char * const little = SvPV_const(float_real, len);
d6a28714 1881
33b8afdf 1882 if (SvTAIL(float_real)) {
d6a28714
JH
1883 if (memEQ(strend - len + 1, little, len - 1))
1884 last = strend - len + 1;
7fba1cd6 1885 else if (!multiline)
9041c2e3 1886 last = memEQ(strend - len, little, len)
bd61b366 1887 ? strend - len : NULL;
b8c5462f 1888 else
d6a28714
JH
1889 goto find_last;
1890 } else {
1891 find_last:
9041c2e3 1892 if (len)
d6a28714 1893 last = rninstr(s, strend, little, little + len);
b8c5462f 1894 else
a0288114 1895 last = strend; /* matching "$" */
b8c5462f 1896 }
b8c5462f 1897 }
bf93d4cc 1898 if (last == NULL) {
a3621e74 1899 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1900 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1901 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1902 goto phooey; /* Should not happen! */
1903 }
d6a28714
JH
1904 dontbother = strend - last + prog->float_min_offset;
1905 }
1906 if (minlen && (dontbother < minlen))
1907 dontbother = minlen - 1;
1908 strend -= dontbother; /* this one's always in bytes! */
1909 /* We don't know much -- general case. */
1aa99e6b 1910 if (do_utf8) {
d6a28714 1911 for (;;) {
3b0527fe 1912 if (regtry(&reginfo, s))
d6a28714
JH
1913 goto got_it;
1914 if (s >= strend)
1915 break;
b8c5462f 1916 s += UTF8SKIP(s);
d6a28714
JH
1917 };
1918 }
1919 else {
1920 do {
3b0527fe 1921 if (regtry(&reginfo, s))
d6a28714
JH
1922 goto got_it;
1923 } while (s++ < strend);
1924 }
1925 }
1926
1927 /* Failure. */
1928 goto phooey;
1929
1930got_it:
1931 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1932
1933 if (PL_reg_eval_set) {
1934 /* Preserve the current value of $^R */
1935 if (oreplsv != GvSV(PL_replgv))
1936 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1937 restored, the value remains
1938 the same. */
4f639d21 1939 restore_pos(aTHX_ prog);
d6a28714
JH
1940 }
1941
1942 /* make sure $`, $&, $', and $digit will work later */
1943 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 1944 RX_MATCH_COPY_FREE(prog);
d6a28714 1945 if (flags & REXEC_COPY_STR) {
be8e71aa 1946 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 1947#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
1948 if ((SvIsCOW(sv)
1949 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1950 if (DEBUG_C_TEST) {
1951 PerlIO_printf(Perl_debug_log,
1952 "Copy on write: regexp capture, type %d\n",
1953 (int) SvTYPE(sv));
1954 }
1955 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 1956 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
1957 assert (SvPOKp(prog->saved_copy));
1958 } else
1959#endif
1960 {
1961 RX_MATCH_COPIED_on(prog);
1962 s = savepvn(strbeg, i);
1963 prog->subbeg = s;
1964 }
d6a28714 1965 prog->sublen = i;
d6a28714
JH
1966 }
1967 else {
1968 prog->subbeg = strbeg;
1969 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1970 }
1971 }
9041c2e3 1972
d6a28714
JH
1973 return 1;
1974
1975phooey:
a3621e74 1976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 1977 PL_colors[4], PL_colors[5]));
d6a28714 1978 if (PL_reg_eval_set)
4f639d21 1979 restore_pos(aTHX_ prog);
d6a28714
JH
1980 return 0;
1981}
1982
1983/*
1984 - regtry - try match at specific point
1985 */
1986STATIC I32 /* 0 failure, 1 success */
3b0527fe 1987S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 1988{
97aff369 1989 dVAR;
d6a28714
JH
1990 register I32 *sp;
1991 register I32 *ep;
1992 CHECKPOINT lastcp;
3b0527fe 1993 regexp *prog = reginfo->prog;
a3621e74 1994 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 1995
02db2b7b
IZ
1996#ifdef DEBUGGING
1997 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1998#endif
d6a28714
JH
1999 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2000 MAGIC *mg;
2001
2002 PL_reg_eval_set = RS_init;
a3621e74 2003 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2004 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2005 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2006 ));
e8347627 2007 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2008 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2009 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2010 SAVETMPS;
2011 /* Apparently this is not needed, judging by wantarray. */
e8347627 2012 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2013 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2014
3b0527fe 2015 if (reginfo->sv) {
d6a28714 2016 /* Make $_ available to executed code. */
3b0527fe 2017 if (reginfo->sv != DEFSV) {
59f00321 2018 SAVE_DEFSV;
3b0527fe 2019 DEFSV = reginfo->sv;
b8c5462f 2020 }
d6a28714 2021
3b0527fe
DM
2022 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2023 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2024 /* prepare for quick setting of pos */
d300d9fa
NC
2025#ifdef PERL_OLD_COPY_ON_WRITE
2026 if (SvIsCOW(sv))
2027 sv_force_normal_flags(sv, 0);
2028#endif
3dab1dad 2029 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2030 &PL_vtbl_mglob, NULL, 0);
d6a28714 2031 mg->mg_len = -1;
b8c5462f 2032 }
d6a28714
JH
2033 PL_reg_magic = mg;
2034 PL_reg_oldpos = mg->mg_len;
4f639d21 2035 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2036 }
09687e5a 2037 if (!PL_reg_curpm) {
a02a5408 2038 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2039#ifdef USE_ITHREADS
2040 {
be8e71aa 2041 SV* const repointer = newSViv(0);
577e12cc 2042 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2043 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2044 av_push(PL_regex_padav,repointer);
2045 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2046 PL_regex_pad = AvARRAY(PL_regex_padav);
2047 }
2048#endif
2049 }
aaa362c4 2050 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2051 PL_reg_oldcurpm = PL_curpm;
2052 PL_curpm = PL_reg_curpm;
2053 if (RX_MATCH_COPIED(prog)) {
2054 /* Here is a serious problem: we cannot rewrite subbeg,
2055 since it may be needed if this match fails. Thus
2056 $` inside (?{}) could fail... */
2057 PL_reg_oldsaved = prog->subbeg;
2058 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2059#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2060 PL_nrs = prog->saved_copy;
2061#endif
d6a28714
JH
2062 RX_MATCH_COPIED_off(prog);
2063 }
2064 else
bd61b366 2065 PL_reg_oldsaved = NULL;
d6a28714
JH
2066 prog->subbeg = PL_bostr;
2067 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2068 }
973dddac 2069 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2070 PL_reginput = startpos;
2071 PL_regstartp = prog->startp;
2072 PL_regendp = prog->endp;
2073 PL_reglastparen = &prog->lastparen;
a01268b5 2074 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2075 prog->lastparen = 0;
03994de8 2076 prog->lastcloseparen = 0;
d6a28714 2077 PL_regsize = 0;
a3621e74 2078 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2079 if (PL_reg_start_tmpl <= prog->nparens) {
2080 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2081 if(PL_reg_start_tmp)
2082 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2083 else
a02a5408 2084 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2085 }
2086
2087 /* XXXX What this code is doing here?!!! There should be no need
2088 to do this again and again, PL_reglastparen should take care of
3dd2943c 2089 this! --ilya*/
dafc8851
JH
2090
2091 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2092 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2093 * PL_reglastparen), is not needed at all by the test suite
2094 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2095 * enough, for building DynaLoader, or otherwise this
2096 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2097 * will happen. Meanwhile, this code *is* needed for the
2098 * above-mentioned test suite tests to succeed. The common theme
2099 * on those tests seems to be returning null fields from matches.
2100 * --jhi */
dafc8851 2101#if 1
d6a28714
JH
2102 sp = prog->startp;
2103 ep = prog->endp;
2104 if (prog->nparens) {
097eb12c 2105 register I32 i;
eb160463 2106 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2107 *++sp = -1;
2108 *++ep = -1;
2109 }
2110 }
dafc8851 2111#endif
02db2b7b 2112 REGCP_SET(lastcp);
3b0527fe 2113 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2114 prog->endp[0] = PL_reginput - PL_bostr;
2115 return 1;
2116 }
02db2b7b 2117 REGCP_UNWIND(lastcp);
d6a28714
JH
2118 return 0;
2119}
2120
02db2b7b 2121
8ba1375e
MJD
2122#define sayYES goto yes
2123#define sayNO goto no
e0f9d4a8 2124#define sayNO_ANYOF goto no_anyof
8ba1375e 2125#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2126#define sayNO_FINAL goto no_final
2127#define sayNO_SILENT goto do_no
2128#define saySAME(x) if (x) goto yes; else goto no
2129
3ab3c9b4 2130#define CACHEsayNO STMT_START { \
3298f257
DM
2131 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2132 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2133 (1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2134 sayNO; \
2135} STMT_END
2136
3298f257 2137
a3621e74
YO
2138/* this is used to determine how far from the left messages like
2139 'failed...' are printed. Currently 29 makes these messages line
2140 up with the opcode they refer to. Earlier perls used 25 which
2141 left these messages outdented making reviewing a debug output
2142 quite difficult.
2143*/
2144#define REPORT_CODE_OFF 29
2145
2146
2147/* Make sure there is a test for this +1 options in re_tests */
2148#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2149
40a82448
DM
2150#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2151#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2152
86545054
DM
2153#define SLAB_FIRST(s) (&(s)->states[0])
2154#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2155
5d9a96ca
DM
2156/* grab a new slab and return the first slot in it */
2157
2158STATIC regmatch_state *
2159S_push_slab(pTHX)
2160{
54df2634
NC
2161#if PERL_VERSION < 9
2162 dMY_CXT;
2163#endif
5d9a96ca
DM
2164 regmatch_slab *s = PL_regmatch_slab->next;
2165 if (!s) {
2166 Newx(s, 1, regmatch_slab);
2167 s->prev = PL_regmatch_slab;
2168 s->next = NULL;
2169 PL_regmatch_slab->next = s;
2170 }
2171 PL_regmatch_slab = s;
86545054 2172 return SLAB_FIRST(s);
5d9a96ca 2173}
5b47454d 2174
95b24440
DM
2175/* simulate a recursive call to regmatch */
2176
2177#define REGMATCH(ns, where) \
5d9a96ca
DM
2178 st->scan = scan; \
2179 scan = (ns); \
2180 st->resume_state = resume_##where; \
95b24440
DM
2181 goto start_recurse; \
2182 resume_point_##where:
2183
40a82448
DM
2184/* push a new state then goto it */
2185
2186#define PUSH_STATE_GOTO(state, node) \
2187 scan = node; \
2188 st->resume_state = state; \
2189 goto push_state;
2190
2191/* push a new state with success backtracking, then goto it */
2192
2193#define PUSH_YES_STATE_GOTO(state, node) \
2194 scan = node; \
2195 st->resume_state = state; \
2196 goto push_yes_state;
2197
aa283a38 2198
aa283a38 2199
d6a28714
JH
2200/*
2201 - regmatch - main matching routine
2202 *
2203 * Conceptually the strategy is simple: check to see whether the current
2204 * node matches, call self recursively to see whether the rest matches,
2205 * and then act accordingly. In practice we make some effort to avoid
2206 * recursion, in particular by going through "ordinary" nodes (that don't
2207 * need to know whether the rest of the match failed) by a loop instead of
2208 * by recursion.
2209 */
2210/* [lwall] I've hoisted the register declarations to the outer block in order to
2211 * maybe save a little bit of pushing and popping on the stack. It also takes
2212 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2213 *
2214 * This function used to be heavily recursive, but since this had the
2215 * effect of blowing the CPU stack on complex regexes, it has been
2216 * restructured to be iterative, and to save state onto the heap rather
2217 * than the stack. Essentially whereever regmatch() used to be called, it
2218 * pushes the current state, notes where to return, then jumps back into
2219 * the main loop.
2220 *
2221 * Originally the structure of this function used to look something like
2222
2223 S_regmatch() {
2224 int a = 1, b = 2;
2225 ...
2226 while (scan != NULL) {
5d9a96ca 2227 a++; // do stuff with a and b
95b24440
DM
2228 ...
2229 switch (OP(scan)) {
2230 case FOO: {
2231 int local = 3;
2232 ...
2233 if (regmatch(...)) // recurse
2234 goto yes;
2235 }
2236 ...
2237 }
2238 }
2239 yes:
2240 return 1;
2241 }
2242
2243 * Now it looks something like this:
2244
5d9a96ca 2245 typedef struct {
95b24440
DM
2246 int a, b, local;
2247 int resume_state;
5d9a96ca 2248 } regmatch_state;
95b24440
DM
2249
2250 S_regmatch() {
5d9a96ca
DM
2251 regmatch_state *st = new();
2252 int depth=0;
2253 st->a++; // do stuff with a and b
95b24440
DM
2254 ...
2255 while (scan != NULL) {
2256 ...
2257 switch (OP(scan)) {
2258 case FOO: {
5d9a96ca 2259 st->local = 3;
95b24440 2260 ...
5d9a96ca
DM
2261 st->scan = scan;
2262 scan = ...;
2263 st->resume_state = resume_FOO;
2264 goto start_recurse; // recurse
95b24440 2265
5d9a96ca
DM
2266 resume_point_FOO:
2267 if (result)
95b24440
DM
2268 goto yes;
2269 }
2270 ...
2271 }
5d9a96ca
DM
2272 start_recurse:
2273 st = new(); push a new state
2274 st->a = 1; st->b = 2;
2275 depth++;
95b24440 2276 }
5d9a96ca 2277 yes:
95b24440 2278 result = 1;
5d9a96ca
DM
2279 if (depth--) {
2280 st = pop();
95b24440
DM
2281 switch (resume_state) {
2282 case resume_FOO:
2283 goto resume_point_FOO;
2284 ...
2285 }
2286 }
2287 return result
2288 }
2289
2290 * WARNING: this means that any line in this function that contains a
2291 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2292 * regmatch() using gotos instead. Thus the values of any local variables
2293 * not saved in the regmatch_state structure will have been lost when
2294 * execution resumes on the next line .
5d9a96ca
DM
2295 *
2296 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2297 * PL_regmatch_state always points to the currently active state, and
2298 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2299 * The first time regmatch is called, the first slab is allocated, and is
2300 * never freed until interpreter desctruction. When the slab is full,
2301 * a new one is allocated chained to the end. At exit from regmatch, slabs
2302 * allocated since entry are freed.
d6a28714 2303 */
95b24440 2304
40a82448 2305/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2306#define TRIE_next (REGNODE_MAX+1)
2307#define TRIE_next_fail (REGNODE_MAX+2)
2308#define EVAL_A (REGNODE_MAX+3)
2309#define EVAL_A_fail (REGNODE_MAX+4)
2310#define resume_CURLYX (REGNODE_MAX+5)
2311#define resume_WHILEM1 (REGNODE_MAX+6)
2312#define resume_WHILEM2 (REGNODE_MAX+7)
2313#define resume_WHILEM3 (REGNODE_MAX+8)
2314#define resume_WHILEM4 (REGNODE_MAX+9)
2315#define resume_WHILEM5 (REGNODE_MAX+10)
2316#define resume_WHILEM6 (REGNODE_MAX+11)
2317#define BRANCH_next (REGNODE_MAX+12)
2318#define BRANCH_next_fail (REGNODE_MAX+13)
2319#define CURLYM_A (REGNODE_MAX+14)
2320#define CURLYM_A_fail (REGNODE_MAX+15)
2321#define CURLYM_B (REGNODE_MAX+16)
2322#define CURLYM_B_fail (REGNODE_MAX+17)
2323#define IFMATCH_A (REGNODE_MAX+18)
2324#define IFMATCH_A_fail (REGNODE_MAX+19)
2325#define CURLY_B_min_known (REGNODE_MAX+20)
2326#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2327#define CURLY_B_min (REGNODE_MAX+22)
2328#define CURLY_B_min_fail (REGNODE_MAX+23)
2329#define CURLY_B_max (REGNODE_MAX+24)
2330#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2331
2332
3dab1dad 2333#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2334
3df15adc 2335#ifdef DEBUGGING
ab3bbdeb
YO
2336STATIC void
2337S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2338 const char *start, const char *end, const char *blurb)
2339{
2340 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2341 if (!PL_colorset)
2342 reginitcolors();
2343 {
2344 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2345 prog->precomp, prog->prelen, 60);
2346
2347 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2348 start, end - start, 60);
2349
2350 PerlIO_printf(Perl_debug_log,
2351 "%s%s REx%s %s against %s\n",
2352 PL_colors[4], blurb, PL_colors[5], s0, s1);
2353
2354 if (do_utf8||utf8_pat)
2355 PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
2356 !do_utf8 ? "pattern" : !utf8_pat ? "string" :
2357 "pattern and string"
2358 );
2359 }
2360}
3df15adc
YO
2361
2362STATIC void
786e8c11
YO
2363S_dump_exec_pos(pTHX_ const char *locinput,
2364 const regnode *scan,
2365 const char *loc_regeol,
2366 const char *loc_bostr,
2367 const char *loc_reg_starttry,
2368 const bool do_utf8)
07be1b83 2369{
786e8c11 2370 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2371 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2372 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2373 /* The part of the string before starttry has one color
2374 (pref0_len chars), between starttry and current
2375 position another one (pref_len - pref0_len chars),
2376 after the current position the third one.
2377 We assume that pref0_len <= pref_len, otherwise we
2378 decrease pref0_len. */
786e8c11
YO
2379 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2380 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2381 int pref0_len;
2382
2383 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2384 pref_len++;
786e8c11
YO
2385 pref0_len = pref_len - (locinput - loc_reg_starttry);
2386 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2387 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2388 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2389 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2390 l--;
2391 if (pref0_len < 0)
2392 pref0_len = 0;
2393 if (pref0_len > pref_len)
2394 pref0_len = pref_len;
2395 {
3df15adc 2396 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2397
ab3bbdeb 2398 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
786e8c11 2399 (locinput - pref_len),pref0_len, pref0_len, 4, 5);
0df25f3d 2400
ab3bbdeb 2401 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2402 (locinput - pref_len + pref0_len),
786e8c11 2403 pref_len - pref0_len, pref_len - pref0_len, 2, 3);
0df25f3d 2404
ab3bbdeb 2405 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
786e8c11 2406 locinput, loc_regeol - locinput, l, 0, 1);
0df25f3d 2407
3df15adc 2408 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2409 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2410 (IV)(locinput - loc_bostr),
07be1b83 2411 len0, s0,
07be1b83 2412 len1, s1,
07be1b83 2413 (docolor ? "" : "> <"),
07be1b83 2414 len2, s2,
07be1b83
YO
2415 15 - l - pref_len + 1,
2416 "");
2417 }
2418}
3df15adc 2419
07be1b83
YO
2420#endif
2421
d6a28714 2422STATIC I32 /* 0 failure, 1 success */
3b0527fe 2423S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2424{
54df2634
NC
2425#if PERL_VERSION < 9
2426 dMY_CXT;
2427#endif
27da23d5 2428 dVAR;
95b24440 2429 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2430 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2431
3b0527fe
DM
2432 regexp *rex = reginfo->prog;
2433
5d9a96ca
DM
2434 regmatch_slab *orig_slab;
2435 regmatch_state *orig_state;
a3621e74 2436
5d9a96ca
DM
2437 /* the current state. This is a cached copy of PL_regmatch_state */
2438 register regmatch_state *st;
95b24440 2439
5d9a96ca
DM
2440 /* cache heavy used fields of st in registers */
2441 register regnode *scan;
2442 register regnode *next;
2443 register I32 n = 0; /* initialize to shut up compiler warning */
2444 register char *locinput = PL_reginput;
95b24440 2445
5d9a96ca
DM
2446 /* these variables are NOT saved during a recusive RFEGMATCH: */
2447 register I32 nextchr; /* is always set to UCHARAT(locinput) */
b69b0499 2448 bool result = 0; /* return value of S_regmatch */
5d9a96ca 2449 int depth = 0; /* depth of recursion */
77cb431f
DM
2450 regmatch_state *yes_state = NULL; /* state to pop to on success of
2451 subpattern */
40a82448 2452 U32 state_num;
95b24440 2453
10edeb5d
JH
2454 I32 parenfloor = 0;
2455
95b24440 2456#ifdef DEBUGGING
e68ec53f 2457 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2458 PL_regindent++;
2459#endif
2460
5d9a96ca
DM
2461 /* on first ever call to regmatch, allocate first slab */
2462 if (!PL_regmatch_slab) {
2463 Newx(PL_regmatch_slab, 1, regmatch_slab);
2464 PL_regmatch_slab->prev = NULL;
2465 PL_regmatch_slab->next = NULL;
86545054 2466 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2467 }
2468
2469 /* remember current high-water mark for exit */
2470 /* XXX this should be done with SAVE* instead */
2471 orig_slab = PL_regmatch_slab;
2472 orig_state = PL_regmatch_state;
2473
2474 /* grab next free state slot */
2475 st = ++PL_regmatch_state;
86545054 2476 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2477 st = PL_regmatch_state = S_push_slab(aTHX);
2478
2479 st->minmod = 0;
2480 st->sw = 0;
2481 st->logical = 0;
5d9a96ca 2482 st->cc = NULL;
786e8c11 2483
d6a28714
JH
2484 /* Note that nextchr is a byte even in UTF */
2485 nextchr = UCHARAT(locinput);
2486 scan = prog;
2487 while (scan != NULL) {
8ba1375e 2488
a3621e74 2489 DEBUG_EXECUTE_r( {
6136c704 2490 SV * const prop = sv_newmortal();
786e8c11 2491 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2492 regprop(rex, prop, scan);
07be1b83
YO
2493
2494 PerlIO_printf(Perl_debug_log,
2495 "%3"IVdf":%*s%s(%"IVdf")\n",
2496 (IV)(scan - rex->program), PL_regindent*2, "",
2497 SvPVX_const(prop),
2498 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2499 });
d6a28714
JH
2500
2501 next = scan + NEXT_OFF(scan);
2502 if (next == scan)
2503 next = NULL;
40a82448 2504 state_num = OP(scan);
d6a28714 2505
40a82448
DM
2506 reenter_switch:
2507 switch (state_num) {
d6a28714 2508 case BOL:
7fba1cd6 2509 if (locinput == PL_bostr)
d6a28714 2510 {
3b0527fe 2511 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2512 break;
2513 }
d6a28714
JH
2514 sayNO;
2515 case MBOL:
12d33761
HS
2516 if (locinput == PL_bostr ||
2517 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2518 {
b8c5462f
JH
2519 break;
2520 }
d6a28714
JH
2521 sayNO;
2522 case SBOL:
c2a73568 2523 if (locinput == PL_bostr)
b8c5462f 2524 break;
d6a28714
JH
2525 sayNO;
2526 case GPOS:
3b0527fe 2527 if (locinput == reginfo->ganch)
d6a28714
JH
2528 break;
2529 sayNO;
2530 case EOL:
d6a28714
JH
2531 goto seol;
2532 case MEOL:
d6a28714 2533 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2534 sayNO;
b8c5462f 2535 break;
d6a28714
JH
2536 case SEOL:
2537 seol:
2538 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2539 sayNO;
d6a28714 2540 if (PL_regeol - locinput > 1)
b8c5462f 2541 sayNO;
b8c5462f 2542 break;
d6a28714
JH
2543 case EOS:
2544 if (PL_regeol != locinput)
b8c5462f 2545 sayNO;
d6a28714 2546 break;
ffc61ed2 2547 case SANY:
d6a28714 2548 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2549 sayNO;
f33976b4
DB
2550 if (do_utf8) {
2551 locinput += PL_utf8skip[nextchr];
2552 if (locinput > PL_regeol)
2553 sayNO;
2554 nextchr = UCHARAT(locinput);
2555 }
2556 else
2557 nextchr = UCHARAT(++locinput);
2558 break;
2559 case CANY:
2560 if (!nextchr && locinput >= PL_regeol)
2561 sayNO;
b8c5462f 2562 nextchr = UCHARAT(++locinput);
a0d0e21e 2563 break;
ffc61ed2 2564 case REG_ANY:
1aa99e6b
IH
2565 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2566 sayNO;
2567 if (do_utf8) {
b8c5462f 2568 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2569 if (locinput > PL_regeol)
2570 sayNO;
a0ed51b3 2571 nextchr = UCHARAT(locinput);
a0ed51b3 2572 }
1aa99e6b
IH
2573 else
2574 nextchr = UCHARAT(++locinput);
a0ed51b3 2575 break;
166ba7cd
DM
2576
2577#undef ST
2578#define ST st->u.trie
786e8c11
YO
2579 case TRIEC:
2580 /* In this case the charclass data is available inline so
2581 we can fail fast without a lot of extra overhead.
2582 */
2583 if (scan->flags == EXACT || !do_utf8) {
2584 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2585 DEBUG_EXECUTE_r(
2586 PerlIO_printf(Perl_debug_log,
2587 "%*s %sfailed to match trie start class...%s\n",
2588 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2589 );
2590 sayNO_SILENT;
2591 /* NOTREACHED */
2592 }
2593 }
2594 /* FALL THROUGH */
5b47454d 2595 case TRIE:
3dab1dad 2596 {
07be1b83 2597 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2598 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2599 trie_type = do_utf8 ?
2600 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2601 : trie_plain;
2602
2603 /* what trie are we using right now */
be8e71aa 2604 reg_trie_data * const trie
3dab1dad
YO
2605 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2606 U32 state = trie->startstate;
166ba7cd 2607
3dab1dad
YO
2608 if (trie->bitmap && trie_type != trie_utf8_fold &&
2609 !TRIE_BITMAP_TEST(trie,*locinput)
2610 ) {
2611 if (trie->states[ state ].wordnum) {
2612 DEBUG_EXECUTE_r(
2613 PerlIO_printf(Perl_debug_log,
2614 "%*s %smatched empty string...%s\n",
2615 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2616 );
2617 break;
2618 } else {
2619 DEBUG_EXECUTE_r(
2620 PerlIO_printf(Perl_debug_log,
786e8c11 2621 "%*s %sfailed to match trie start class...%s\n",
3dab1dad
YO
2622 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2623 );
2624 sayNO_SILENT;
2625 }
2626 }
166ba7cd 2627
786e8c11
YO
2628 {
2629 U8 *uc = ( U8* )locinput;
2630
2631 STRLEN len = 0;
2632 STRLEN foldlen = 0;
2633 U8 *uscan = (U8*)NULL;
2634 STRLEN bufflen=0;
2635 SV *sv_accept_buff = NULL;
2636 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2637
2638 ST.accepted = 0; /* how many accepting states we have seen */
2639 ST.B = next;
2640 ST.jump = trie->jump;
2641
2642#ifdef DEBUGGING
2643 ST.me = scan;
2644#endif
2645
2646
2647
07be1b83
YO
2648 /*
2649 traverse the TRIE keeping track of all accepting states
2650 we transition through until we get to a failing node.
2651 */
2652
a3621e74 2653 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11
YO
2654 U32 base = trie->states[ state ].trans.base;
2655 UV uvc;
2656 U16 charid;
2657 /* We use charid to hold the wordnum as we don't use it
2658 for charid until after we have done the wordnum logic.
2659 We define an alias just so that the wordnum logic reads
2660 more naturally. */
2661
2662#define got_wordnum charid
2663 got_wordnum = trie->states[ state ].wordnum;
2664
2665 if ( got_wordnum ) {
2666 if ( ! ST.accepted ) {
5b47454d
DM
2667 ENTER;
2668 SAVETMPS;
2669 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2670 sv_accept_buff=newSV(bufflen *
2671 sizeof(reg_trie_accepted) - 1);
786e8c11 2672 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2673 SvPOK_on(sv_accept_buff);
2674 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2675 SAVETMPS;
2676 ST.accept_buff =
5b47454d
DM
2677 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2678 }
786e8c11 2679 do {
166ba7cd 2680 if (ST.accepted >= bufflen) {
5b47454d 2681 bufflen *= 2;
166ba7cd 2682 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2683 SvGROW(sv_accept_buff,
2684 bufflen * sizeof(reg_trie_accepted));
2685 }
2686 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2687 + sizeof(reg_trie_accepted));
a3621e74 2688
786e8c11
YO
2689
2690 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2691 ST.accept_buff[ST.accepted].endpos = uc;
2692 ++ST.accepted;
2693 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2694 }
2695#undef got_wordnum
a3621e74 2696
07be1b83 2697 DEBUG_TRIE_EXECUTE_r({
786e8c11 2698 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2699 PerlIO_printf( Perl_debug_log,
786e8c11 2700 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
07be1b83 2701 2+PL_regindent * 2, "", PL_colors[4],
786e8c11 2702 (UV)state, (UV)ST.accepted );
07be1b83 2703 });
a3621e74
YO
2704
2705 if ( base ) {
4cadc6a9
YO
2706 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2707 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2708
5b47454d
DM
2709 if (charid &&
2710 (base + charid > trie->uniquecharcount )
2711 && (base + charid - 1 - trie->uniquecharcount
2712 < trie->lasttrans)
2713 && trie->trans[base + charid - 1 -
2714 trie->uniquecharcount].check == state)
2715 {
2716 state = trie->trans[base + charid - 1 -
2717 trie->uniquecharcount ].next;
2718 }
2719 else {
2720 state = 0;
2721 }
2722 uc += len;
2723
2724 }
2725 else {
a3621e74
YO
2726 state = 0;
2727 }
2728 DEBUG_TRIE_EXECUTE_r(
e4584336 2729 PerlIO_printf( Perl_debug_log,
786e8c11 2730 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2731 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2732 );
2733 }
166ba7cd 2734 if (!ST.accepted )
a3621e74 2735 sayNO;
a3621e74 2736
166ba7cd
DM
2737 DEBUG_EXECUTE_r(
2738 PerlIO_printf( Perl_debug_log,
2739 "%*s %sgot %"IVdf" possible matches%s\n",
2740 REPORT_CODE_OFF + PL_regindent * 2, "",
2741 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2742 );
786e8c11 2743 }}
166ba7cd
DM
2744
2745 /* FALL THROUGH */
2746
2747 case TRIE_next_fail: /* we failed - try next alterative */
2748
2749 if ( ST.accepted == 1 ) {
2750 /* only one choice left - just continue */
2751 DEBUG_EXECUTE_r({
2752 reg_trie_data * const trie
2753 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2754 SV ** const tmp = RX_DEBUG(reginfo->prog)
2755 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2756 : NULL;
2757 PerlIO_printf( Perl_debug_log,
2758 "%*s %sonly one match left: #%d <%s>%s\n",
2759 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2760 ST.accept_buff[ 0 ].wordnum,
2761 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2762 PL_colors[5] );
2763 });
2764 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2765 /* in this case we free tmps/leave before we call regmatch
2766 as we wont be using accept_buff again. */
2767 FREETMPS;
2768 LEAVE;
2769 locinput = PL_reginput;
2770 nextchr = UCHARAT(locinput);
786e8c11
YO
2771
2772 if ( !ST.jump )
2773 scan = ST.B;
2774 else
2775 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2776
166ba7cd
DM
2777 continue; /* execute rest of RE */
2778 }
2779
2780 if (!ST.accepted-- ) {
2781 FREETMPS;
2782 LEAVE;
2783 sayNO;
2784 }
2785
a3621e74 2786 /*
166ba7cd
DM
2787 There are at least two accepting states left. Presumably
2788 the number of accepting states is going to be low,
2789 typically two. So we simply scan through to find the one
2790 with lowest wordnum. Once we find it, we swap the last
2791 state into its place and decrement the size. We then try to
2792 match the rest of the pattern at the point where the word
2793 ends. If we succeed, control just continues along the
2794 regex; if we fail we return here to try the next accepting
2795 state
2796 */
a3621e74 2797
166ba7cd
DM
2798 {
2799 U32 best = 0;
2800 U32 cur;
2801 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2802 DEBUG_TRIE_EXECUTE_r(
f2278c82 2803 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
2804 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2805 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2806 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2807 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2808 );
2809
2810 if (ST.accept_buff[cur].wordnum <
2811 ST.accept_buff[best].wordnum)
2812 best = cur;
a3621e74 2813 }
166ba7cd
DM
2814
2815 DEBUG_EXECUTE_r({
2816 reg_trie_data * const trie
2817 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2818 SV ** const tmp = RX_DEBUG(reginfo->prog)
2819 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2820 : NULL;
2821 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2822 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2823 ST.accept_buff[best].wordnum,
2824 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2825 PL_colors[5] );
2826 });
2827
2828 if ( best<ST.accepted ) {
2829 reg_trie_accepted tmp = ST.accept_buff[ best ];
2830 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2831 ST.accept_buff[ ST.accepted ] = tmp;
2832 best = ST.accepted;
a3621e74 2833 }
166ba7cd 2834 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
786e8c11
YO
2835 if ( !ST.jump ) {
2836 PUSH_STATE_GOTO(TRIE_next, ST.B);
2837 /* NOTREACHED */
2838 } else {
2839 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2840 /* NOTREACHED */
2841 }
2842 /* NOTREACHED */
166ba7cd 2843 }
166ba7cd
DM
2844 /* NOTREACHED */
2845
2846#undef ST
2847
95b24440
DM
2848 case EXACT: {
2849 char *s = STRING(scan);
5d9a96ca 2850 st->ln = STR_LEN(scan);
eb160463 2851 if (do_utf8 != UTF) {
bc517b45 2852 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2853 char *l = locinput;
be8e71aa 2854 const char * const e = s + st->ln;
a72c7584 2855
5ff6fc6d
JH
2856 if (do_utf8) {
2857 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2858 while (s < e) {
a3b680e6 2859 STRLEN ulen;
1aa99e6b 2860 if (l >= PL_regeol)
5ff6fc6d
JH
2861 sayNO;
2862 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2863 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2864 uniflags))
5ff6fc6d 2865 sayNO;
bc517b45 2866 l += ulen;
5ff6fc6d 2867 s ++;
1aa99e6b 2868 }
5ff6fc6d
JH
2869 }
2870 else {
2871 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2872 while (s < e) {
a3b680e6 2873 STRLEN ulen;
1aa99e6b
IH
2874 if (l >= PL_regeol)
2875 sayNO;
5ff6fc6d 2876 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2877 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2878 uniflags))
1aa99e6b 2879 sayNO;
bc517b45 2880 s += ulen;
a72c7584 2881 l ++;
1aa99e6b 2882 }
5ff6fc6d 2883 }
1aa99e6b
IH
2884 locinput = l;
2885 nextchr = UCHARAT(locinput);
2886 break;
2887 }
bc517b45 2888 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2889 /* Inline the first character, for speed. */
2890 if (UCHARAT(s) != nextchr)
2891 sayNO;
5d9a96ca 2892 if (PL_regeol - locinput < st->ln)
d6a28714 2893 sayNO;
5d9a96ca 2894 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2895 sayNO;
5d9a96ca 2896 locinput += st->ln;
d6a28714
JH
2897 nextchr = UCHARAT(locinput);
2898 break;
95b24440 2899 }
d6a28714 2900 case EXACTFL:
b8c5462f
JH
2901 PL_reg_flags |= RF_tainted;
2902 /* FALL THROUGH */
95b24440 2903 case EXACTF: {
be8e71aa 2904 char * const s = STRING(scan);
5d9a96ca 2905 st->ln = STR_LEN(scan);
d6a28714 2906
d07ddd77
JH
2907 if (do_utf8 || UTF) {
2908 /* Either target or the pattern are utf8. */
be8e71aa 2909 const char * const l = locinput;
d07ddd77 2910 char *e = PL_regeol;
bc517b45 2911
5d9a96ca 2912 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2913 l, &e, 0, do_utf8)) {
5486206c
JH
2914 /* One more case for the sharp s:
2915 * pack("U0U*", 0xDF) =~ /ss/i,
2916 * the 0xC3 0x9F are the UTF-8
2917 * byte sequence for the U+00DF. */
2918 if (!(do_utf8 &&
2919 toLOWER(s[0]) == 's' &&
5d9a96ca 2920 st->ln >= 2 &&
5486206c
JH
2921 toLOWER(s[1]) == 's' &&
2922 (U8)l[0] == 0xC3 &&
2923 e - l >= 2 &&
2924 (U8)l[1] == 0x9F))
2925 sayNO;
2926 }
d07ddd77
JH
2927 locinput = e;
2928 nextchr = UCHARAT(locinput);
2929 break;
a0ed51b3 2930 }
d6a28714 2931
bc517b45
JH
2932 /* Neither the target and the pattern are utf8. */
2933
d6a28714
JH
2934 /* Inline the first character, for speed. */
2935 if (UCHARAT(s) != nextchr &&
2936 UCHARAT(s) != ((OP(scan) == EXACTF)
2937 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2938 sayNO;
5d9a96ca 2939 if (PL_regeol - locinput < st->ln)
b8c5462f 2940 sayNO;
5d9a96ca
DM
2941 if (st->ln > 1 && (OP(scan) == EXACTF
2942 ? ibcmp(s, locinput, st->ln)
2943 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2944 sayNO;
5d9a96ca 2945 locinput += st->ln;
d6a28714 2946 nextchr = UCHARAT(locinput);
a0d0e21e 2947 break;
95b24440 2948 }
d6a28714 2949 case ANYOF:
ffc61ed2 2950 if (do_utf8) {
9e55ce06
JH
2951 STRLEN inclasslen = PL_regeol - locinput;
2952
32fc9b6a 2953 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2954 sayNO_ANYOF;
ffc61ed2
JH
2955 if (locinput >= PL_regeol)
2956 sayNO;
0f0076b4 2957 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2958 nextchr = UCHARAT(locinput);
e0f9d4a8 2959 break;
ffc61ed2
JH
2960 }
2961 else {
2962 if (nextchr < 0)
2963 nextchr = UCHARAT(locinput);
32fc9b6a 2964 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 2965 sayNO_ANYOF;
ffc61ed2
JH
2966 if (!nextchr && locinput >= PL_regeol)
2967 sayNO;
2968 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2969 break;
2970 }
2971 no_anyof:
2972 /* If we might have the case of the German sharp s
2973 * in a casefolding Unicode character class. */
2974
ebc501f0
JH
2975 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2976 locinput += SHARP_S_SKIP;
e0f9d4a8 2977 nextchr = UCHARAT(locinput);
ffc61ed2 2978 }
e0f9d4a8
JH
2979 else
2980 sayNO;
b8c5462f 2981 break;
d6a28714 2982 case ALNUML:
b8c5462f
JH
2983 PL_reg_flags |= RF_tainted;
2984 /* FALL THROUGH */
d6a28714 2985 case ALNUM:
b8c5462f 2986 if (!nextchr)
4633a7c4 2987 sayNO;
ffc61ed2 2988 if (do_utf8) {
1a4fad37 2989 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2990 if (!(OP(scan) == ALNUM
bb7a0f54 2991 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2992 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2993 {
2994 sayNO;
a0ed51b3 2995 }
b8c5462f 2996 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2997 nextchr = UCHARAT(locinput);
2998 break;
2999 }
ffc61ed2 3000 if (!(OP(scan) == ALNUM
d6a28714 3001 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3002 sayNO;
b8c5462f 3003 nextchr = UCHARAT(++locinput);
a0d0e21e 3004 break;
d6a28714 3005 case NALNUML:
b8c5462f
JH
3006 PL_reg_flags |= RF_tainted;
3007 /* FALL THROUGH */
d6a28714
JH
3008 case NALNUM:
3009 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3010 sayNO;
ffc61ed2 3011 if (do_utf8) {
1a4fad37 3012 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3013 if (OP(scan) == NALNUM
bb7a0f54 3014 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3015 : isALNUM_LC_utf8((U8*)locinput))
3016 {
b8c5462f 3017 sayNO;
d6a28714 3018 }
b8c5462f
JH
3019 locinput += PL_utf8skip[nextchr];
3020 nextchr = UCHARAT(locinput);
3021 break;
3022 }
ffc61ed2 3023 if (OP(scan) == NALNUM
d6a28714 3024 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3025 sayNO;
76e3520e 3026 nextchr = UCHARAT(++locinput);
a0d0e21e 3027 break;
d6a28714
JH
3028 case BOUNDL:
3029 case NBOUNDL:
3280af22 3030 PL_reg_flags |= RF_tainted;
bbce6d69 3031 /* FALL THROUGH */
d6a28714
JH
3032 case BOUND:
3033 case NBOUND:
3034 /* was last char in word? */
ffc61ed2 3035 if (do_utf8) {
12d33761 3036 if (locinput == PL_bostr)
5d9a96ca 3037 st->ln = '\n';
ffc61ed2 3038 else {
a3b680e6 3039 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3040
4ad0818d 3041 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3042 }
3043 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3044 st->ln = isALNUM_uni(st->ln);
1a4fad37 3045 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3046 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3047 }
3048 else {
5d9a96ca 3049 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3050 n = isALNUM_LC_utf8((U8*)locinput);
3051 }
a0ed51b3 3052 }
d6a28714 3053 else {
5d9a96ca 3054 st->ln = (locinput != PL_bostr) ?
12d33761 3055 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3056 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3057 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3058 n = isALNUM(nextchr);
3059 }
3060 else {
5d9a96ca 3061 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3062 n = isALNUM_LC(nextchr);
3063 }
d6a28714 3064 }
5d9a96ca 3065 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3066 OP(scan) == BOUNDL))
3067 sayNO;
a0ed51b3 3068 break;
d6a28714 3069 case SPACEL:
3280af22 3070 PL_reg_flags |= RF_tainted;
bbce6d69 3071 /* FALL THROUGH */
d6a28714 3072 case SPACE:
9442cb0e 3073 if (!nextchr)
4633a7c4 3074 sayNO;
1aa99e6b 3075 if (do_utf8) {
fd400ab9 3076 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3077 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3078 if (!(OP(scan) == SPACE
bb7a0f54 3079 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3080 : isSPACE_LC_utf8((U8*)locinput)))
3081 {
3082 sayNO;
3083 }
3084 locinput += PL_utf8skip[nextchr];
3085 nextchr = UCHARAT(locinput);
3086 break;
d6a28714 3087 }
ffc61ed2
JH
3088 if (!(OP(scan) == SPACE
3089 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3090 sayNO;
3091 nextchr = UCHARAT(++locinput);
3092 }
3093 else {
3094 if (!(OP(scan) == SPACE
3095 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3096 sayNO;
3097 nextchr = UCHARAT(++locinput);
a0ed51b3 3098 }
a0ed51b3 3099 break;
d6a28714 3100 case NSPACEL:
3280af22 3101 PL_reg_flags |= RF_tainted;
bbce6d69 3102 /* FALL THROUGH */
d6a28714 3103 case NSPACE:
9442cb0e 3104 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3105 sayNO;
1aa99e6b 3106 if (do_utf8) {
1a4fad37 3107 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3108 if (OP(scan) == NSPACE
bb7a0f54 3109 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3110 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3111 {
3112 sayNO;
3113 }
3114 locinput += PL_utf8skip[nextchr];
3115 nextchr = UCHARAT(locinput);
3116 break;
a0ed51b3 3117 }
ffc61ed2 3118 if (OP(scan) == NSPACE
d6a28714 3119 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3120 sayNO;
76e3520e 3121 nextchr = UCHARAT(++locinput);
a0d0e21e 3122 break;
d6a28714 3123 case DIGITL:
a0ed51b3
LW
3124 PL_reg_flags |= RF_tainted;
3125 /* FALL THROUGH */
d6a28714 3126 case DIGIT:
9442cb0e 3127 if (!nextchr)
a0ed51b3 3128 sayNO;
1aa99e6b 3129 if (do_utf8) {
1a4fad37 3130 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3131 if (!(OP(scan) == DIGIT
bb7a0f54 3132 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3133 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3134 {
a0ed51b3 3135 sayNO;
dfe13c55 3136 }
6f06b55f 3137 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3138 nextchr = UCHARAT(locinput);
3139 break;
3140 }
ffc61ed2 3141 if (!(OP(scan) == DIGIT
9442cb0e 3142 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3143 sayNO;
3144 nextchr = UCHARAT(++locinput);
3145 break;
d6a28714 3146 case NDIGITL:
b8c5462f
JH
3147 PL_reg_flags |= RF_tainted;
3148 /* FALL THROUGH */
d6a28714 3149 case NDIGIT:
9442cb0e 3150 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3151 sayNO;
1aa99e6b 3152 if (do_utf8) {
1a4fad37 3153 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3154 if (OP(scan) == NDIGIT
bb7a0f54 3155 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3156 : isDIGIT_LC_utf8((U8*)locinput))
3157 {
a0ed51b3 3158 sayNO;
9442cb0e 3159 }
6f06b55f 3160 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3161 nextchr = UCHARAT(locinput);
3162 break;
3163 }
ffc61ed2 3164 if (OP(scan) == NDIGIT
9442cb0e 3165 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3166 sayNO;
3167 nextchr = UCHARAT(++locinput);
3168 break;
3169 case CLUMP:
b7c83a7e 3170 if (locinput >= PL_regeol)
a0ed51b3 3171 sayNO;
b7c83a7e 3172 if (do_utf8) {
1a4fad37 3173 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3174 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3175 sayNO;
3176 locinput += PL_utf8skip[nextchr];
3177 while (locinput < PL_regeol &&
3178 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3179 locinput += UTF8SKIP(locinput);
3180 if (locinput > PL_regeol)
3181 sayNO;
eb08e2da
JH
3182 }
3183 else
3184 locinput++;
a0ed51b3
LW
3185 nextchr = UCHARAT(locinput);
3186 break;
c8756f30 3187 case REFFL:
3280af22 3188 PL_reg_flags |= RF_tainted;
c8756f30 3189 /* FALL THROUGH */
c277df42 3190 case REF:
95b24440
DM
3191 case REFF: {
3192 char *s;
c277df42 3193 n = ARG(scan); /* which paren pair */
5d9a96ca 3194 st->ln = PL_regstartp[n];
2c2d71f5 3195 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3196 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3197 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3198 if (st->ln == PL_regendp[n])
a0d0e21e 3199 break;
a0ed51b3 3200
5d9a96ca 3201 s = PL_bostr + st->ln;
1aa99e6b 3202 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3203 char *l = locinput;
a3b680e6 3204 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3205 /*
3206 * Note that we can't do the "other character" lookup trick as
3207 * in the 8-bit case (no pun intended) because in Unicode we
3208 * have to map both upper and title case to lower case.
3209 */
3210 if (OP(scan) == REFF) {
3211 while (s < e) {
a3b680e6
AL
3212 STRLEN ulen1, ulen2;
3213 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3214 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3215
a0ed51b3
LW
3216 if (l >= PL_regeol)
3217 sayNO;
a2a2844f
JH
3218 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3219 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3220 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3221 sayNO;
a2a2844f
JH
3222 s += ulen1;
3223 l += ulen2;
a0ed51b3
LW
3224 }
3225 }
3226 locinput = l;
3227 nextchr = UCHARAT(locinput);
3228 break;
3229 }
3230
a0d0e21e 3231 /* Inline the first character, for speed. */
76e3520e 3232 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3233 (OP(scan) == REF ||
3234 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3235 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3236 sayNO;
5d9a96ca
DM
3237 st->ln = PL_regendp[n] - st->ln;
3238 if (locinput + st->ln > PL_regeol)
4633a7c4 3239 sayNO;
5d9a96ca
DM
3240 if (st->ln > 1 && (OP(scan) == REF
3241 ? memNE(s, locinput, st->ln)
c8756f30 3242 : (OP(scan) == REFF
5d9a96ca
DM
3243 ? ibcmp(s, locinput, st->ln)
3244 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3245 sayNO;
5d9a96ca 3246 locinput += st->ln;
76e3520e 3247 nextchr = UCHARAT(locinput);
a0d0e21e 3248 break;
95b24440 3249 }
a0d0e21e
LW
3250
3251 case NOTHING:
c277df42 3252 case TAIL:
a0d0e21e
LW
3253 break;
3254 case BACK:
3255 break;
40a82448
DM
3256
3257#undef ST
3258#define ST st->u.eval
3259
3260 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3261 {
c277df42 3262 SV *ret;
8e5e9ebe 3263 {
4aabdb9b
DM
3264 /* execute the code in the {...} */
3265 dSP;
6136c704 3266 SV ** const before = SP;
4aabdb9b
DM
3267 OP_4tree * const oop = PL_op;
3268 COP * const ocurcop = PL_curcop;
3269 PAD *old_comppad;
4aabdb9b
DM
3270
3271 n = ARG(scan);
32fc9b6a 3272 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3273 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3274 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3275 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3276
8e5e9ebe
RGS
3277 CALLRUNOPS(aTHX); /* Scalar context. */
3278 SPAGAIN;
3279 if (SP == before)
075aa684 3280 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3281 else {
3282 ret = POPs;
3283 PUTBACK;
3284 }
4aabdb9b
DM
3285
3286 PL_op = oop;
3287 PAD_RESTORE_LOCAL(old_comppad);
3288 PL_curcop = ocurcop;
3289 if (!st->logical) {
3290 /* /(?{...})/ */
3291 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3292 break;
3293 }
8e5e9ebe 3294 }
4aabdb9b
DM
3295 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3296 regexp *re;
4aabdb9b 3297 {
4f639d21
DM
3298 /* extract RE object from returned value; compiling if
3299 * necessary */
3300
6136c704 3301 MAGIC *mg = NULL;
be8e71aa 3302 const SV *sv;
faf82a0b
AE
3303 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3304 mg = mg_find(sv, PERL_MAGIC_qr);
3305 else if (SvSMAGICAL(ret)) {
3306 if (SvGMAGICAL(ret))
3307 sv_unmagic(ret, PERL_MAGIC_qr);
3308 else
3309 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3310 }
faf82a0b 3311
0f5d15d6
IZ
3312 if (mg) {
3313 re = (regexp *)mg->mg_obj;
df0003d4 3314 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3315 }
3316 else {
3317 STRLEN len;
6136c704 3318 const char * const t = SvPV_const(ret, len);
0f5d15d6 3319 PMOP pm;
a3b680e6 3320 const I32 osize = PL_regsize;
0f5d15d6 3321
5fcd1c1b 3322 Zero(&pm, 1, PMOP);
4aabdb9b 3323 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3324 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3325 if (!(SvFLAGS(ret)
faf82a0b
AE
3326 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3327 | SVs_GMG)))
14befaf4
DM
3328 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3329 PERL_MAGIC_qr,0,0);
0f5d15d6 3330 PL_regsize = osize;
0f5d15d6 3331 }
4aabdb9b 3332 }
aa283a38
DM
3333
3334 /* run the pattern returned from (??{...}) */
4aabdb9b 3335 DEBUG_EXECUTE_r(
ab3bbdeb
YO
3336 debug_start_match(re, do_utf8, locinput, PL_regeol,
3337 "Matching embedded");
4aabdb9b 3338 );
2c2d71f5 3339
40a82448
DM
3340 ST.cp = regcppush(0); /* Save *all* the positions. */
3341 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3342 *PL_reglastparen = 0;
3343 *PL_reglastcloseparen = 0;
4aabdb9b 3344 PL_reginput = locinput;
4aabdb9b
DM
3345
3346 /* XXXX This is too dramatic a measure... */
3347 PL_reg_maxiter = 0;
3348
5d9a96ca 3349 st->logical = 0;
40a82448 3350 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3351 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3352 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3353 ST.prev_rex = rex;
aa283a38 3354 rex = re;
aa283a38 3355
40a82448 3356 ST.B = next;
aa283a38 3357 /* now continue from first node in postoned RE */
40a82448 3358 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3359 /* NOTREACHED */
a0ed51b3 3360 }
4aabdb9b
DM
3361 /* /(?(?{...})X|Y)/ */
3362 st->sw = SvTRUE(ret);
3363 st->logical = 0;
c277df42
IZ
3364 break;
3365 }
40a82448
DM
3366
3367 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3368 if (ST.toggleutf)
3369 PL_reg_flags ^= RF_utf8;
3370 ReREFCNT_dec(rex);
3371 rex = ST.prev_rex;
3372 /* XXXX This is too dramatic a measure... */
3373 PL_reg_maxiter = 0;
3374 /* Restore parens of the caller without popping the
3375 * savestack */
3376 {
3377 const I32 tmp = PL_savestack_ix;
3378 PL_savestack_ix = ST.lastcp;
3379 regcppop(rex);
3380 PL_savestack_ix = tmp;
3381 }
3382 PL_reginput = locinput;
3383 /* continue at the node following the (??{...}) */
3384 scan = ST.B;
3385 continue;
3386
3387 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3388 /* Restore state to the outer re then re-throw the failure */
3389 if (ST.toggleutf)
3390 PL_reg_flags ^= RF_utf8;
3391 ReREFCNT_dec(rex);
3392 rex = ST.prev_rex;
3393
3394 /* XXXX This is too dramatic a measure... */
3395 PL_reg_maxiter = 0;
3396
3397 PL_reginput = locinput;
3398 REGCP_UNWIND(ST.lastcp);
3399 regcppop(rex);
3400 sayNO_SILENT;
3401
3402#undef ST
3403
a0d0e21e 3404 case OPEN:
c277df42 3405 n = ARG(scan); /* which paren pair */
3280af22
NIS
3406 PL_reg_start_tmp[n] = locinput;
3407 if (n > PL_regsize)
3408 PL_regsize = n;
a0d0e21e
LW
3409 break;
3410 case CLOSE:
c277df42 3411 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3412 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3413 PL_regendp[n] = locinput - PL_bostr;
eb160463 3414 if (n > (I32)*PL_reglastparen)
3280af22 3415 *PL_reglastparen = n;
a01268b5 3416 *PL_reglastcloseparen = n;
a0d0e21e 3417 break;
c277df42
IZ
3418 case GROUPP:
3419 n = ARG(scan); /* which paren pair */
5d9a96ca 3420 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3421 break;
3422 case IFTHEN:
2c2d71f5 3423 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3424 if (st->sw)
c277df42
IZ
3425 next = NEXTOPER(NEXTOPER(scan));
3426 else {
3427 next = scan + ARG(scan);
3428 if (OP(next) == IFTHEN) /* Fake one. */
3429 next = NEXTOPER(NEXTOPER(next));
3430 }
3431 break;
3432 case LOGICAL:
5d9a96ca 3433 st->logical = scan->flags;
c277df42 3434 break;
2ab05381 3435/*******************************************************************
a0374537
DM
3436 cc points to the regmatch_state associated with the most recent CURLYX.
3437 This struct contains info about the innermost (...)* loop (an
3438 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3439
3440 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3441
95b24440 3442 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3443
a0374537 3444 2) This regnode populates cc, and calls regmatch() recursively
95b24440