This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Minor re 'Debug' tweaks, also fix a bug in dumping certain patterns.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
198#define REGCP_SET(cp) \
199 DEBUG_STATE_r( \
ab3bbdeb 200 PerlIO_printf(Perl_debug_log, \
e4f74956 201 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
202 (IV)PL_savestack_ix)); \
203 cp = PL_savestack_ix
c3464db5 204
ab3bbdeb 205#define REGCP_UNWIND(cp) \
e4f74956 206 DEBUG_STATE_r( \
ab3bbdeb 207 if (cp != PL_savestack_ix) \
e4f74956
YO
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
210 (IV)(cp), (IV)PL_savestack_ix)); \
211 regcpblow(cp)
c277df42 212
76e3520e 213STATIC char *
097eb12c 214S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 215{
97aff369 216 dVAR;
b1ce53c5 217 I32 i;
a0d0e21e 218 char *input;
b1ce53c5 219
a3621e74
YO
220 GET_RE_DEBUG_FLAGS_DECL;
221
b1ce53c5 222 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 223 i = SSPOPINT;
b1ce53c5
JH
224 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
225 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 226 input = (char *) SSPOPPTR;
a01268b5 227 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
228 *PL_reglastparen = SSPOPINT;
229 PL_regsize = SSPOPINT;
b1ce53c5
JH
230
231 /* Now restore the parentheses context. */
41123dfd
JH
232 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
233 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 234 I32 tmps;
097eb12c 235 U32 paren = (U32)SSPOPINT;
3280af22 236 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
237 PL_regstartp[paren] = SSPOPINT;
238 tmps = SSPOPINT;
3280af22
NIS
239 if (paren <= *PL_reglastparen)
240 PL_regendp[paren] = tmps;
a3621e74 241 DEBUG_EXECUTE_r(
c3464db5 242 PerlIO_printf(Perl_debug_log,
b900a521 243 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 244 (UV)paren, (IV)PL_regstartp[paren],
b900a521 245 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 246 (IV)PL_regendp[paren],
3280af22 247 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 248 );
a0d0e21e 249 }
a3621e74 250 DEBUG_EXECUTE_r(
bb7a0f54 251 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 252 PerlIO_printf(Perl_debug_log,
faccc32b 253 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 254 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
255 }
256 );
daf18116 257#if 1
dafc8851
JH
258 /* It would seem that the similar code in regtry()
259 * already takes care of this, and in fact it is in
260 * a better location to since this code can #if 0-ed out
261 * but the code in regtry() is needed or otherwise tests
262 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
263 * (as of patchlevel 7877) will fail. Then again,
264 * this code seems to be necessary or otherwise
265 * building DynaLoader will fail:
266 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
267 * --jhi */
bb7a0f54 268 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
269 if (i > PL_regsize)
270 PL_regstartp[i] = -1;
271 PL_regendp[i] = -1;
a0d0e21e 272 }
dafc8851 273#endif
a0d0e21e
LW
274 return input;
275}
276
02db2b7b 277#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 278
a687059c 279/*
e50aee73 280 * pregexec and friends
a687059c
LW
281 */
282
76234dfb 283#ifndef PERL_IN_XSUB_RE
a687059c 284/*
c277df42 285 - pregexec - match a regexp against a string
a687059c 286 */
c277df42 287I32
864dbfa3 288Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 289 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
290/* strend: pointer to null at end of string */
291/* strbeg: real beginning of string */
292/* minend: end of match must be >=minend after stringarg. */
293/* nosave: For optimizations. */
294{
295 return
9041c2e3 296 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
297 nosave ? 0 : REXEC_COPY_STR);
298}
76234dfb 299#endif
22e551b9 300
9041c2e3 301/*
cad2e5aa
JH
302 * Need to implement the following flags for reg_anch:
303 *
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * USE_INTUIT_ML
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
309 * INTUIT_ONCE_ML
310 *
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
313 */
314
315/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316
3f7c398e 317/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
319
320/* XXXX We assume that strpos is strbeg unless sv. */
321
6eb5f6b9
JH
322/* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
328
2c2d71f5
JH
329/* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
30944b6d
IZ
332 regtry() should be needed.
333
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
336 b) Fixed substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 339 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
340 string which does not contradict any of them.
341 */
2c2d71f5 342
6eb5f6b9
JH
343/* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
346
cad2e5aa
JH
347char *
348Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
350{
97aff369 351 dVAR;
b7953727 352 register I32 start_shift = 0;
cad2e5aa 353 /* Should be nonnegative! */
b7953727 354 register I32 end_shift = 0;
2c2d71f5
JH
355 register char *s;
356 register SV *check;
a1933d95 357 char *strbeg;
cad2e5aa 358 char *t;
1de06328 359 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 360 I32 ml_anch;
bd61b366
SS
361 register char *other_last = NULL; /* other substr checked before this */
362 char *check_at = NULL; /* check substr found at this pos */
1df70142 363 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 364#ifdef DEBUGGING
890ce7af 365 const char * const i_strpos = strpos;
30944b6d 366#endif
a3621e74
YO
367
368 GET_RE_DEBUG_FLAGS_DECL;
369
a30b2f1f 370 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 371
b8d68ded 372 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
373 PL_reg_flags |= RF_utf8;
374 }
ab3bbdeb
YO
375 DEBUG_EXECUTE_r(
376 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
377 sv ? "Guessing start of match in sv for"
378 : "Guessing start of match in string for");
2a782b5b 379 );
cad2e5aa 380
c344f387
JH
381 /* CHR_DIST() would be more correct here but it makes things slow. */
382 if (prog->minlen > strend - strpos) {
a3621e74 383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 384 "String too short... [re_intuit_start]\n"));
cad2e5aa 385 goto fail;
2c2d71f5 386 }
1de06328 387
a1933d95 388 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 389 PL_regeol = strend;
33b8afdf
JH
390 if (do_utf8) {
391 if (!prog->check_utf8 && prog->check_substr)
392 to_utf8_substr(prog);
393 check = prog->check_utf8;
394 } else {
395 if (!prog->check_substr && prog->check_utf8)
396 to_byte_substr(prog);
397 check = prog->check_substr;
398 }
1de06328 399 if (check == &PL_sv_undef) {
a3621e74 400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 401 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
402 goto fail;
403 }
2c2d71f5 404 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
405 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
406 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 407 && !multiline ) ); /* Check after \n? */
cad2e5aa 408
7e25d62c
JH
409 if (!ml_anch) {
410 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
411 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 412 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
413 && sv && !SvROK(sv)
414 && (strpos != strbeg)) {
a3621e74 415 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
416 goto fail;
417 }
418 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 419 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 420 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
421 I32 slen;
422
1aa99e6b 423 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 424
653099ff
GS
425 if (SvTAIL(check)) {
426 slen = SvCUR(check); /* >= 1 */
cad2e5aa 427
9041c2e3 428 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 429 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 431 goto fail_finish;
cad2e5aa
JH
432 }
433 /* Now should match s[0..slen-2] */
434 slen--;
3f7c398e 435 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 436 || (slen > 1
3f7c398e 437 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 438 report_neq:
a3621e74 439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
440 goto fail_finish;
441 }
cad2e5aa 442 }
3f7c398e 443 else if (*SvPVX_const(check) != *s
653099ff 444 || ((slen = SvCUR(check)) > 1
3f7c398e 445 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 446 goto report_neq;
c315bfe8 447 check_at = s;
2c2d71f5 448 goto success_at_start;
7e25d62c 449 }
cad2e5aa 450 }
2c2d71f5 451 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 452 s = strpos;
2c2d71f5 453 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
454 end_shift = prog->check_end_shift;
455
2c2d71f5 456 if (!ml_anch) {
a3b680e6 457 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 458 - (SvTAIL(check) != 0);
a3b680e6 459 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
460
461 if (end_shift < eshift)
462 end_shift = eshift;
463 }
cad2e5aa 464 }
2c2d71f5 465 else { /* Can match at random position */
cad2e5aa
JH
466 ml_anch = 0;
467 s = strpos;
1de06328
YO
468 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
469 end_shift = prog->check_end_shift;
470
471 /* end shift should be non negative here */
cad2e5aa
JH
472 }
473
2c2d71f5 474#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 475 if (end_shift < 0)
1de06328 476 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 477 (IV)end_shift, prog->precomp);
2c2d71f5
JH
478#endif
479
2c2d71f5
JH
480 restart:
481 /* Find a possible match in the region s..strend by looking for
482 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
483
484 {
485 I32 srch_start_shift = start_shift;
486 I32 srch_end_shift = end_shift;
487 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
488 srch_end_shift -= ((strbeg - s) - srch_start_shift);
489 srch_start_shift = strbeg - s;
490 }
491 DEBUG_OPTIMISE_r({
492 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
493 (IV)prog->check_offset_min,
494 (IV)srch_start_shift,
495 (IV)srch_end_shift,
496 (IV)prog->check_end_shift);
497 });
498
cad2e5aa 499 if (flags & REXEC_SCREAM) {
cad2e5aa 500 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 501 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 502
2c2d71f5
JH
503 if (PL_screamfirst[BmRARE(check)] >= 0
504 || ( BmRARE(check) == '\n'
505 && (BmPREVIOUS(check) == SvCUR(check) - 1)
506 && SvTAIL(check) ))
9041c2e3 507 s = screaminstr(sv, check,
1de06328 508 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 509 else
2c2d71f5 510 goto fail_finish;
4addbd3b
HS
511 /* we may be pointing at the wrong string */
512 if (s && RX_MATCH_COPIED(prog))
3f7c398e 513 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
514 if (data)
515 *data->scream_olds = s;
516 }
1de06328
YO
517 else {
518 U8* start_point;
519 U8* end_point;
520 if (prog->reganch & ROPT_CANY_SEEN) {
521 start_point= (U8*)(s + srch_start_shift);
522 end_point= (U8*)(strend - srch_end_shift);
523 } else {
524 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
525 end_point= HOP3(strend, -srch_end_shift, strbeg);
526 }
527 DEBUG_OPTIMISE_r({
56570a2c 528 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 529 (int)(end_point - start_point),
fc8cd66c 530 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
531 start_point);
532 });
533
534 s = fbm_instr( start_point, end_point,
7fba1cd6 535 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
536 }
537 }
cad2e5aa
JH
538 /* Update the count-of-usability, remove useless subpatterns,
539 unshift s. */
2c2d71f5 540
ab3bbdeb
YO
541 DEBUG_EXECUTE_r({
542 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
543 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
544 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 545 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
546 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
547 ? "anchored" : "floating"),
548 quoted,
549 RE_SV_TAIL(check),
550 (s ? " at offset " : "...\n") );
551 });
2c2d71f5
JH
552
553 if (!s)
554 goto fail_finish;
2c2d71f5 555 /* Finish the diagnostic message */
a3621e74 556 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 557
1de06328
YO
558 /* XXX dmq: first branch is for positive lookbehind...
559 Our check string is offset from the beginning of the pattern.
560 So we need to do any stclass tests offset forward from that
561 point. I think. :-(
562 */
563
564
565
566 check_at=s;
567
568
2c2d71f5
JH
569 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
570 Start with the other substr.
571 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 572 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
573 *always* match. Probably should be marked during compile...
574 Probably it is right to do no SCREAM here...
575 */
576
1de06328
YO
577 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
578 : (prog->float_substr && prog->anchored_substr))
579 {
30944b6d 580 /* Take into account the "other" substring. */
2c2d71f5
JH
581 /* XXXX May be hopelessly wrong for UTF... */
582 if (!other_last)
6eb5f6b9 583 other_last = strpos;
33b8afdf 584 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
585 do_other_anchored:
586 {
890ce7af
AL
587 char * const last = HOP3c(s, -start_shift, strbeg);
588 char *last1, *last2;
be8e71aa 589 char * const saved_s = s;
33b8afdf 590 SV* must;
2c2d71f5 591
2c2d71f5
JH
592 t = s - prog->check_offset_max;
593 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 594 && (!do_utf8
0ce71af7 595 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 596 && t > strpos)))
6f207bd3 597 NOOP;
2c2d71f5
JH
598 else
599 t = strpos;
1aa99e6b 600 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
601 if (t < other_last) /* These positions already checked */
602 t = other_last;
1aa99e6b 603 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
604 if (last < last1)
605 last1 = last;
1de06328
YO
606 /* XXXX It is not documented what units *_offsets are in.
607 We assume bytes, but this is clearly wrong.
608 Meaning this code needs to be carefully reviewed for errors.
609 dmq.
610 */
611
2c2d71f5 612 /* On end-of-str: see comment below. */
33b8afdf
JH
613 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
614 if (must == &PL_sv_undef) {
615 s = (char*)NULL;
1de06328 616 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
617 }
618 else
619 s = fbm_instr(
620 (unsigned char*)t,
621 HOP3(HOP3(last1, prog->anchored_offset, strend)
622 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
623 must,
7fba1cd6 624 multiline ? FBMrf_MULTILINE : 0
33b8afdf 625 );
ab3bbdeb
YO
626 DEBUG_EXECUTE_r({
627 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
628 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
629 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 630 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
631 quoted, RE_SV_TAIL(must));
632 });
633
634
2c2d71f5
JH
635 if (!s) {
636 if (last1 >= last2) {
a3621e74 637 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
638 ", giving up...\n"));
639 goto fail_finish;
640 }
a3621e74 641 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 642 ", trying floating at offset %ld...\n",
be8e71aa 643 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
644 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
645 s = HOP3c(last, 1, strend);
2c2d71f5
JH
646 goto restart;
647 }
648 else {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 650 (long)(s - i_strpos)));
1aa99e6b
IH
651 t = HOP3c(s, -prog->anchored_offset, strbeg);
652 other_last = HOP3c(s, 1, strend);
be8e71aa 653 s = saved_s;
2c2d71f5
JH
654 if (t == strpos)
655 goto try_at_start;
2c2d71f5
JH
656 goto try_at_offset;
657 }
30944b6d 658 }
2c2d71f5
JH
659 }
660 else { /* Take into account the floating substring. */
33b8afdf 661 char *last, *last1;
be8e71aa 662 char * const saved_s = s;
33b8afdf
JH
663 SV* must;
664
665 t = HOP3c(s, -start_shift, strbeg);
666 last1 = last =
667 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
668 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
669 last = HOP3c(t, prog->float_max_offset, strend);
670 s = HOP3c(t, prog->float_min_offset, strend);
671 if (s < other_last)
672 s = other_last;
2c2d71f5 673 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
674 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
675 /* fbm_instr() takes into account exact value of end-of-str
676 if the check is SvTAIL(ed). Since false positives are OK,
677 and end-of-str is not later than strend we are OK. */
678 if (must == &PL_sv_undef) {
679 s = (char*)NULL;
1de06328 680 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
681 }
682 else
2c2d71f5 683 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
684 (unsigned char*)last + SvCUR(must)
685 - (SvTAIL(must)!=0),
7fba1cd6 686 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
687 DEBUG_EXECUTE_r({
688 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
689 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
690 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 691 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
692 quoted, RE_SV_TAIL(must));
693 });
33b8afdf
JH
694 if (!s) {
695 if (last1 == last) {
a3621e74 696 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
697 ", giving up...\n"));
698 goto fail_finish;
2c2d71f5 699 }
a3621e74 700 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 701 ", trying anchored starting at offset %ld...\n",
be8e71aa 702 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
703 other_last = last;
704 s = HOP3c(t, 1, strend);
705 goto restart;
706 }
707 else {
a3621e74 708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
709 (long)(s - i_strpos)));
710 other_last = s; /* Fix this later. --Hugo */
be8e71aa 711 s = saved_s;
33b8afdf
JH
712 if (t == strpos)
713 goto try_at_start;
714 goto try_at_offset;
715 }
2c2d71f5 716 }
cad2e5aa 717 }
2c2d71f5 718
1de06328 719
9ef43ace 720 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328
YO
721
722 DEBUG_OPTIMISE_r(
723 PerlIO_printf(Perl_debug_log,
724 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
725 (IV)prog->check_offset_min,
726 (IV)prog->check_offset_max,
727 (IV)(s-strpos),
728 (IV)(t-strpos),
729 (IV)(t-s),
730 (IV)(strend-strpos)
731 )
732 );
733
2c2d71f5 734 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 735 && (!do_utf8
9ef43ace 736 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
737 && t > strpos)))
738 {
2c2d71f5
JH
739 /* Fixed substring is found far enough so that the match
740 cannot start at strpos. */
741 try_at_offset:
cad2e5aa 742 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
743 /* Eventually fbm_*() should handle this, but often
744 anchored_offset is not 0, so this check will not be wasted. */
745 /* XXXX In the code below we prefer to look for "^" even in
746 presence of anchored substrings. And we search even
747 beyond the found float position. These pessimizations
748 are historical artefacts only. */
749 find_anchor:
2c2d71f5 750 while (t < strend - prog->minlen) {
cad2e5aa 751 if (*t == '\n') {
4ee3650e 752 if (t < check_at - prog->check_offset_min) {
33b8afdf 753 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
754 /* Since we moved from the found position,
755 we definitely contradict the found anchored
30944b6d
IZ
756 substr. Due to the above check we do not
757 contradict "check" substr.
758 Thus we can arrive here only if check substr
759 is float. Redo checking for "other"=="fixed".
760 */
9041c2e3 761 strpos = t + 1;
a3621e74 762 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 763 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
764 goto do_other_anchored;
765 }
4ee3650e
GS
766 /* We don't contradict the found floating substring. */
767 /* XXXX Why not check for STCLASS? */
cad2e5aa 768 s = t + 1;
a3621e74 769 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 770 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
771 goto set_useful;
772 }
4ee3650e
GS
773 /* Position contradicts check-string */
774 /* XXXX probably better to look for check-string
775 than for "\n", so one should lower the limit for t? */
a3621e74 776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 777 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 778 other_last = strpos = s = t + 1;
cad2e5aa
JH
779 goto restart;
780 }
781 t++;
782 }
a3621e74 783 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 784 PL_colors[0], PL_colors[1]));
2c2d71f5 785 goto fail_finish;
cad2e5aa 786 }
f5952150 787 else {
a3621e74 788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 789 PL_colors[0], PL_colors[1]));
f5952150 790 }
cad2e5aa
JH
791 s = t;
792 set_useful:
33b8afdf 793 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
794 }
795 else {
f5952150 796 /* The found string does not prohibit matching at strpos,
2c2d71f5 797 - no optimization of calling REx engine can be performed,
f5952150
GS
798 unless it was an MBOL and we are not after MBOL,
799 or a future STCLASS check will fail this. */
2c2d71f5
JH
800 try_at_start:
801 /* Even in this situation we may use MBOL flag if strpos is offset
802 wrt the start of the string. */
05b4157f 803 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 804 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
805 /* May be due to an implicit anchor of m{.*foo} */
806 && !(prog->reganch & ROPT_IMPLICIT))
807 {
cad2e5aa
JH
808 t = strpos;
809 goto find_anchor;
810 }
a3621e74 811 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 812 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 813 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 814 );
2c2d71f5 815 success_at_start:
30944b6d 816 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
817 && (do_utf8 ? (
818 prog->check_utf8 /* Could be deleted already */
819 && --BmUSEFUL(prog->check_utf8) < 0
820 && (prog->check_utf8 == prog->float_utf8)
821 ) : (
822 prog->check_substr /* Could be deleted already */
823 && --BmUSEFUL(prog->check_substr) < 0
824 && (prog->check_substr == prog->float_substr)
825 )))
66e933ab 826 {
cad2e5aa 827 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 828 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
829 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
830 if (do_utf8 ? prog->check_substr : prog->check_utf8)
831 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
832 prog->check_substr = prog->check_utf8 = NULL; /* disable */
833 prog->float_substr = prog->float_utf8 = NULL; /* clear */
834 check = NULL; /* abort */
cad2e5aa 835 s = strpos;
3cf5c195
IZ
836 /* XXXX This is a remnant of the old implementation. It
837 looks wasteful, since now INTUIT can use many
6eb5f6b9 838 other heuristics. */
cad2e5aa
JH
839 prog->reganch &= ~RE_USE_INTUIT;
840 }
841 else
842 s = strpos;
843 }
844
6eb5f6b9
JH
845 /* Last resort... */
846 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
847 /* trie stclasses are too expensive to use here, we are better off to
848 leave it to regmatch itself */
786e8c11 849 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
6eb5f6b9
JH
850 /* minlen == 0 is possible if regstclass is \b or \B,
851 and the fixed substr is ''$.
852 Since minlen is already taken into account, s+1 is before strend;
853 accidentally, minlen >= 1 guaranties no false positives at s + 1
854 even for \b or \B. But (minlen? 1 : 0) below assumes that
855 regstclass does not come from lookahead... */
856 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
857 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 858 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 859 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 860 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 861 : 1);
1de06328
YO
862 char * endpos;
863 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
864 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
865 else if (prog->float_substr || prog->float_utf8)
866 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
867 else
868 endpos= strend;
869
56570a2c 870 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
b57a0404 871 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
1de06328 872
6eb5f6b9 873 t = s;
3b0527fe 874 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
875 if (!s) {
876#ifdef DEBUGGING
cbbf8932 877 const char *what = NULL;
6eb5f6b9
JH
878#endif
879 if (endpos == strend) {
a3621e74 880 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
881 "Could not match STCLASS...\n") );
882 goto fail;
883 }
a3621e74 884 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 885 "This position contradicts STCLASS...\n") );
653099ff
GS
886 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
887 goto fail;
6eb5f6b9 888 /* Contradict one of substrings */
33b8afdf
JH
889 if (prog->anchored_substr || prog->anchored_utf8) {
890 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 891 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 892 hop_and_restart:
1aa99e6b 893 s = HOP3c(t, 1, strend);
66e933ab
GS
894 if (s + start_shift + end_shift > strend) {
895 /* XXXX Should be taken into account earlier? */
a3621e74 896 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
897 "Could not match STCLASS...\n") );
898 goto fail;
899 }
5e39e1e5
HS
900 if (!check)
901 goto giveup;
a3621e74 902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 903 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
904 what, (long)(s + start_shift - i_strpos)) );
905 goto restart;
906 }
66e933ab 907 /* Have both, check_string is floating */
6eb5f6b9
JH
908 if (t + start_shift >= check_at) /* Contradicts floating=check */
909 goto retry_floating_check;
910 /* Recheck anchored substring, but not floating... */
9041c2e3 911 s = check_at;
5e39e1e5
HS
912 if (!check)
913 goto giveup;
a3621e74 914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 915 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
916 (long)(other_last - i_strpos)) );
917 goto do_other_anchored;
918 }
60e71179
GS
919 /* Another way we could have checked stclass at the
920 current position only: */
921 if (ml_anch) {
922 s = t = t + 1;
5e39e1e5
HS
923 if (!check)
924 goto giveup;
a3621e74 925 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 926 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 927 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 928 goto try_at_offset;
66e933ab 929 }
33b8afdf 930 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 931 goto fail;
6eb5f6b9
JH
932 /* Check is floating subtring. */
933 retry_floating_check:
934 t = check_at - start_shift;
a3621e74 935 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
936 goto hop_and_restart;
937 }
b7953727 938 if (t != s) {
a3621e74 939 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 940 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
941 (long)(t - i_strpos), (long)(s - i_strpos))
942 );
943 }
944 else {
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
946 "Does not contradict STCLASS...\n");
947 );
948 }
6eb5f6b9 949 }
5e39e1e5 950 giveup:
a3621e74 951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
952 PL_colors[4], (check ? "Guessed" : "Giving up"),
953 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 954 return s;
2c2d71f5
JH
955
956 fail_finish: /* Substring not found */
33b8afdf
JH
957 if (prog->check_substr || prog->check_utf8) /* could be removed already */
958 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 959 fail:
a3621e74 960 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 961 PL_colors[4], PL_colors[5]));
bd61b366 962 return NULL;
cad2e5aa 963}
9661b544 964
786e8c11 965
3b0527fe 966
4cadc6a9
YO
967#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
968foldlen, foldbuf, uniflags) STMT_START { \
969 switch (trie_type) { \
970 case trie_utf8_fold: \
971 if ( foldlen>0 ) { \
972 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
973 foldlen -= len; \
974 uscan += len; \
975 len=0; \
976 } else { \
977 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
978 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
979 foldlen -= UNISKIP( uvc ); \
980 uscan = foldbuf + UNISKIP( uvc ); \
981 } \
982 break; \
983 case trie_utf8: \
984 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
985 break; \
986 case trie_plain: \
987 uvc = (UV)*uc; \
988 len = 1; \
989 } \
990 \
991 if (uvc < 256) { \
992 charid = trie->charmap[ uvc ]; \
993 } \
994 else { \
995 charid = 0; \
996 if (trie->widecharmap) { \
997 SV** const svpp = hv_fetch(trie->widecharmap, \
998 (char*)&uvc, sizeof(UV), 0); \
999 if (svpp) \
1000 charid = (U16)SvIV(*svpp); \
1001 } \
1002 } \
1003} STMT_END
1004
1005#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1006 if ( (CoNd) \
1007 && (ln == len || \
1008 ibcmp_utf8(s, NULL, 0, do_utf8, \
1009 m, NULL, ln, (bool)UTF)) \
1010 && (!reginfo || regtry(reginfo, s)) ) \
1011 goto got_it; \
1012 else { \
1013 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1014 uvchr_to_utf8(tmpbuf, c); \
1015 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1016 if ( f != c \
1017 && (f == c1 || f == c2) \
1018 && (ln == foldlen || \
1019 !ibcmp_utf8((char *) foldbuf, \
1020 NULL, foldlen, do_utf8, \
1021 m, \
1022 NULL, ln, (bool)UTF)) \
1023 && (!reginfo || regtry(reginfo, s)) ) \
1024 goto got_it; \
1025 } \
1026 s += len
1027
1028#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1029STMT_START { \
1030 while (s <= e) { \
1031 if ( (CoNd) \
1032 && (ln == 1 || !(OP(c) == EXACTF \
1033 ? ibcmp(s, m, ln) \
1034 : ibcmp_locale(s, m, ln))) \
1035 && (!reginfo || regtry(reginfo, s)) ) \
1036 goto got_it; \
1037 s++; \
1038 } \
1039} STMT_END
1040
1041#define REXEC_FBC_UTF8_SCAN(CoDe) \
1042STMT_START { \
1043 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1044 CoDe \
1045 s += uskip; \
1046 } \
1047} STMT_END
1048
1049#define REXEC_FBC_SCAN(CoDe) \
1050STMT_START { \
1051 while (s < strend) { \
1052 CoDe \
1053 s++; \
1054 } \
1055} STMT_END
1056
1057#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1058REXEC_FBC_UTF8_SCAN( \
1059 if (CoNd) { \
1060 if (tmp && (!reginfo || regtry(reginfo, s))) \
1061 goto got_it; \
1062 else \
1063 tmp = doevery; \
1064 } \
1065 else \
1066 tmp = 1; \
1067)
1068
1069#define REXEC_FBC_CLASS_SCAN(CoNd) \
1070REXEC_FBC_SCAN( \
1071 if (CoNd) { \
1072 if (tmp && (!reginfo || regtry(reginfo, s))) \
1073 goto got_it; \
1074 else \
1075 tmp = doevery; \
1076 } \
1077 else \
1078 tmp = 1; \
1079)
1080
1081#define REXEC_FBC_TRYIT \
1082if ((!reginfo || regtry(reginfo, s))) \
1083 goto got_it
1084
1085#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1086 if (do_utf8) { \
1087 UtFpReLoAd; \
1088 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1089 } \
1090 else { \
1091 REXEC_FBC_CLASS_SCAN(CoNd); \
1092 } \
1093 break
1094
1095#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1096 PL_reg_flags |= RF_tainted; \
1097 if (do_utf8) { \
1098 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1099 } \
1100 else { \
1101 REXEC_FBC_CLASS_SCAN(CoNd); \
1102 } \
1103 break
1104
786e8c11
YO
1105#define DUMP_EXEC_POS(li,s,doutf8) \
1106 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1107
1108/* We know what class REx starts with. Try to find this position... */
1109/* if reginfo is NULL, its a dryrun */
1110/* annoyingly all the vars in this routine have different names from their counterparts
1111 in regmatch. /grrr */
1112
3c3eec57 1113STATIC char *
07be1b83
YO
1114S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1115 const char *strend, const regmatch_info *reginfo)
a687059c 1116{
27da23d5 1117 dVAR;
1df70142 1118 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1119 char *m;
d8093b23 1120 STRLEN ln;
5dab1207 1121 STRLEN lnc;
078c425b 1122 register STRLEN uskip;
d8093b23
G
1123 unsigned int c1;
1124 unsigned int c2;
6eb5f6b9
JH
1125 char *e;
1126 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1127 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1128
6eb5f6b9
JH
1129 /* We know what class it must start with. */
1130 switch (OP(c)) {
6eb5f6b9 1131 case ANYOF:
388cc4de 1132 if (do_utf8) {
4cadc6a9 1133 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1134 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1135 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1136 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1137 }
1138 else {
1139 while (s < strend) {
1140 STRLEN skip = 1;
1141
32fc9b6a 1142 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1143 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1144 /* The assignment of 2 is intentional:
1145 * for the folded sharp s, the skip is 2. */
1146 (skip = SHARP_S_SKIP))) {
3b0527fe 1147 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1148 goto got_it;
1149 else
1150 tmp = doevery;
1151 }
1152 else
1153 tmp = 1;
1154 s += skip;
1155 }
a0d0e21e 1156 }
6eb5f6b9 1157 break;
f33976b4 1158 case CANY:
4cadc6a9 1159 REXEC_FBC_SCAN(
3b0527fe 1160 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1161 goto got_it;
1162 else
1163 tmp = doevery;
4cadc6a9 1164 );
f33976b4 1165 break;
6eb5f6b9 1166 case EXACTF:
5dab1207
NIS
1167 m = STRING(c);
1168 ln = STR_LEN(c); /* length to match in octets/bytes */
1169 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1170 if (UTF) {
a2a2844f 1171 STRLEN ulen1, ulen2;
5dab1207 1172 U8 *sm = (U8 *) m;
89ebb4a3
JH
1173 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1174 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1175 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1176
1177 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1178 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1179
89ebb4a3 1180 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1181 0, uniflags);
89ebb4a3 1182 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1183 0, uniflags);
5dab1207
NIS
1184 lnc = 0;
1185 while (sm < ((U8 *) m + ln)) {
1186 lnc++;
1187 sm += UTF8SKIP(sm);
1188 }
1aa99e6b
IH
1189 }
1190 else {
1191 c1 = *(U8*)m;
1192 c2 = PL_fold[c1];
1193 }
6eb5f6b9
JH
1194 goto do_exactf;
1195 case EXACTFL:
5dab1207
NIS
1196 m = STRING(c);
1197 ln = STR_LEN(c);
1198 lnc = (I32) ln;
d8093b23 1199 c1 = *(U8*)m;
6eb5f6b9
JH
1200 c2 = PL_fold_locale[c1];
1201 do_exactf:
db12adc6 1202 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1203
3b0527fe 1204 if (!reginfo && e < s)
6eb5f6b9 1205 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1206
60a8b682
JH
1207 /* The idea in the EXACTF* cases is to first find the
1208 * first character of the EXACTF* node and then, if
1209 * necessary, case-insensitively compare the full
1210 * text of the node. The c1 and c2 are the first
1211 * characters (though in Unicode it gets a bit
1212 * more complicated because there are more cases
7f16dd3d
JH
1213 * than just upper and lower: one needs to use
1214 * the so-called folding case for case-insensitive
1215 * matching (called "loose matching" in Unicode).
1216 * ibcmp_utf8() will do just that. */
60a8b682 1217
1aa99e6b 1218 if (do_utf8) {
575cac57 1219 UV c, f;
89ebb4a3 1220 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1221 STRLEN len, foldlen;
4ad0818d 1222 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1223 if (c1 == c2) {
5dab1207
NIS
1224 /* Upper and lower of 1st char are equal -
1225 * probably not a "letter". */
1aa99e6b 1226 while (s <= e) {
89ebb4a3 1227 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1228 uniflags);
4cadc6a9 1229 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1230 }
09091399
JH
1231 }
1232 else {
1aa99e6b 1233 while (s <= e) {
89ebb4a3 1234 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1235 uniflags);
80aecb99 1236
60a8b682 1237 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1238 * Note that not all the possible combinations
1239 * are handled here: some of them are handled
1240 * by the standard folding rules, and some of
1241 * them (the character class or ANYOF cases)
1242 * are handled during compiletime in
1243 * regexec.c:S_regclass(). */
880bd946
JH
1244 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1245 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1246 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1247
4cadc6a9 1248 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1249 }
09091399 1250 }
1aa99e6b
IH
1251 }
1252 else {
1253 if (c1 == c2)
4cadc6a9 1254 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1255 else
4cadc6a9 1256 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1257 }
1258 break;
bbce6d69 1259 case BOUNDL:
3280af22 1260 PL_reg_flags |= RF_tainted;
bbce6d69 1261 /* FALL THROUGH */
a0d0e21e 1262 case BOUND:
ffc61ed2 1263 if (do_utf8) {
12d33761 1264 if (s == PL_bostr)
ffc61ed2
JH
1265 tmp = '\n';
1266 else {
6136c704 1267 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1268 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1269 }
1270 tmp = ((OP(c) == BOUND ?
9041c2e3 1271 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1272 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1273 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1274 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1275 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1276 isALNUM_LC_utf8((U8*)s)))
1277 {
1278 tmp = !tmp;
4cadc6a9 1279 REXEC_FBC_TRYIT;
a687059c 1280 }
4cadc6a9 1281 );
a0d0e21e 1282 }
667bb95a 1283 else {
12d33761 1284 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1285 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1286 REXEC_FBC_SCAN(
ffc61ed2
JH
1287 if (tmp ==
1288 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1289 tmp = !tmp;
4cadc6a9 1290 REXEC_FBC_TRYIT;
a0ed51b3 1291 }
4cadc6a9 1292 );
a0ed51b3 1293 }
3b0527fe 1294 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1295 goto got_it;
1296 break;
bbce6d69 1297 case NBOUNDL:
3280af22 1298 PL_reg_flags |= RF_tainted;
bbce6d69 1299 /* FALL THROUGH */
a0d0e21e 1300 case NBOUND:
ffc61ed2 1301 if (do_utf8) {
12d33761 1302 if (s == PL_bostr)
ffc61ed2
JH
1303 tmp = '\n';
1304 else {
6136c704 1305 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1306 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1307 }
1308 tmp = ((OP(c) == NBOUND ?
9041c2e3 1309 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1310 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1311 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1312 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1313 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1314 isALNUM_LC_utf8((U8*)s)))
1315 tmp = !tmp;
4cadc6a9
YO
1316 else REXEC_FBC_TRYIT;
1317 );
a0d0e21e 1318 }
667bb95a 1319 else {
12d33761 1320 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1321 tmp = ((OP(c) == NBOUND ?
1322 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1323 REXEC_FBC_SCAN(
ffc61ed2
JH
1324 if (tmp ==
1325 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1326 tmp = !tmp;
4cadc6a9
YO
1327 else REXEC_FBC_TRYIT;
1328 );
a0ed51b3 1329 }
3b0527fe 1330 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1331 goto got_it;
1332 break;
a0d0e21e 1333 case ALNUM:
4cadc6a9
YO
1334 REXEC_FBC_CSCAN_PRELOAD(
1335 LOAD_UTF8_CHARCLASS_ALNUM(),
1336 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1337 isALNUM(*s)
1338 );
bbce6d69 1339 case ALNUML:
4cadc6a9
YO
1340 REXEC_FBC_CSCAN_TAINT(
1341 isALNUM_LC_utf8((U8*)s),
1342 isALNUM_LC(*s)
1343 );
a0d0e21e 1344 case NALNUM:
4cadc6a9
YO
1345 REXEC_FBC_CSCAN_PRELOAD(
1346 LOAD_UTF8_CHARCLASS_ALNUM(),
1347 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1348 !isALNUM(*s)
1349 );
bbce6d69 1350 case NALNUML:
4cadc6a9
YO
1351 REXEC_FBC_CSCAN_TAINT(
1352 !isALNUM_LC_utf8((U8*)s),
1353 !isALNUM_LC(*s)
1354 );
a0d0e21e 1355 case SPACE:
4cadc6a9
YO
1356 REXEC_FBC_CSCAN_PRELOAD(
1357 LOAD_UTF8_CHARCLASS_SPACE(),
1358 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1359 isSPACE(*s)
1360 );
bbce6d69 1361 case SPACEL:
4cadc6a9
YO
1362 REXEC_FBC_CSCAN_TAINT(
1363 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1364 isSPACE_LC(*s)
1365 );
a0d0e21e 1366 case NSPACE:
4cadc6a9
YO
1367 REXEC_FBC_CSCAN_PRELOAD(
1368 LOAD_UTF8_CHARCLASS_SPACE(),
1369 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1370 !isSPACE(*s)
1371 );
bbce6d69 1372 case NSPACEL:
4cadc6a9
YO
1373 REXEC_FBC_CSCAN_TAINT(
1374 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1375 !isSPACE_LC(*s)
1376 );
a0d0e21e 1377 case DIGIT:
4cadc6a9
YO
1378 REXEC_FBC_CSCAN_PRELOAD(
1379 LOAD_UTF8_CHARCLASS_DIGIT(),
1380 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1381 isDIGIT(*s)
1382 );
b8c5462f 1383 case DIGITL:
4cadc6a9
YO
1384 REXEC_FBC_CSCAN_TAINT(
1385 isDIGIT_LC_utf8((U8*)s),
1386 isDIGIT_LC(*s)
1387 );
a0d0e21e 1388 case NDIGIT:
4cadc6a9
YO
1389 REXEC_FBC_CSCAN_PRELOAD(
1390 LOAD_UTF8_CHARCLASS_DIGIT(),
1391 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1392 !isDIGIT(*s)
1393 );
b8c5462f 1394 case NDIGITL:
4cadc6a9
YO
1395 REXEC_FBC_CSCAN_TAINT(
1396 !isDIGIT_LC_utf8((U8*)s),
1397 !isDIGIT_LC(*s)
1398 );
1de06328
YO
1399 case AHOCORASICKC:
1400 case AHOCORASICK:
07be1b83
YO
1401 {
1402 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1403 trie_type = do_utf8 ?
1404 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1405 : trie_plain;
1406 /* what trie are we using right now */
1407 reg_ac_data *aho
1408 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1409 reg_trie_data *trie=aho->trie;
1410
1411 const char *last_start = strend - trie->minlen;
6148ee25 1412#ifdef DEBUGGING
07be1b83 1413 const char *real_start = s;
6148ee25 1414#endif
07be1b83 1415 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1416 SV *sv_points;
1417 U8 **points; /* map of where we were in the input string
786e8c11 1418 when reading a given char. For ASCII this
be8e71aa
YO
1419 is unnecessary overhead as the relationship
1420 is always 1:1, but for unicode, especially
1421 case folded unicode this is not true. */
f9e705e8 1422 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1423 U8 *bitmap=NULL;
1424
07be1b83
YO
1425
1426 GET_RE_DEBUG_FLAGS_DECL;
1427
be8e71aa
YO
1428 /* We can't just allocate points here. We need to wrap it in
1429 * an SV so it gets freed properly if there is a croak while
1430 * running the match */
1431 ENTER;
1432 SAVETMPS;
1433 sv_points=newSV(maxlen * sizeof(U8 *));
1434 SvCUR_set(sv_points,
1435 maxlen * sizeof(U8 *));
1436 SvPOK_on(sv_points);
1437 sv_2mortal(sv_points);
1438 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1439 if ( trie_type != trie_utf8_fold
1440 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1441 {
786e8c11
YO
1442 if (trie->bitmap)
1443 bitmap=(U8*)trie->bitmap;
1444 else
1445 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1446 }
786e8c11
YO
1447 /* this is the Aho-Corasick algorithm modified a touch
1448 to include special handling for long "unknown char"
1449 sequences. The basic idea being that we use AC as long
1450 as we are dealing with a possible matching char, when
1451 we encounter an unknown char (and we have not encountered
1452 an accepting state) we scan forward until we find a legal
1453 starting char.
1454 AC matching is basically that of trie matching, except
1455 that when we encounter a failing transition, we fall back
1456 to the current states "fail state", and try the current char
1457 again, a process we repeat until we reach the root state,
1458 state 1, or a legal transition. If we fail on the root state
1459 then we can either terminate if we have reached an accepting
1460 state previously, or restart the entire process from the beginning
1461 if we have not.
1462
1463 */
07be1b83
YO
1464 while (s <= last_start) {
1465 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1466 U8 *uc = (U8*)s;
1467 U16 charid = 0;
1468 U32 base = 1;
1469 U32 state = 1;
1470 UV uvc = 0;
1471 STRLEN len = 0;
1472 STRLEN foldlen = 0;
1473 U8 *uscan = (U8*)NULL;
1474 U8 *leftmost = NULL;
786e8c11
YO
1475#ifdef DEBUGGING
1476 U32 accepted_word= 0;
1477#endif
07be1b83
YO
1478 U32 pointpos = 0;
1479
1480 while ( state && uc <= (U8*)strend ) {
1481 int failed=0;
786e8c11
YO
1482 U32 word = aho->states[ state ].wordnum;
1483
1de06328
YO
1484 if( state==1 ) {
1485 if ( bitmap ) {
1486 DEBUG_TRIE_EXECUTE_r(
1487 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1488 dump_exec_pos( (char *)uc, c, strend, real_start,
1489 (char *)uc, do_utf8 );
1490 PerlIO_printf( Perl_debug_log,
1491 " Scanning for legal start char...\n");
1492 }
1493 );
1494 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1495 uc++;
786e8c11 1496 }
1de06328 1497 s= (char *)uc;
786e8c11 1498 }
786e8c11
YO
1499 if (uc >(U8*)last_start) break;
1500 }
1501
1502 if ( word ) {
1503 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1504 if (!leftmost || lpos < leftmost) {
1505 DEBUG_r(accepted_word=word);
07be1b83 1506 leftmost= lpos;
786e8c11 1507 }
07be1b83 1508 if (base==0) break;
786e8c11 1509
07be1b83
YO
1510 }
1511 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1512 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1513 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1514 DEBUG_TRIE_EXECUTE_r({
1515 dump_exec_pos( (char *)uc, c, strend, real_start,
1516 s, do_utf8 );
07be1b83 1517 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1518 " Charid:%3u CP:%4"UVxf" ",
1519 charid, uvc);
1520 });
07be1b83
YO
1521
1522 do {
6148ee25 1523#ifdef DEBUGGING
786e8c11 1524 word = aho->states[ state ].wordnum;
6148ee25 1525#endif
07be1b83
YO
1526 base = aho->states[ state ].trans.base;
1527
786e8c11
YO
1528 DEBUG_TRIE_EXECUTE_r({
1529 if (failed)
1530 dump_exec_pos( (char *)uc, c, strend, real_start,
1531 s, do_utf8 );
07be1b83 1532 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1533 "%sState: %4"UVxf", word=%"UVxf,
1534 failed ? " Fail transition to " : "",
1535 (UV)state, (UV)word);
1536 });
07be1b83
YO
1537 if ( base ) {
1538 U32 tmp;
1539 if (charid &&
1540 (base + charid > trie->uniquecharcount )
1541 && (base + charid - 1 - trie->uniquecharcount
1542 < trie->lasttrans)
1543 && trie->trans[base + charid - 1 -
1544 trie->uniquecharcount].check == state
1545 && (tmp=trie->trans[base + charid - 1 -
1546 trie->uniquecharcount ].next))
1547 {
786e8c11
YO
1548 DEBUG_TRIE_EXECUTE_r(
1549 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1550 state = tmp;
1551 break;
1552 }
1553 else {
786e8c11
YO
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - fail\n"));
1556 failed = 1;
1557 state = aho->fail[state];
07be1b83
YO
1558 }
1559 }
1560 else {
1561 /* we must be accepting here */
786e8c11
YO
1562 DEBUG_TRIE_EXECUTE_r(
1563 PerlIO_printf( Perl_debug_log," - accepting\n"));
1564 failed = 1;
07be1b83
YO
1565 break;
1566 }
1567 } while(state);
786e8c11 1568 uc += len;
07be1b83
YO
1569 if (failed) {
1570 if (leftmost)
1571 break;
786e8c11 1572 if (!state) state = 1;
07be1b83
YO
1573 }
1574 }
1575 if ( aho->states[ state ].wordnum ) {
1576 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1577 if (!leftmost || lpos < leftmost) {
1578 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1579 leftmost = lpos;
786e8c11 1580 }
07be1b83 1581 }
07be1b83
YO
1582 if (leftmost) {
1583 s = (char*)leftmost;
786e8c11
YO
1584 DEBUG_TRIE_EXECUTE_r({
1585 PerlIO_printf(
1586 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1587 (UV)accepted_word, s - real_start
1588 );
1589 });
be8e71aa
YO
1590 if (!reginfo || regtry(reginfo, s)) {
1591 FREETMPS;
1592 LEAVE;
07be1b83 1593 goto got_it;
be8e71aa 1594 }
07be1b83 1595 s = HOPc(s,1);
786e8c11
YO
1596 DEBUG_TRIE_EXECUTE_r({
1597 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1598 });
07be1b83 1599 } else {
786e8c11
YO
1600 DEBUG_TRIE_EXECUTE_r(
1601 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1602 break;
1603 }
1604 }
be8e71aa
YO
1605 FREETMPS;
1606 LEAVE;
07be1b83
YO
1607 }
1608 break;
b3c9acc1 1609 default:
3c3eec57
GS
1610 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1611 break;
d6a28714 1612 }
6eb5f6b9
JH
1613 return 0;
1614 got_it:
1615 return s;
1616}
1617
1618/*
1619 - regexec_flags - match a regexp against a string
1620 */
1621I32
1622Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1623 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1624/* strend: pointer to null at end of string */
1625/* strbeg: real beginning of string */
1626/* minend: end of match must be >=minend after stringarg. */
1627/* data: May be used for some additional optimizations. */
1628/* nosave: For optimizations. */
1629{
97aff369 1630 dVAR;
6eb5f6b9
JH
1631 register char *s;
1632 register regnode *c;
1633 register char *startpos = stringarg;
6eb5f6b9
JH
1634 I32 minlen; /* must match at least this many chars */
1635 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1636 I32 end_shift = 0; /* Same for the end. */ /* CC */
1637 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1638 char *scream_olds = NULL;
3dab1dad 1639 SV* const oreplsv = GvSV(PL_replgv);
f9f4320a 1640 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1641 I32 multiline;
0df25f3d 1642
3b0527fe 1643 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1644
1645 GET_RE_DEBUG_FLAGS_DECL;
1646
9d4ba2ae 1647 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1648
1649 /* Be paranoid... */
1650 if (prog == NULL || startpos == NULL) {
1651 Perl_croak(aTHX_ "NULL regexp parameter");
1652 return 0;
1653 }
1654
2757e526 1655 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1656 reginfo.prog = prog;
2757e526 1657
bac06658 1658 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1659 DEBUG_EXECUTE_r(
1660 debug_start_match(prog, do_utf8, startpos, strend,
1661 "Matching");
1662 );
bac06658 1663
6eb5f6b9 1664 minlen = prog->minlen;
1de06328
YO
1665
1666 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1668 "String too short [regexec_flags]...\n"));
1669 goto phooey;
1aa99e6b 1670 }
6eb5f6b9 1671
1de06328 1672
6eb5f6b9
JH
1673 /* Check validity of program. */
1674 if (UCHARAT(prog->program) != REG_MAGIC) {
1675 Perl_croak(aTHX_ "corrupted regexp program");
1676 }
1677
1678 PL_reg_flags = 0;
1679 PL_reg_eval_set = 0;
1680 PL_reg_maxiter = 0;
1681
1682 if (prog->reganch & ROPT_UTF8)
1683 PL_reg_flags |= RF_utf8;
1684
1685 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1686 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1687 PL_bostr = strbeg;
3b0527fe 1688 reginfo.sv = sv;
6eb5f6b9
JH
1689
1690 /* Mark end of line for $ (and such) */
1691 PL_regeol = strend;
1692
1693 /* see how far we have to get to not match where we matched before */
3b0527fe 1694 reginfo.till = startpos+minend;
6eb5f6b9 1695
6eb5f6b9
JH
1696 /* If there is a "must appear" string, look for it. */
1697 s = startpos;
1698
3b0527fe 1699 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1700 MAGIC *mg;
1701
1702 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1703 reginfo.ganch = startpos;
6eb5f6b9
JH
1704 else if (sv && SvTYPE(sv) >= SVt_PVMG
1705 && SvMAGIC(sv)
14befaf4
DM
1706 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1707 && mg->mg_len >= 0) {
3b0527fe 1708 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1709 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1710 if (s > reginfo.ganch)
6eb5f6b9 1711 goto phooey;
3b0527fe 1712 s = reginfo.ganch;
6eb5f6b9
JH
1713 }
1714 }
1715 else /* pos() not defined */
3b0527fe 1716 reginfo.ganch = strbeg;
6eb5f6b9
JH
1717 }
1718
a0714e2c 1719 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1720 re_scream_pos_data d;
1721
1722 d.scream_olds = &scream_olds;
1723 d.scream_pos = &scream_pos;
1724 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1725 if (!s) {
a3621e74 1726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1727 goto phooey; /* not present */
3fa9c3d7 1728 }
6eb5f6b9
JH
1729 }
1730
1de06328 1731
6eb5f6b9
JH
1732
1733 /* Simplest case: anchored match need be tried only once. */
1734 /* [unless only anchor is BOL and multiline is set] */
1735 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1736 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1737 goto got_it;
7fba1cd6 1738 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1739 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1740 {
1741 char *end;
1742
1743 if (minlen)
1744 dontbother = minlen - 1;
1aa99e6b 1745 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1746 /* for multiline we only have to try after newlines */
33b8afdf 1747 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1748 if (s == startpos)
1749 goto after_try;
1750 while (1) {
3b0527fe 1751 if (regtry(&reginfo, s))
6eb5f6b9
JH
1752 goto got_it;
1753 after_try:
1754 if (s >= end)
1755 goto phooey;
1756 if (prog->reganch & RE_USE_INTUIT) {
1757 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1758 if (!s)
1759 goto phooey;
1760 }
1761 else
1762 s++;
1763 }
1764 } else {
1765 if (s > startpos)
1766 s--;
1767 while (s < end) {
1768 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1769 if (regtry(&reginfo, s))
6eb5f6b9
JH
1770 goto got_it;
1771 }
1772 }
1773 }
1774 }
1775 goto phooey;
f9f4320a
YO
1776 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1777 {
1778 /* the warning about reginfo.ganch being used without intialization
1779 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1780 and we only enter this block when the same bit is set. */
3b0527fe 1781 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1782 goto got_it;
1783 goto phooey;
1784 }
1785
1786 /* Messy cases: unanchored match. */
33b8afdf 1787 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1788 /* we have /x+whatever/ */
1789 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1790 char ch;
bf93d4cc
GS
1791#ifdef DEBUGGING
1792 int did_match = 0;
1793#endif
33b8afdf
JH
1794 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1795 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1796 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1797
1aa99e6b 1798 if (do_utf8) {
4cadc6a9 1799 REXEC_FBC_SCAN(
6eb5f6b9 1800 if (*s == ch) {
a3621e74 1801 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1802 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1803 s += UTF8SKIP(s);
1804 while (s < strend && *s == ch)
1805 s += UTF8SKIP(s);
1806 }
4cadc6a9 1807 );
6eb5f6b9
JH
1808 }
1809 else {
4cadc6a9 1810 REXEC_FBC_SCAN(
6eb5f6b9 1811 if (*s == ch) {
a3621e74 1812 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1813 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1814 s++;
1815 while (s < strend && *s == ch)
1816 s++;
1817 }
4cadc6a9 1818 );
6eb5f6b9 1819 }
a3621e74 1820 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1821 PerlIO_printf(Perl_debug_log,
b7953727
JH
1822 "Did not find anchored character...\n")
1823 );
6eb5f6b9 1824 }
a0714e2c
SS
1825 else if (prog->anchored_substr != NULL
1826 || prog->anchored_utf8 != NULL
1827 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1828 && prog->float_max_offset < strend - s)) {
1829 SV *must;
1830 I32 back_max;
1831 I32 back_min;
1832 char *last;
6eb5f6b9 1833 char *last1; /* Last position checked before */
bf93d4cc
GS
1834#ifdef DEBUGGING
1835 int did_match = 0;
1836#endif
33b8afdf
JH
1837 if (prog->anchored_substr || prog->anchored_utf8) {
1838 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1839 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1840 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1841 back_max = back_min = prog->anchored_offset;
1842 } else {
1843 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1844 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1845 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1846 back_max = prog->float_max_offset;
1847 back_min = prog->float_min_offset;
1848 }
1de06328
YO
1849
1850
33b8afdf
JH
1851 if (must == &PL_sv_undef)
1852 /* could not downgrade utf8 check substring, so must fail */
1853 goto phooey;
1854
1de06328
YO
1855 if (back_min<0) {
1856 last = strend;
1857 } else {
1858 last = HOP3c(strend, /* Cannot start after this */
1859 -(I32)(CHR_SVLEN(must)
1860 - (SvTAIL(must) != 0) + back_min), strbeg);
1861 }
6eb5f6b9
JH
1862 if (s > PL_bostr)
1863 last1 = HOPc(s, -1);
1864 else
1865 last1 = s - 1; /* bogus */
1866
a0288114 1867 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1868 check_substr==must. */
1869 scream_pos = -1;
1870 dontbother = end_shift;
1871 strend = HOPc(strend, -dontbother);
1872 while ( (s <= last) &&
9041c2e3 1873 ((flags & REXEC_SCREAM)
1de06328 1874 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1875 end_shift, &scream_pos, 0))
1de06328 1876 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1877 (unsigned char*)strend, must,
7fba1cd6 1878 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1879 /* we may be pointing at the wrong string */
1880 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1881 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1882 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1883 if (HOPc(s, -back_max) > last1) {
1884 last1 = HOPc(s, -back_min);
1885 s = HOPc(s, -back_max);
1886 }
1887 else {
52657f30 1888 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1889
1890 last1 = HOPc(s, -back_min);
52657f30 1891 s = t;
6eb5f6b9 1892 }
1aa99e6b 1893 if (do_utf8) {
6eb5f6b9 1894 while (s <= last1) {
3b0527fe 1895 if (regtry(&reginfo, s))
6eb5f6b9
JH
1896 goto got_it;
1897 s += UTF8SKIP(s);
1898 }
1899 }
1900 else {
1901 while (s <= last1) {
3b0527fe 1902 if (regtry(&reginfo, s))
6eb5f6b9
JH
1903 goto got_it;
1904 s++;
1905 }
1906 }
1907 }
ab3bbdeb
YO
1908 DEBUG_EXECUTE_r(if (!did_match) {
1909 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1910 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1911 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1912 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1913 ? "anchored" : "floating"),
ab3bbdeb
YO
1914 quoted, RE_SV_TAIL(must));
1915 });
6eb5f6b9
JH
1916 goto phooey;
1917 }
786e8c11 1918 else if ( (c = prog->regstclass) ) {
f14c76ed 1919 if (minlen) {
be8e71aa 1920 const OPCODE op = OP(prog->regstclass);
66e933ab 1921 /* don't bother with what can't match */
786e8c11 1922 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1923 strend = HOPc(strend, -(minlen - 1));
1924 }
a3621e74 1925 DEBUG_EXECUTE_r({
be8e71aa 1926 SV * const prop = sv_newmortal();
32fc9b6a 1927 regprop(prog, prop, c);
0df25f3d 1928 {
ab3bbdeb
YO
1929 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1930 s,strend-s,60);
0df25f3d 1931 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1932 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1933 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1934 quoted, (int)(strend - s));
0df25f3d 1935 }
ffc61ed2 1936 });
3b0527fe 1937 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1938 goto got_it;
07be1b83 1939 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1940 }
1941 else {
1942 dontbother = 0;
a0714e2c 1943 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1944 /* Trim the end. */
d6a28714 1945 char *last;
33b8afdf
JH
1946 SV* float_real;
1947
1948 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1949 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1950 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1951
1952 if (flags & REXEC_SCREAM) {
33b8afdf 1953 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1954 end_shift, &scream_pos, 1); /* last one */
1955 if (!last)
ffc61ed2 1956 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1957 /* we may be pointing at the wrong string */
1958 else if (RX_MATCH_COPIED(prog))
3f7c398e 1959 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1960 }
d6a28714
JH
1961 else {
1962 STRLEN len;
cfd0369c 1963 const char * const little = SvPV_const(float_real, len);
d6a28714 1964
33b8afdf 1965 if (SvTAIL(float_real)) {
d6a28714
JH
1966 if (memEQ(strend - len + 1, little, len - 1))
1967 last = strend - len + 1;
7fba1cd6 1968 else if (!multiline)
9041c2e3 1969 last = memEQ(strend - len, little, len)
bd61b366 1970 ? strend - len : NULL;
b8c5462f 1971 else
d6a28714
JH
1972 goto find_last;
1973 } else {
1974 find_last:
9041c2e3 1975 if (len)
d6a28714 1976 last = rninstr(s, strend, little, little + len);
b8c5462f 1977 else
a0288114 1978 last = strend; /* matching "$" */
b8c5462f 1979 }
b8c5462f 1980 }
bf93d4cc 1981 if (last == NULL) {
a3621e74 1982 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1983 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1984 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1985 goto phooey; /* Should not happen! */
1986 }
d6a28714
JH
1987 dontbother = strend - last + prog->float_min_offset;
1988 }
1989 if (minlen && (dontbother < minlen))
1990 dontbother = minlen - 1;
1991 strend -= dontbother; /* this one's always in bytes! */
1992 /* We don't know much -- general case. */
1aa99e6b 1993 if (do_utf8) {
d6a28714 1994 for (;;) {
3b0527fe 1995 if (regtry(&reginfo, s))
d6a28714
JH
1996 goto got_it;
1997 if (s >= strend)
1998 break;
b8c5462f 1999 s += UTF8SKIP(s);
d6a28714
JH
2000 };
2001 }
2002 else {
2003 do {
3b0527fe 2004 if (regtry(&reginfo, s))
d6a28714
JH
2005 goto got_it;
2006 } while (s++ < strend);
2007 }
2008 }
2009
2010 /* Failure. */
2011 goto phooey;
2012
2013got_it:
2014 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2015
2016 if (PL_reg_eval_set) {
2017 /* Preserve the current value of $^R */
2018 if (oreplsv != GvSV(PL_replgv))
2019 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2020 restored, the value remains
2021 the same. */
4f639d21 2022 restore_pos(aTHX_ prog);
d6a28714
JH
2023 }
2024
2025 /* make sure $`, $&, $', and $digit will work later */
2026 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2027 RX_MATCH_COPY_FREE(prog);
d6a28714 2028 if (flags & REXEC_COPY_STR) {
be8e71aa 2029 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2030#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2031 if ((SvIsCOW(sv)
2032 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2033 if (DEBUG_C_TEST) {
2034 PerlIO_printf(Perl_debug_log,
2035 "Copy on write: regexp capture, type %d\n",
2036 (int) SvTYPE(sv));
2037 }
2038 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2039 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2040 assert (SvPOKp(prog->saved_copy));
2041 } else
2042#endif
2043 {
2044 RX_MATCH_COPIED_on(prog);
2045 s = savepvn(strbeg, i);
2046 prog->subbeg = s;
2047 }
d6a28714 2048 prog->sublen = i;
d6a28714
JH
2049 }
2050 else {
2051 prog->subbeg = strbeg;
2052 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2053 }
2054 }
9041c2e3 2055
d6a28714
JH
2056 return 1;
2057
2058phooey:
a3621e74 2059 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2060 PL_colors[4], PL_colors[5]));
d6a28714 2061 if (PL_reg_eval_set)
4f639d21 2062 restore_pos(aTHX_ prog);
d6a28714
JH
2063 return 0;
2064}
2065
2066/*
2067 - regtry - try match at specific point
2068 */
2069STATIC I32 /* 0 failure, 1 success */
3b0527fe 2070S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2071{
97aff369 2072 dVAR;
d6a28714
JH
2073 register I32 *sp;
2074 register I32 *ep;
2075 CHECKPOINT lastcp;
3b0527fe 2076 regexp *prog = reginfo->prog;
a3621e74 2077 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2078
2079 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2080 MAGIC *mg;
2081
2082 PL_reg_eval_set = RS_init;
a3621e74 2083 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2084 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2085 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2086 ));
e8347627 2087 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2088 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2089 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2090 SAVETMPS;
2091 /* Apparently this is not needed, judging by wantarray. */
e8347627 2092 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2093 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2094
3b0527fe 2095 if (reginfo->sv) {
d6a28714 2096 /* Make $_ available to executed code. */
3b0527fe 2097 if (reginfo->sv != DEFSV) {
59f00321 2098 SAVE_DEFSV;
3b0527fe 2099 DEFSV = reginfo->sv;
b8c5462f 2100 }
d6a28714 2101
3b0527fe
DM
2102 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2103 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2104 /* prepare for quick setting of pos */
d300d9fa
NC
2105#ifdef PERL_OLD_COPY_ON_WRITE
2106 if (SvIsCOW(sv))
2107 sv_force_normal_flags(sv, 0);
2108#endif
3dab1dad 2109 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2110 &PL_vtbl_mglob, NULL, 0);
d6a28714 2111 mg->mg_len = -1;
b8c5462f 2112 }
d6a28714
JH
2113 PL_reg_magic = mg;
2114 PL_reg_oldpos = mg->mg_len;
4f639d21 2115 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2116 }
09687e5a 2117 if (!PL_reg_curpm) {
a02a5408 2118 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2119#ifdef USE_ITHREADS
2120 {
be8e71aa 2121 SV* const repointer = newSViv(0);
577e12cc 2122 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2123 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2124 av_push(PL_regex_padav,repointer);
2125 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2126 PL_regex_pad = AvARRAY(PL_regex_padav);
2127 }
2128#endif
2129 }
aaa362c4 2130 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2131 PL_reg_oldcurpm = PL_curpm;
2132 PL_curpm = PL_reg_curpm;
2133 if (RX_MATCH_COPIED(prog)) {
2134 /* Here is a serious problem: we cannot rewrite subbeg,
2135 since it may be needed if this match fails. Thus
2136 $` inside (?{}) could fail... */
2137 PL_reg_oldsaved = prog->subbeg;
2138 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2139#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2140 PL_nrs = prog->saved_copy;
2141#endif
d6a28714
JH
2142 RX_MATCH_COPIED_off(prog);
2143 }
2144 else
bd61b366 2145 PL_reg_oldsaved = NULL;
d6a28714
JH
2146 prog->subbeg = PL_bostr;
2147 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2148 }
973dddac 2149 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2150 PL_reginput = startpos;
2151 PL_regstartp = prog->startp;
2152 PL_regendp = prog->endp;
2153 PL_reglastparen = &prog->lastparen;
a01268b5 2154 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2155 prog->lastparen = 0;
03994de8 2156 prog->lastcloseparen = 0;
d6a28714 2157 PL_regsize = 0;
a3621e74 2158 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2159 if (PL_reg_start_tmpl <= prog->nparens) {
2160 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2161 if(PL_reg_start_tmp)
2162 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2163 else
a02a5408 2164 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2165 }
2166
2167 /* XXXX What this code is doing here?!!! There should be no need
2168 to do this again and again, PL_reglastparen should take care of
3dd2943c 2169 this! --ilya*/
dafc8851
JH
2170
2171 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2172 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2173 * PL_reglastparen), is not needed at all by the test suite
2174 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2175 * enough, for building DynaLoader, or otherwise this
2176 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2177 * will happen. Meanwhile, this code *is* needed for the
2178 * above-mentioned test suite tests to succeed. The common theme
2179 * on those tests seems to be returning null fields from matches.
2180 * --jhi */
dafc8851 2181#if 1
d6a28714
JH
2182 sp = prog->startp;
2183 ep = prog->endp;
2184 if (prog->nparens) {
097eb12c 2185 register I32 i;
eb160463 2186 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2187 *++sp = -1;
2188 *++ep = -1;
2189 }
2190 }
dafc8851 2191#endif
02db2b7b 2192 REGCP_SET(lastcp);
3b0527fe 2193 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2194 prog->endp[0] = PL_reginput - PL_bostr;
2195 return 1;
2196 }
02db2b7b 2197 REGCP_UNWIND(lastcp);
d6a28714
JH
2198 return 0;
2199}
2200
02db2b7b 2201
8ba1375e
MJD
2202#define sayYES goto yes
2203#define sayNO goto no
e0f9d4a8 2204#define sayNO_ANYOF goto no_anyof
8ba1375e 2205#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2206#define sayNO_FINAL goto no_final
2207#define sayNO_SILENT goto do_no
2208#define saySAME(x) if (x) goto yes; else goto no
2209
f9f4320a
YO
2210/* we dont use STMT_START/END here because it leads to
2211 "unreachable code" warnings, which are bogus, but distracting. */
2212#define CACHEsayNO \
3298f257
DM
2213 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2214 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2215 (1<<st->u.whilem.cache_bit); \
f9f4320a 2216 sayNO
3298f257 2217
a3621e74 2218/* this is used to determine how far from the left messages like
265c4333
YO
2219 'failed...' are printed. It should be set such that messages
2220 are inline with the regop output that created them.
a3621e74 2221*/
265c4333 2222#define REPORT_CODE_OFF 32
a3621e74
YO
2223
2224
2225/* Make sure there is a test for this +1 options in re_tests */
2226#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2227
40a82448
DM
2228#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2229#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2230
86545054
DM
2231#define SLAB_FIRST(s) (&(s)->states[0])
2232#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2233
5d9a96ca
DM
2234/* grab a new slab and return the first slot in it */
2235
2236STATIC regmatch_state *
2237S_push_slab(pTHX)
2238{
54df2634
NC
2239#if PERL_VERSION < 9
2240 dMY_CXT;
2241#endif
5d9a96ca
DM
2242 regmatch_slab *s = PL_regmatch_slab->next;
2243 if (!s) {
2244 Newx(s, 1, regmatch_slab);
2245 s->prev = PL_regmatch_slab;
2246 s->next = NULL;
2247 PL_regmatch_slab->next = s;
2248 }
2249 PL_regmatch_slab = s;
86545054 2250 return SLAB_FIRST(s);
5d9a96ca 2251}
5b47454d 2252
95b24440
DM
2253/* simulate a recursive call to regmatch */
2254
2255#define REGMATCH(ns, where) \
5d9a96ca
DM
2256 st->scan = scan; \
2257 scan = (ns); \
2258 st->resume_state = resume_##where; \
95b24440
DM
2259 goto start_recurse; \
2260 resume_point_##where:
2261
40a82448
DM
2262/* push a new state then goto it */
2263
2264#define PUSH_STATE_GOTO(state, node) \
2265 scan = node; \
2266 st->resume_state = state; \
2267 goto push_state;
2268
2269/* push a new state with success backtracking, then goto it */
2270
2271#define PUSH_YES_STATE_GOTO(state, node) \
2272 scan = node; \
2273 st->resume_state = state; \
2274 goto push_yes_state;
2275
aa283a38 2276
aa283a38 2277
d6a28714
JH
2278/*
2279 - regmatch - main matching routine
2280 *
2281 * Conceptually the strategy is simple: check to see whether the current
2282 * node matches, call self recursively to see whether the rest matches,
2283 * and then act accordingly. In practice we make some effort to avoid
2284 * recursion, in particular by going through "ordinary" nodes (that don't
2285 * need to know whether the rest of the match failed) by a loop instead of
2286 * by recursion.
2287 */
2288/* [lwall] I've hoisted the register declarations to the outer block in order to
2289 * maybe save a little bit of pushing and popping on the stack. It also takes
2290 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2291 *
2292 * This function used to be heavily recursive, but since this had the
2293 * effect of blowing the CPU stack on complex regexes, it has been
2294 * restructured to be iterative, and to save state onto the heap rather
2295 * than the stack. Essentially whereever regmatch() used to be called, it
2296 * pushes the current state, notes where to return, then jumps back into
2297 * the main loop.
2298 *
2299 * Originally the structure of this function used to look something like
2300
2301 S_regmatch() {
2302 int a = 1, b = 2;
2303 ...
2304 while (scan != NULL) {
5d9a96ca 2305 a++; // do stuff with a and b
95b24440
DM
2306 ...
2307 switch (OP(scan)) {
2308 case FOO: {
2309 int local = 3;
2310 ...
2311 if (regmatch(...)) // recurse
2312 goto yes;
2313 }
2314 ...
2315 }
2316 }
2317 yes:
2318 return 1;
2319 }
2320
2321 * Now it looks something like this:
2322
5d9a96ca 2323 typedef struct {
95b24440
DM
2324 int a, b, local;
2325 int resume_state;
5d9a96ca 2326 } regmatch_state;
95b24440
DM
2327
2328 S_regmatch() {
5d9a96ca
DM
2329 regmatch_state *st = new();
2330 int depth=0;
2331 st->a++; // do stuff with a and b
95b24440
DM
2332 ...
2333 while (scan != NULL) {
2334 ...
2335 switch (OP(scan)) {
2336 case FOO: {
5d9a96ca 2337 st->local = 3;
95b24440 2338 ...
5d9a96ca
DM
2339 st->scan = scan;
2340 scan = ...;
2341 st->resume_state = resume_FOO;
2342 goto start_recurse; // recurse
95b24440 2343
5d9a96ca
DM
2344 resume_point_FOO:
2345 if (result)
95b24440
DM
2346 goto yes;
2347 }
2348 ...
2349 }
5d9a96ca
DM
2350 start_recurse:
2351 st = new(); push a new state
2352 st->a = 1; st->b = 2;
2353 depth++;
95b24440 2354 }
5d9a96ca 2355 yes:
95b24440 2356 result = 1;
5d9a96ca
DM
2357 if (depth--) {
2358 st = pop();
95b24440
DM
2359 switch (resume_state) {
2360 case resume_FOO:
2361 goto resume_point_FOO;
2362 ...
2363 }
2364 }
2365 return result
2366 }
2367
2368 * WARNING: this means that any line in this function that contains a
2369 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2370 * regmatch() using gotos instead. Thus the values of any local variables
2371 * not saved in the regmatch_state structure will have been lost when
2372 * execution resumes on the next line .
5d9a96ca
DM
2373 *
2374 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2375 * PL_regmatch_state always points to the currently active state, and
2376 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2377 * The first time regmatch is called, the first slab is allocated, and is
2378 * never freed until interpreter desctruction. When the slab is full,
2379 * a new one is allocated chained to the end. At exit from regmatch, slabs
2380 * allocated since entry are freed.
d6a28714 2381 */
95b24440 2382
40a82448 2383
5bc10b2c 2384#define DEBUG_STATE_pp(pp) \
265c4333 2385 DEBUG_STATE_r({ \
5bc10b2c
DM
2386 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2387 PerlIO_printf(Perl_debug_log, \
2388 " %*s"pp" %s\n", \
2389 depth*2, "", \
03363afd 2390 reg_name[st->resume_state] ); \
265c4333 2391 });
5bc10b2c 2392
40a82448 2393
3dab1dad 2394#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2395
3df15adc 2396#ifdef DEBUGGING
5bc10b2c 2397
ab3bbdeb
YO
2398STATIC void
2399S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2400 const char *start, const char *end, const char *blurb)
2401{
2402 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2403 if (!PL_colorset)
2404 reginitcolors();
2405 {
2406 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2407 prog->precomp, prog->prelen, 60);
2408
2409 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2410 start, end - start, 60);
2411
2412 PerlIO_printf(Perl_debug_log,
2413 "%s%s REx%s %s against %s\n",
2414 PL_colors[4], blurb, PL_colors[5], s0, s1);
2415
2416 if (do_utf8||utf8_pat)
1de06328
YO
2417 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2418 utf8_pat ? "pattern" : "",
2419 utf8_pat && do_utf8 ? " and " : "",
2420 do_utf8 ? "string" : ""
ab3bbdeb
YO
2421 );
2422 }
2423}
3df15adc
YO
2424
2425STATIC void
786e8c11
YO
2426S_dump_exec_pos(pTHX_ const char *locinput,
2427 const regnode *scan,
2428 const char *loc_regeol,
2429 const char *loc_bostr,
2430 const char *loc_reg_starttry,
2431 const bool do_utf8)
07be1b83 2432{
786e8c11 2433 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2434 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2435 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2436 /* The part of the string before starttry has one color
2437 (pref0_len chars), between starttry and current
2438 position another one (pref_len - pref0_len chars),
2439 after the current position the third one.
2440 We assume that pref0_len <= pref_len, otherwise we
2441 decrease pref0_len. */
786e8c11
YO
2442 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2443 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2444 int pref0_len;
2445
2446 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2447 pref_len++;
786e8c11
YO
2448 pref0_len = pref_len - (locinput - loc_reg_starttry);
2449 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2450 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2451 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2452 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2453 l--;
2454 if (pref0_len < 0)
2455 pref0_len = 0;
2456 if (pref0_len > pref_len)
2457 pref0_len = pref_len;
2458 {
3df15adc 2459 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2460
ab3bbdeb 2461 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2462 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2463
ab3bbdeb 2464 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2465 (locinput - pref_len + pref0_len),
1de06328 2466 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2467
ab3bbdeb 2468 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2469 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2470
1de06328 2471 const STRLEN tlen=len0+len1+len2;
3df15adc 2472 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2473 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2474 (IV)(locinput - loc_bostr),
07be1b83 2475 len0, s0,
07be1b83 2476 len1, s1,
07be1b83 2477 (docolor ? "" : "> <"),
07be1b83 2478 len2, s2,
f9f4320a 2479 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2480 "");
2481 }
2482}
3df15adc 2483
07be1b83
YO
2484#endif
2485
d6a28714 2486STATIC I32 /* 0 failure, 1 success */
3b0527fe 2487S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2488{
54df2634
NC
2489#if PERL_VERSION < 9
2490 dMY_CXT;
2491#endif
27da23d5 2492 dVAR;
95b24440 2493 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2494 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2495
3b0527fe
DM
2496 regexp *rex = reginfo->prog;
2497
5d9a96ca
DM
2498 regmatch_slab *orig_slab;
2499 regmatch_state *orig_state;
a3621e74 2500
5d9a96ca
DM
2501 /* the current state. This is a cached copy of PL_regmatch_state */
2502 register regmatch_state *st;
95b24440 2503
5d9a96ca
DM
2504 /* cache heavy used fields of st in registers */
2505 register regnode *scan;
2506 register regnode *next;
2507 register I32 n = 0; /* initialize to shut up compiler warning */
2508 register char *locinput = PL_reginput;
95b24440 2509
5d9a96ca
DM
2510 /* these variables are NOT saved during a recusive RFEGMATCH: */
2511 register I32 nextchr; /* is always set to UCHARAT(locinput) */
b69b0499 2512 bool result = 0; /* return value of S_regmatch */
5d9a96ca 2513 int depth = 0; /* depth of recursion */
77cb431f
DM
2514 regmatch_state *yes_state = NULL; /* state to pop to on success of
2515 subpattern */
faec1544 2516 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2517 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2518 U32 state_num;
95b24440 2519
10edeb5d
JH
2520 I32 parenfloor = 0;
2521
95b24440 2522#ifdef DEBUGGING
e68ec53f 2523 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2524#endif
2525
5d9a96ca
DM
2526 /* on first ever call to regmatch, allocate first slab */
2527 if (!PL_regmatch_slab) {
2528 Newx(PL_regmatch_slab, 1, regmatch_slab);
2529 PL_regmatch_slab->prev = NULL;
2530 PL_regmatch_slab->next = NULL;
86545054 2531 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2532 }
2533
2534 /* remember current high-water mark for exit */
2535 /* XXX this should be done with SAVE* instead */
2536 orig_slab = PL_regmatch_slab;
2537 orig_state = PL_regmatch_state;
2538
2539 /* grab next free state slot */
2540 st = ++PL_regmatch_state;
86545054 2541 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2542 st = PL_regmatch_state = S_push_slab(aTHX);
2543
2544 st->minmod = 0;
2545 st->sw = 0;
2546 st->logical = 0;
b8591aee 2547 cur_curlyx = NULL;
786e8c11 2548
d6a28714
JH
2549 /* Note that nextchr is a byte even in UTF */
2550 nextchr = UCHARAT(locinput);
2551 scan = prog;
2552 while (scan != NULL) {
8ba1375e 2553
a3621e74 2554 DEBUG_EXECUTE_r( {
6136c704 2555 SV * const prop = sv_newmortal();
1de06328 2556 regnode *rnext=regnext(scan);
786e8c11 2557 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2558 regprop(rex, prop, scan);
07be1b83
YO
2559
2560 PerlIO_printf(Perl_debug_log,
2561 "%3"IVdf":%*s%s(%"IVdf")\n",
5bc10b2c 2562 (IV)(scan - rex->program), depth*2, "",
07be1b83 2563 SvPVX_const(prop),
1de06328
YO
2564 (PL_regkind[OP(scan)] == END || !rnext) ?
2565 0 : (IV)(rnext - rex->program));
2a782b5b 2566 });
d6a28714
JH
2567
2568 next = scan + NEXT_OFF(scan);
2569 if (next == scan)
2570 next = NULL;
40a82448 2571 state_num = OP(scan);
d6a28714 2572
40a82448
DM
2573 reenter_switch:
2574 switch (state_num) {
d6a28714 2575 case BOL:
7fba1cd6 2576 if (locinput == PL_bostr)
d6a28714 2577 {
3b0527fe 2578 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2579 break;
2580 }
d6a28714
JH
2581 sayNO;
2582 case MBOL:
12d33761
HS
2583 if (locinput == PL_bostr ||
2584 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2585 {
b8c5462f
JH
2586 break;
2587 }
d6a28714
JH
2588 sayNO;
2589 case SBOL:
c2a73568 2590 if (locinput == PL_bostr)
b8c5462f 2591 break;
d6a28714
JH
2592 sayNO;
2593 case GPOS:
3b0527fe 2594 if (locinput == reginfo->ganch)
d6a28714
JH
2595 break;
2596 sayNO;
2597 case EOL:
d6a28714
JH
2598 goto seol;
2599 case MEOL:
d6a28714 2600 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2601 sayNO;
b8c5462f 2602 break;
d6a28714
JH
2603 case SEOL:
2604 seol:
2605 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2606 sayNO;
d6a28714 2607 if (PL_regeol - locinput > 1)
b8c5462f 2608 sayNO;
b8c5462f 2609 break;
d6a28714
JH
2610 case EOS:
2611 if (PL_regeol != locinput)
b8c5462f 2612 sayNO;
d6a28714 2613 break;
ffc61ed2 2614 case SANY:
d6a28714 2615 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2616 sayNO;
f33976b4
DB
2617 if (do_utf8) {
2618 locinput += PL_utf8skip[nextchr];
2619 if (locinput > PL_regeol)
2620 sayNO;
2621 nextchr = UCHARAT(locinput);
2622 }
2623 else
2624 nextchr = UCHARAT(++locinput);
2625 break;
2626 case CANY:
2627 if (!nextchr && locinput >= PL_regeol)
2628 sayNO;
b8c5462f 2629 nextchr = UCHARAT(++locinput);
a0d0e21e 2630 break;
ffc61ed2 2631 case REG_ANY:
1aa99e6b
IH
2632 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2633 sayNO;
2634 if (do_utf8) {
b8c5462f 2635 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2636 if (locinput > PL_regeol)
2637 sayNO;
a0ed51b3 2638 nextchr = UCHARAT(locinput);
a0ed51b3 2639 }
1aa99e6b
IH
2640 else
2641 nextchr = UCHARAT(++locinput);
a0ed51b3 2642 break;
166ba7cd
DM
2643
2644#undef ST
2645#define ST st->u.trie
786e8c11
YO
2646 case TRIEC:
2647 /* In this case the charclass data is available inline so
2648 we can fail fast without a lot of extra overhead.
2649 */
2650 if (scan->flags == EXACT || !do_utf8) {
2651 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2652 DEBUG_EXECUTE_r(
2653 PerlIO_printf(Perl_debug_log,
2654 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2655 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2656 );
2657 sayNO_SILENT;
2658 /* NOTREACHED */
2659 }
2660 }
2661 /* FALL THROUGH */
5b47454d 2662 case TRIE:
3dab1dad 2663 {
07be1b83 2664 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2665 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2666 trie_type = do_utf8 ?
2667 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2668 : trie_plain;
2669
2670 /* what trie are we using right now */
be8e71aa 2671 reg_trie_data * const trie
3dab1dad
YO
2672 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2673 U32 state = trie->startstate;
166ba7cd 2674
3dab1dad
YO
2675 if (trie->bitmap && trie_type != trie_utf8_fold &&
2676 !TRIE_BITMAP_TEST(trie,*locinput)
2677 ) {
2678 if (trie->states[ state ].wordnum) {
2679 DEBUG_EXECUTE_r(
2680 PerlIO_printf(Perl_debug_log,
2681 "%*s %smatched empty string...%s\n",
5bc10b2c 2682 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2683 );
2684 break;
2685 } else {
2686 DEBUG_EXECUTE_r(
2687 PerlIO_printf(Perl_debug_log,
786e8c11 2688 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2689 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2690 );
2691 sayNO_SILENT;
2692 }
2693 }
166ba7cd 2694
786e8c11
YO
2695 {
2696 U8 *uc = ( U8* )locinput;
2697
2698 STRLEN len = 0;
2699 STRLEN foldlen = 0;
2700 U8 *uscan = (U8*)NULL;
2701 STRLEN bufflen=0;
2702 SV *sv_accept_buff = NULL;
2703 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2704
2705 ST.accepted = 0; /* how many accepting states we have seen */
2706 ST.B = next;
2707 ST.jump = trie->jump;
2708
2709#ifdef DEBUGGING
2710 ST.me = scan;
2711#endif
2712
2713
2714
07be1b83
YO
2715 /*
2716 traverse the TRIE keeping track of all accepting states
2717 we transition through until we get to a failing node.
2718 */
2719
a3621e74 2720 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2721 U32 base = trie->states[ state ].trans.base;
f9f4320a 2722 UV uvc = 0;
786e8c11
YO
2723 U16 charid;
2724 /* We use charid to hold the wordnum as we don't use it
2725 for charid until after we have done the wordnum logic.
2726 We define an alias just so that the wordnum logic reads
2727 more naturally. */
2728
2729#define got_wordnum charid
2730 got_wordnum = trie->states[ state ].wordnum;
2731
2732 if ( got_wordnum ) {
2733 if ( ! ST.accepted ) {
5b47454d
DM
2734 ENTER;
2735 SAVETMPS;
2736 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2737 sv_accept_buff=newSV(bufflen *
2738 sizeof(reg_trie_accepted) - 1);
786e8c11 2739 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2740 SvPOK_on(sv_accept_buff);
2741 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2742 SAVETMPS;
2743 ST.accept_buff =
5b47454d
DM
2744 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2745 }
786e8c11 2746 do {
166ba7cd 2747 if (ST.accepted >= bufflen) {
5b47454d 2748 bufflen *= 2;
166ba7cd 2749 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2750 SvGROW(sv_accept_buff,
2751 bufflen * sizeof(reg_trie_accepted));
2752 }
2753 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2754 + sizeof(reg_trie_accepted));
a3621e74 2755
786e8c11
YO
2756
2757 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2758 ST.accept_buff[ST.accepted].endpos = uc;
2759 ++ST.accepted;
2760 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2761 }
2762#undef got_wordnum
a3621e74 2763
07be1b83 2764 DEBUG_TRIE_EXECUTE_r({
786e8c11 2765 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2766 PerlIO_printf( Perl_debug_log,
786e8c11 2767 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2768 2+depth * 2, "", PL_colors[4],
786e8c11 2769 (UV)state, (UV)ST.accepted );
07be1b83 2770 });
a3621e74
YO
2771
2772 if ( base ) {
4cadc6a9
YO
2773 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2774 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2775
5b47454d
DM
2776 if (charid &&
2777 (base + charid > trie->uniquecharcount )
2778 && (base + charid - 1 - trie->uniquecharcount
2779 < trie->lasttrans)
2780 && trie->trans[base + charid - 1 -
2781 trie->uniquecharcount].check == state)
2782 {
2783 state = trie->trans[base + charid - 1 -
2784 trie->uniquecharcount ].next;
2785 }
2786 else {
2787 state = 0;
2788 }
2789 uc += len;
2790
2791 }
2792 else {
a3621e74
YO
2793 state = 0;
2794 }
2795 DEBUG_TRIE_EXECUTE_r(
e4584336 2796 PerlIO_printf( Perl_debug_log,
786e8c11 2797 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2798 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2799 );
2800 }
166ba7cd 2801 if (!ST.accepted )
a3621e74 2802 sayNO;
a3621e74 2803
166ba7cd
DM
2804 DEBUG_EXECUTE_r(
2805 PerlIO_printf( Perl_debug_log,
2806 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 2807 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
2808 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2809 );
786e8c11 2810 }}
166ba7cd
DM
2811
2812 /* FALL THROUGH */
2813
2814 case TRIE_next_fail: /* we failed - try next alterative */
2815
2816 if ( ST.accepted == 1 ) {
2817 /* only one choice left - just continue */
2818 DEBUG_EXECUTE_r({
2819 reg_trie_data * const trie
2820 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2821 SV ** const tmp = RX_DEBUG(reginfo->prog)
2822 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2823 : NULL;
2824 PerlIO_printf( Perl_debug_log,
2825 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 2826 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd
DM
2827 ST.accept_buff[ 0 ].wordnum,
2828 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2829 PL_colors[5] );
2830 });
2831 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2832 /* in this case we free tmps/leave before we call regmatch
2833 as we wont be using accept_buff again. */
2834 FREETMPS;
2835 LEAVE;
2836 locinput = PL_reginput;
2837 nextchr = UCHARAT(locinput);
786e8c11
YO
2838
2839 if ( !ST.jump )
2840 scan = ST.B;
2841 else
2842 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2843
166ba7cd
DM
2844 continue; /* execute rest of RE */
2845 }
2846
2847 if (!ST.accepted-- ) {
2848 FREETMPS;
2849 LEAVE;
2850 sayNO;
2851 }
2852
a3621e74 2853 /*
166ba7cd
DM
2854 There are at least two accepting states left. Presumably
2855 the number of accepting states is going to be low,
2856 typically two. So we simply scan through to find the one
2857 with lowest wordnum. Once we find it, we swap the last
2858 state into its place and decrement the size. We then try to
2859 match the rest of the pattern at the point where the word
2860 ends. If we succeed, control just continues along the
2861 regex; if we fail we return here to try the next accepting
2862 state
2863 */
a3621e74 2864
166ba7cd
DM
2865 {
2866 U32 best = 0;
2867 U32 cur;
2868 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2869 DEBUG_TRIE_EXECUTE_r(
f2278c82 2870 PerlIO_printf( Perl_debug_log,
166ba7cd 2871 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 2872 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
2873 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2874 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2875 );
2876
2877 if (ST.accept_buff[cur].wordnum <
2878 ST.accept_buff[best].wordnum)
2879 best = cur;
a3621e74 2880 }
166ba7cd
DM
2881
2882 DEBUG_EXECUTE_r({
2883 reg_trie_data * const trie
2884 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2885 SV ** const tmp = RX_DEBUG(reginfo->prog)
2886 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2887 : NULL;
265c4333
YO
2888 regnode *nextop=!ST.jump ?
2889 ST.B :
2890 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2891 PerlIO_printf( Perl_debug_log,
2892 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 2893 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 2894 ST.accept_buff[best].wordnum,
265c4333
YO
2895 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2896 REG_NODE_NUM(nextop),
166ba7cd
DM
2897 PL_colors[5] );
2898 });
2899
2900 if ( best<ST.accepted ) {
2901 reg_trie_accepted tmp = ST.accept_buff[ best ];
2902 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2903 ST.accept_buff[ ST.accepted ] = tmp;
2904 best = ST.accepted;
a3621e74 2905 }
166ba7cd 2906 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
786e8c11
YO
2907 if ( !ST.jump ) {
2908 PUSH_STATE_GOTO(TRIE_next, ST.B);
2909 /* NOTREACHED */
2910 } else {
2911 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2912 /* NOTREACHED */
2913 }
2914 /* NOTREACHED */
166ba7cd 2915 }
166ba7cd
DM
2916 /* NOTREACHED */
2917
2918#undef ST
2919
95b24440
DM
2920 case EXACT: {
2921 char *s = STRING(scan);
5d9a96ca 2922 st->ln = STR_LEN(scan);
eb160463 2923 if (do_utf8 != UTF) {
bc517b45 2924 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2925 char *l = locinput;
be8e71aa 2926 const char * const e = s + st->ln;
a72c7584 2927
5ff6fc6d
JH
2928 if (do_utf8) {
2929 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2930 while (s < e) {
a3b680e6 2931 STRLEN ulen;
1aa99e6b 2932 if (l >= PL_regeol)
5ff6fc6d
JH
2933 sayNO;
2934 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2935 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2936 uniflags))
5ff6fc6d 2937 sayNO;
bc517b45 2938 l += ulen;
5ff6fc6d 2939 s ++;
1aa99e6b 2940 }
5ff6fc6d
JH
2941 }
2942 else {
2943 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2944 while (s < e) {
a3b680e6 2945 STRLEN ulen;
1aa99e6b
IH
2946 if (l >= PL_regeol)
2947 sayNO;
5ff6fc6d 2948 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2949 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2950 uniflags))
1aa99e6b 2951 sayNO;
bc517b45 2952 s += ulen;
a72c7584 2953 l ++;
1aa99e6b 2954 }
5ff6fc6d 2955 }
1aa99e6b
IH
2956 locinput = l;
2957 nextchr = UCHARAT(locinput);
2958 break;
2959 }
bc517b45 2960 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2961 /* Inline the first character, for speed. */
2962 if (UCHARAT(s) != nextchr)
2963 sayNO;
5d9a96ca 2964 if (PL_regeol - locinput < st->ln)
d6a28714 2965 sayNO;
5d9a96ca 2966 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2967 sayNO;
5d9a96ca 2968 locinput += st->ln;
d6a28714
JH
2969 nextchr = UCHARAT(locinput);
2970 break;
95b24440 2971 }
d6a28714 2972 case EXACTFL:
b8c5462f
JH
2973 PL_reg_flags |= RF_tainted;
2974 /* FALL THROUGH */
95b24440 2975 case EXACTF: {
be8e71aa 2976 char * const s = STRING(scan);
5d9a96ca 2977 st->ln = STR_LEN(scan);
d6a28714 2978
d07ddd77
JH
2979 if (do_utf8 || UTF) {
2980 /* Either target or the pattern are utf8. */
be8e71aa 2981 const char * const l = locinput;
d07ddd77 2982 char *e = PL_regeol;
bc517b45 2983
5d9a96ca 2984 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2985 l, &e, 0, do_utf8)) {
5486206c
JH
2986 /* One more case for the sharp s:
2987 * pack("U0U*", 0xDF) =~ /ss/i,
2988 * the 0xC3 0x9F are the UTF-8
2989 * byte sequence for the U+00DF. */
2990 if (!(do_utf8 &&
2991 toLOWER(s[0]) == 's' &&
5d9a96ca 2992 st->ln >= 2 &&
5486206c
JH
2993 toLOWER(s[1]) == 's' &&
2994 (U8)l[0] == 0xC3 &&
2995 e - l >= 2 &&
2996 (U8)l[1] == 0x9F))
2997 sayNO;
2998 }
d07ddd77
JH
2999 locinput = e;
3000 nextchr = UCHARAT(locinput);
3001 break;
a0ed51b3 3002 }
d6a28714 3003
bc517b45
JH
3004 /* Neither the target and the pattern are utf8. */
3005
d6a28714
JH
3006 /* Inline the first character, for speed. */
3007 if (UCHARAT(s) != nextchr &&
3008 UCHARAT(s) != ((OP(scan) == EXACTF)
3009 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3010 sayNO;
5d9a96ca 3011 if (PL_regeol - locinput < st->ln)
b8c5462f 3012 sayNO;
5d9a96ca
DM
3013 if (st->ln > 1 && (OP(scan) == EXACTF
3014 ? ibcmp(s, locinput, st->ln)
3015 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3016 sayNO;
5d9a96ca 3017 locinput += st->ln;
d6a28714 3018 nextchr = UCHARAT(locinput);
a0d0e21e 3019 break;
95b24440 3020 }
d6a28714 3021 case ANYOF:
ffc61ed2 3022 if (do_utf8) {
9e55ce06
JH
3023 STRLEN inclasslen = PL_regeol - locinput;
3024
32fc9b6a 3025 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3026 sayNO_ANYOF;
ffc61ed2
JH
3027 if (locinput >= PL_regeol)
3028 sayNO;
0f0076b4 3029 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3030 nextchr = UCHARAT(locinput);
e0f9d4a8 3031 break;
ffc61ed2
JH
3032 }
3033 else {
3034 if (nextchr < 0)
3035 nextchr = UCHARAT(locinput);
32fc9b6a 3036 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3037 sayNO_ANYOF;
ffc61ed2
JH
3038 if (!nextchr && locinput >= PL_regeol)
3039 sayNO;
3040 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3041 break;
3042 }
3043 no_anyof:
3044 /* If we might have the case of the German sharp s
3045 * in a casefolding Unicode character class. */
3046
ebc501f0
JH
3047 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3048 locinput += SHARP_S_SKIP;
e0f9d4a8 3049 nextchr = UCHARAT(locinput);
ffc61ed2 3050 }
e0f9d4a8
JH
3051 else
3052 sayNO;
b8c5462f 3053 break;
d6a28714 3054 case ALNUML:
b8c5462f
JH
3055 PL_reg_flags |= RF_tainted;
3056 /* FALL THROUGH */
d6a28714 3057 case ALNUM:
b8c5462f 3058 if (!nextchr)
4633a7c4 3059 sayNO;
ffc61ed2 3060 if (do_utf8) {
1a4fad37 3061 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3062 if (!(OP(scan) == ALNUM
bb7a0f54 3063 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3064 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3065 {
3066 sayNO;
a0ed51b3 3067 }
b8c5462f 3068 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3069 nextchr = UCHARAT(locinput);
3070 break;
3071 }
ffc61ed2 3072 if (!(OP(scan) == ALNUM
d6a28714 3073 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3074 sayNO;
b8c5462f 3075 nextchr = UCHARAT(++locinput);
a0d0e21e 3076 break;
d6a28714 3077 case NALNUML:
b8c5462f
JH
3078 PL_reg_flags |= RF_tainted;
3079 /* FALL THROUGH */
d6a28714
JH
3080 case NALNUM:
3081 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3082 sayNO;
ffc61ed2 3083 if (do_utf8) {
1a4fad37 3084 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3085 if (OP(scan) == NALNUM
bb7a0f54 3086 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3087 : isALNUM_LC_utf8((U8*)locinput))
3088 {
b8c5462f 3089 sayNO;
d6a28714 3090 }
b8c5462f
JH
3091 locinput += PL_utf8skip[nextchr];
3092 nextchr = UCHARAT(locinput);
3093 break;
3094 }
ffc61ed2 3095 if (OP(scan) == NALNUM
d6a28714 3096 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3097 sayNO;
76e3520e 3098 nextchr = UCHARAT(++locinput);
a0d0e21e 3099 break;
d6a28714
JH
3100 case BOUNDL:
3101 case NBOUNDL:
3280af22 3102 PL_reg_flags |= RF_tainted;
bbce6d69 3103 /* FALL THROUGH */
d6a28714
JH
3104 case BOUND:
3105 case NBOUND:
3106 /* was last char in word? */
ffc61ed2 3107 if (do_utf8) {
12d33761 3108 if (locinput == PL_bostr)
5d9a96ca 3109 st->ln = '\n';
ffc61ed2 3110 else {
a3b680e6 3111 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3112
4ad0818d 3113 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3114 }
3115 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3116 st->ln = isALNUM_uni(st->ln);
1a4fad37 3117 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3118 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3119 }
3120 else {
5d9a96ca 3121 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3122 n = isALNUM_LC_utf8((U8*)locinput);
3123 }
a0ed51b3 3124 }
d6a28714 3125 else {
5d9a96ca 3126 st->ln = (locinput != PL_bostr) ?
12d33761 3127 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3128 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3129 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3130 n = isALNUM(nextchr);
3131 }
3132 else {
5d9a96ca 3133 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3134 n = isALNUM_LC(nextchr);
3135 }
d6a28714 3136 }
5d9a96ca 3137 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3138 OP(scan) == BOUNDL))
3139 sayNO;
a0ed51b3 3140 break;
d6a28714 3141 case SPACEL:
3280af22 3142 PL_reg_flags |= RF_tainted;
bbce6d69 3143 /* FALL THROUGH */
d6a28714 3144 case SPACE:
9442cb0e 3145 if (!nextchr)
4633a7c4 3146 sayNO;
1aa99e6b 3147 if (do_utf8) {
fd400ab9 3148 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3149 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3150 if (!(OP(scan) == SPACE
bb7a0f54 3151 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3152 : isSPACE_LC_utf8((U8*)locinput)))
3153 {
3154 sayNO;
3155 }
3156 locinput += PL_utf8skip[nextchr];
3157 nextchr = UCHARAT(locinput);
3158 break;
d6a28714 3159 }
ffc61ed2
JH
3160 if (!(OP(scan) == SPACE
3161 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3162 sayNO;
3163 nextchr = UCHARAT(++locinput);
3164 }
3165 else {
3166 if (!(OP(scan) == SPACE
3167 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3168 sayNO;
3169 nextchr = UCHARAT(++locinput);
a0ed51b3 3170 }
a0ed51b3 3171 break;
d6a28714 3172 case NSPACEL:
3280af22 3173 PL_reg_flags |= RF_tainted;
bbce6d69 3174 /* FALL THROUGH */
d6a28714 3175 case NSPACE:
9442cb0e 3176 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3177 sayNO;
1aa99e6b 3178 if (do_utf8) {
1a4fad37 3179 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3180 if (OP(scan) == NSPACE
bb7a0f54 3181 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3182 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3183 {
3184 sayNO;
3185 }
3186 locinput += PL_utf8skip[nextchr];
3187 nextchr = UCHARAT(locinput);
3188 break;
a0ed51b3 3189 }
ffc61ed2 3190 if (OP(scan) == NSPACE
d6a28714 3191 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3192 sayNO;
76e3520e 3193 nextchr = UCHARAT(++locinput);
a0d0e21e 3194 break;
d6a28714 3195 case DIGITL:
a0ed51b3
LW
3196 PL_reg_flags |= RF_tainted;
3197 /* FALL THROUGH */
d6a28714 3198 case DIGIT:
9442cb0e 3199 if (!nextchr)
a0ed51b3 3200 sayNO;
1aa99e6b 3201 if (do_utf8) {
1a4fad37 3202 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3203 if (!(OP(scan) == DIGIT
bb7a0f54 3204 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3205 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3206 {
a0ed51b3 3207 sayNO;
dfe13c55 3208 }
6f06b55f 3209 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3210 nextchr = UCHARAT(locinput);
3211 break;
3212 }
ffc61ed2 3213 if (!(OP(scan) == DIGIT
9442cb0e 3214 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3215 sayNO;
3216 nextchr = UCHARAT(++locinput);
3217 break;
d6a28714 3218 case NDIGITL:
b8c5462f
JH
3219 PL_reg_flags |= RF_tainted;
3220 /* FALL THROUGH */
d6a28714 3221 case NDIGIT:
9442cb0e 3222 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3223 sayNO;
1aa99e6b 3224 if (do_utf8) {
1a4fad37 3225 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3226 if (OP(scan) == NDIGIT
bb7a0f54 3227 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3228 : isDIGIT_LC_utf8((U8*)locinput))
3229 {
a0ed51b3 3230 sayNO;
9442cb0e 3231 }
6f06b55f 3232 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3233 nextchr = UCHARAT(locinput);
3234 break;
3235 }
ffc61ed2 3236 if (OP(scan) == NDIGIT
9442cb0e 3237 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3238 sayNO;
3239 nextchr = UCHARAT(++locinput);
3240 break;
3241 case CLUMP:
b7c83a7e 3242 if (locinput >= PL_regeol)
a0ed51b3 3243 sayNO;
b7c83a7e 3244 if (do_utf8) {
1a4fad37 3245 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3246 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3247 sayNO;
3248 locinput += PL_utf8skip[nextchr];
3249 while (locinput < PL_regeol &&
3250 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3251 locinput += UTF8SKIP(locinput);
3252 if (locinput > PL_regeol)
3253 sayNO;
eb08e2da
JH
3254 }
3255 else
3256 locinput++;
a0ed51b3
LW
3257 nextchr = UCHARAT(locinput);
3258 break;
c8756f30 3259 case REFFL:
3280af22 3260 PL_reg_flags |= RF_tainted;
c8756f30 3261 /* FALL THROUGH */
c277df42 3262 case REF:
95b24440
DM
3263 case REFF: {
3264 char *s;
c277df42 3265 n = ARG(scan); /* which paren pair */
5d9a96ca 3266 st->ln = PL_regstartp[n];
2c2d71f5 3267 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3268 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3269 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3270 if (st->ln == PL_regendp[n])
a0d0e21e 3271 break;
a0ed51b3 3272
5d9a96ca 3273 s = PL_bostr + st->ln;
1aa99e6b 3274 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3275 char *l = locinput;
a3b680e6 3276 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3277 /*
3278 * Note that we can't do the "other character" lookup trick as
3279 * in the 8-bit case (no pun intended) because in Unicode we
3280 * have to map both upper and title case to lower case.
3281 */
3282 if (OP(scan) == REFF) {
3283 while (s < e) {
a3b680e6
AL
3284 STRLEN ulen1, ulen2;
3285 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3286 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3287
a0ed51b3
LW
3288 if (l >= PL_regeol)
3289 sayNO;
a2a2844f
JH
3290 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3291 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3292 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3293 sayNO;
a2a2844f
JH
3294 s += ulen1;
3295 l += ulen2;
a0ed51b3
LW
3296 }
3297 }
3298 locinput = l;
3299 nextchr = UCHARAT(locinput);
3300 break;
3301 }
3302
a0d0e21e 3303 /* Inline the first character, for speed. */
76e3520e 3304 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3305 (OP(scan) == REF ||
3306 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3307 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3308 sayNO;
5d9a96ca
DM
3309 st->ln = PL_regendp[n] - st->ln;
3310 if (locinput + st->ln > PL_regeol)
4633a7c4 3311 sayNO;
5d9a96ca
DM
3312 if (st->ln > 1 && (OP(scan) == REF
3313 ? memNE(s, locinput, st->ln)
c8756f30 3314 : (OP(scan) == REFF
5d9a96ca
DM
3315 ? ibcmp(s, locinput, st->ln)
3316 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3317 sayNO;
5d9a96ca 3318 locinput += st->ln;
76e3520e 3319 nextchr = UCHARAT(locinput);
a0d0e21e 3320 break;
95b24440 3321 }
a0d0e21e
LW
3322
3323 case NOTHING:
c277df42 3324 case TAIL:
a0d0e21e
LW
3325 break;
3326 case BACK:
3327 break;
40a82448
DM
3328
3329#undef ST
3330#define ST st->u.eval
3331
3332 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3333 {
c277df42 3334 SV *ret;
8e5e9ebe 3335 {
4aabdb9b
DM
3336 /* execute the code in the {...} */
3337 dSP;
6136c704 3338 SV ** const before = SP;
4aabdb9b
DM
3339 OP_4tree * const oop = PL_op;
3340 COP * const ocurcop = PL_curcop;
3341 PAD *old_comppad;
4aabdb9b
DM
3342
3343 n = ARG(scan);
32fc9b6a 3344 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3345 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3346 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3347 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3348
8e5e9ebe
RGS
3349 CALLRUNOPS(aTHX); /* Scalar context. */
3350 SPAGAIN;
3351 if (SP == before)
075aa684 3352 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3353 else {
3354 ret = POPs;
3355 PUTBACK;
3356 }
4aabdb9b
DM
3357
3358 PL_op = oop;
3359 PAD_RESTORE_LOCAL(old_comppad);
3360 PL_curcop = ocurcop;
3361 if (!st->logical) {
3362 /* /(?{...})/ */
3363 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3364 break;
3365 }
8e5e9ebe 3366 }
4aabdb9b
DM
3367 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3368 regexp *re;
4aabdb9b 3369 {
4f639d21
DM
3370 /* extract RE object from returned value; compiling if
3371 * necessary */
3372
6136c704 3373 MAGIC *mg = NULL;
be8e71aa 3374 const SV *sv;
faf82a0b
AE
3375 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3376 mg = mg_find(sv, PERL_MAGIC_qr);
3377 else if (SvSMAGICAL(ret)) {
3378 if (SvGMAGICAL(ret))
3379 sv_unmagic(ret, PERL_MAGIC_qr);
3380 else
3381 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3382 }
faf82a0b 3383
0f5d15d6
IZ
3384 if (mg) {
3385 re = (regexp *)mg->mg_obj;
df0003d4 3386 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3387 }
3388 else {
3389 STRLEN len;
6136c704 3390 const char * const t = SvPV_const(ret, len);
0f5d15d6 3391 PMOP pm;
a3b680e6 3392 const I32 osize = PL_regsize;
0f5d15d6 3393
5fcd1c1b 3394 Zero(&pm, 1, PMOP);
4aabdb9b 3395 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
f9f4320a 3396 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
9041c2e3 3397 if (!(SvFLAGS(ret)
faf82a0b
AE
3398 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3399 | SVs_GMG)))
14befaf4
DM
3400 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3401 PERL_MAGIC_qr,0,0);
0f5d15d6 3402 PL_regsize = osize;
0f5d15d6 3403 }
4aabdb9b 3404 }
aa283a38
DM
3405
3406 /* run the pattern returned from (??{...}) */
40a82448
DM
3407 ST.cp = regcppush(0); /* Save *all* the positions. */
3408 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3409 *PL_reglastparen = 0;
3410 *PL_reglastcloseparen = 0;
4aabdb9b 3411 PL_reginput = locinput;
4aabdb9b
DM
3412
3413 /* XXXX This is too dramatic a measure... */
3414 PL_reg_maxiter = 0;
3415
5d9a96ca 3416 st->logical = 0;
faec1544
DM
3417 ST.toggle_reg_flags = PL_reg_flags;
3418 if (re->reganch & ROPT_UTF8)
3419 PL_reg_flags |= RF_utf8;
3420 else
3421 PL_reg_flags &= ~RF_utf8;
3422 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3423
40a82448 3424 ST.prev_rex = rex;
faec1544 3425 ST.prev_curlyx = cur_curlyx;
aa283a38 3426 rex = re;
faec1544 3427 cur_curlyx = NULL;
40a82448 3428 ST.B = next;
faec1544
DM
3429 ST.prev_eval = cur_eval;
3430 cur_eval = st;
3431
1de06328
YO
3432 DEBUG_EXECUTE_r(
3433 debug_start_match(re, do_utf8, locinput, PL_regeol,
3434 "Matching embedded");
3435 );
faec1544
DM
3436 /* now continue from first node in postoned RE */
3437 PUSH_YES_STATE_GOTO(EVAL_AB, re->program + 1);
4aabdb9b 3438 /* NOTREACHED */
a0ed51b3 3439 }
4aabdb9b 3440 /* /(?(?{...})X|Y)/ */
f9f4320a 3441 st->sw = (bool)SvTRUE(ret);
4aabdb9b 3442 st->logical = 0;
c277df42
IZ
3443 break;
3444 }
40a82448 3445
faec1544
DM
3446 case EVAL_AB: /* cleanup after a successful (??{A})B */
3447 /* note: this is called twice; first after popping B, then A */
3448 PL_reg_flags ^= ST.toggle_reg_flags;
40a82448
DM
3449 ReREFCNT_dec(rex);
3450 rex = ST.prev_rex;
faec1544
DM
3451 regcpblow(ST.cp);
3452 cur_eval = ST.prev_eval;
3453 cur_curlyx = ST.prev_curlyx;
40a82448
DM
3454 /* XXXX This is too dramatic a measure... */
3455 PL_reg_maxiter = 0;
faec1544 3456 sayYES_FINAL;
40a82448 3457
40a82448 3458
faec1544
DM
3459 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3460 /* note: this is called twice; first after popping B, then A */
3461 PL_reg_flags ^= ST.toggle_reg_flags;
3462 ReREFCNT_dec(rex);
3463 rex = ST.prev_rex;
40a82448
DM
3464 PL_reginput = locinput;
3465 REGCP_UNWIND(ST.lastcp);
3466 regcppop(rex);
faec1544
DM
3467 cur_eval = ST.prev_eval;
3468 cur_curlyx = ST.prev_curlyx;
3469 /* XXXX This is too dramatic a measure... */
3470 PL_reg_maxiter = 0;
40a82448
DM
3471 sayNO_SILENT;
3472
3473#undef ST
3474
a0d0e21e 3475 case OPEN:
c277df42 3476 n = ARG(scan); /* which paren pair */
3280af22
NIS
3477 PL_reg_start_tmp[n] = locinput;
3478 if (n > PL_regsize)
3479 PL_regsize = n;
a0d0e21e
LW
3480 break;
3481 case CLOSE:
c277df42 3482 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3483 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3484 PL_regendp[n] = locinput - PL_bostr;
eb160463 3485 if (n > (I32)*PL_reglastparen)
3280af22 3486 *PL_reglastparen = n;
a01268b5 3487 *PL_reglastcloseparen = n;
a0d0e21e 3488 break;
c277df42
IZ
3489 case GROUPP:
3490 n = ARG(scan); /* which paren pair */
f9f4320a 3491 st->sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3492 break;
3493 case IFTHEN:
2c2d71f5 3494 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3495 if (st->sw)
c277df42
IZ
3496 next = NEXTOPER(NEXTOPER(scan));
3497 else {
3498 next = scan + ARG(scan);
3499 if (OP(next) == IFTHEN) /* Fake one. */
3500 next = NEXTOPER(NEXTOPER(next));
3501 }
3502 break;
3503 case LOGICAL:
5d9a96ca 3504 st->logical = scan->flags;
c277df42 3505 break;
2ab05381 3506/*******************************************************************
b8591aee 3507 cur_curlyx points to the regmatch_state associated with the most recent CURLYX.
a0374537 3508 This struct contains info about the innermost (...)* loop (an
b8591aee 3509 "infoblock"), and a pointer to the next outer cur_curlyx.
2ab05381
IZ
3510
3511 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3512
95b24440 3513 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3514
b8591aee 3515 2) This regnode populates cur_curlyx, and calls regmatch() recursively
95b24440 3516 with the starting point at WHILEM node;
2ab05381
IZ
3517
3518 3) Each hit of WHILEM node tries to match A and Z (in the order
3519 depending on the current iteration, min/max of {min,max} and
3520 greediness). The information about where are nodes for "A"
b8591aee 3521 and "Z" is read from cur_curlyx, as is info on how many times "A"
2ab05381
IZ
3522 was already matched, and greediness.
3523
3524 4) After A matches, the same WHILEM node is hit again.
3525
b8591aee 3526 5) Each time WHILEM is hit, cur_curlyx is the infoblock created by CURLYX
2ab05381 3527 of the same pair. Thus when WHILEM tries to match Z, it temporarily
b8591aee 3528 resets cur_curlyx, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3529 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3530 of the external loop.
3531
b8591aee 3532 Currently present infoblocks form a tree with a stem formed by cur_curlyx
2ab05381
IZ
3533 and whatever it mentions via ->next, and additional attached trees
3534 corresponding to temporarily unset infoblocks as in "5" above.
3535
95b24440 3536 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3537 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3538 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3539 infoblocks are drawn below the "reset" infoblock.
3540
3541 In fact in the picture below we do not show failed matches for Z and T
3542 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3543 more obvious *why* one needs to *temporary* unset infoblocks.]
3544
3545 Matched REx position InfoBlocks Comment
3546 (Y(A)*?Z)*?T x
3547 Y(A)*?Z)*?T x <- O
3548 Y (A)*?Z)*?T x <- O
3549 Y A)*?Z)*?T x <- O <- I
3550 YA )*?Z)*?T x <- O <- I
3551 YA A)*?Z)*?T x <- O <- I
3552 YAA )*?Z)*?T x <- O <- I
3553 YAA Z)*?T x <- O # Temporary unset I
3554 I
3555
3556 YAAZ Y(A)*?Z)*?T x <- O
3557 I
3558
3559 YAAZY (A)*?Z)*?T x <- O
3560 I
3561
3562 YAAZY A)*?Z)*?T x <- O <- I
3563 I
3564
3565 YAAZYA )*?Z)*?T x <- O <- I
3566 I
3567
3568 YAAZYA Z)*?T x <- O # Temporary unset I
3569 I,I
3570
3571 YAAZYAZ )*?T x <- O
3572 I,I
3573
3574 YAAZYAZ T x # Temporary unset O
3575 O
3576 I,I
3577
3578 YAAZYAZT x
3579 O
3580 I,I
3581 *******************************************************************/
95b24440 3582
a0d0e21e 3583 case CURLYX: {
cb434fcc 3584 /* No need to save/restore up to this paren */
10edeb5d
JH
3585 parenfloor = scan->flags;
3586
c2b7afd3
NC
3587 /* Dave says:
3588
3589 CURLYX and WHILEM are always paired: they're the moral
3590 equivalent of pp_enteriter anbd pp_iter.
3591
3592 The only time next could be null is if the node tree is
3593 corrupt. This was mentioned on p5p a few days ago.
3594
3595 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3596 So we'll assert that this is true:
3597 */
3598 assert(next);
30b2893d 3599 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3600 next += ARG(next);
cb434fcc
IZ
3601 /* XXXX Probably it is better to teach regpush to support
3602 parenfloor > PL_regsize... */
eb160463 3603 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3604 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3605
d8319b27 3606 st->u.curlyx.cp = PL_savestack_ix;
b8591aee
DM
3607 st->u.curlyx.outercc = cur_curlyx;
3608 cur_curlyx = st;
a0374537
DM
3609 /* these fields contain the state of the current curly.
3610 * they are accessed by subsequent WHILEMs;
3611 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3612 st->u.curlyx.parenfloor = parenfloor;
3613 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3614 st->u.curlyx.min = ARG1(scan);
3615 st->u.curlyx.max = ARG2(scan);
3616 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3617 st->u.curlyx.lastloc = 0;
a0374537
DM
3618 /* st->next and st->minmod are also read by WHILEM */
3619
3280af22 3620 PL_reginput = locinput;
95b24440
DM
3621 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3622 /*** all unsaved local vars undefined at this point */
d8319b27 3623 regcpblow(st->u.curlyx.cp);
b8591aee 3624 cur_curlyx = st->u.curlyx.outercc;
95b24440 3625 saySAME(result);
a0d0e21e 3626 }
5f66b61c 3627 /* NOTREACHED */
a0d0e21e
LW
3628 case WHILEM: {
3629 /*
3630 * This is really hard to understand, because after we match
3631 * what we're trying to match, we must make sure the rest of
2c2d71f5 3632 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3633 * to go back UP the parse tree by recursing ever deeper. And
3634 * if it fails, we have to reset our parent's current state
3635 * that we can try again after backing off.
3636 */
3637
c2b7afd3
NC
3638 /* Dave says:
3639
b8591aee
DM
3640 cur_curlyx gets initialised by CURLYX ready for use by WHILEM.
3641 So again, unless somethings been corrupted, cur_curlyx cannot
c2b7afd3
NC
3642 be null at that point in WHILEM.
3643
3644 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3645 So we'll assert that this is true:
3646 */
b8591aee
DM
3647 assert(cur_curlyx);
3648 st->u.whilem.lastloc = cur_curlyx->u.curlyx.lastloc; /* Detection of 0-len. */
d8319b27
DM
3649 st->u.whilem.cache_offset = 0;
3650 st->u.whilem.cache_bit = 0;
c277df42 3651
b8591aee 3652 n = cur_curlyx->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3653 PL_reginput = locinput;
a0d0e21e 3654
a3621e74 3655 DEBUG_EXECUTE_r(
9041c2e3 3656 PerlIO_printf(Perl_debug_log,
91f3b821 3657 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
5bc10b2c 3658 REPORT_CODE_OFF+depth*2, "",
b8591aee
DM
3659 (long)n, (long)cur_curlyx->u.curlyx.min,
3660 (long)cur_curlyx->u.curlyx.max,
3661 PTR2UV(cur_curlyx))
c277df42 3662 );
4633a7c4 3663
a0d0e21e
LW
3664 /* If degenerate scan matches "", assume scan done. */
3665
b8591aee
DM
3666 if (locinput == cur_curlyx->u.curlyx.lastloc && n >=
3667 cur_curlyx->u.curlyx.min)
3668 {
3669 st->u.whilem.savecc = cur_curlyx;
3670 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3671 if (cur_curlyx)
3672 st->ln = cur_curlyx->u.curlyx.cur;
a3621e74 3673 DEBUG_EXECUTE_r(
c3464db5
DD
3674 PerlIO_printf(Perl_debug_log,
3675 "%*s empty match detected, try continuation...\n",
5bc10b2c 3676 REPORT_CODE_OFF+depth*2, "")
c277df42 3677 );
d8319b27 3678 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3679 /*** all unsaved local vars undefined at this point */
b8591aee 3680 cur_curlyx = st->u.whilem.savecc;
95b24440 3681 if (result)
4633a7c4 3682 sayYES;
b8591aee
DM
3683 if (cur_curlyx->u.curlyx.outercc)
3684 cur_curlyx->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3685 sayNO;
a0d0e21e
LW
3686 }
3687
3688 /* First just match a string of min scans. */
3689
b8591aee
DM
3690 if (n < cur_curlyx->u.curlyx.min) {
3691 cur_curlyx->u.curlyx.cur = n;
3692 cur_curlyx->u.curlyx.lastloc = locinput;
3693 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM2);
95b24440
DM
3694 /*** all unsaved local vars undefined at this point */
3695 if (result)
4633a7c4 3696 sayYES;
b8591aee
DM
3697 cur_curlyx->u.curlyx.cur = n - 1;
3698 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3699 sayNO;
a0d0e21e
LW
3700 }
3701
2c2d71f5
JH
3702 if (scan->flags) {
3703 /* Check whether we already were at this position.
3704 Postpone detection until we know the match is not
3705 *that* much linear. */
3706 if (!PL_reg_maxiter) {
3707 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3708 /* possible overflow for long strings and many CURLYX's */
3709 if (PL_reg_maxiter < 0)
3710 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3711 PL_reg_leftiter = PL_reg_maxiter;
3712 }
3713 if (PL_reg_leftiter-- == 0) {
3298f257 3714 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3715 if (PL_reg_poscache) {
eb160463 3716 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3717 Renew(PL_reg_poscache, size, char);
3718 PL_reg_poscache_size = size;
3719 }
3720 Zero(PL_reg_poscache, size, char);
3721 }
3722 else {
3723 PL_reg_poscache_size = size;
a02a5408 3724 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3725 }
a3621e74 3726 DEBUG_EXECUTE_r(
2c2d71f5
JH
3727 PerlIO_printf(Perl_debug_log,
3728 "%sDetected a super-linear match, switching on caching%s...\n",
3729 PL_colors[4], PL_colors[5])
3730 );
3731 }
3732 if (PL_reg_leftiter < 0) {
d8319b27 3733 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3734
3298f257 3735 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
d8319b27
DM
3736 + st->u.whilem.cache_offset * (scan->flags>>4);
3737 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3738 st->u.whilem.cache_offset /= 8;
3739 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3740 DEBUG_EXECUTE_r(
2c2d71f5
JH
3741 PerlIO_printf(Perl_debug_log,
3742 "%*s already tried at this position...\n",
5bc10b2c 3743 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 3744 );
3298f257 3745 sayNO; /* cache records failure */
2c2d71f5 3746 }
2c2d71f5
JH
3747 }
3748 }
3749
a0d0e21e
LW
3750 /* Prefer next over scan for minimal matching. */
3751
b8591aee
DM
3752 if (cur_curlyx->minmod) {
3753 st->u.whilem.savecc = cur_curlyx;
3754 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3755 if (cur_curlyx)
3756 st->ln = cur_curlyx->u.curlyx.cur;
d8319b27
DM
3757 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3758 REGCP_SET(st->u.whilem.lastcp);
3759 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3760 /*** all unsaved local vars undefined at this point */
b8591aee 3761 cur_curlyx = st->u.whilem.savecc;
95b24440 3762 if (result) {
d8319b27 3763 regcpblow(st->u.whilem.cp);
3298f257 3764 sayYES; /* All done. */
5f05dabc 3765 }
d8319b27 3766 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3767 regcppop(rex);
b8591aee
DM
3768 if (cur_curlyx->u.curlyx.outercc)
3769 cur_curlyx->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3770
b8591aee 3771 if (n >= cur_curlyx->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3772 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3773 && !(PL_reg_flags & RF_warned)) {
3774 PL_reg_flags |= RF_warned;
9014280d 3775 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3776 "Complex regular subexpression recursion",
3777 REG_INFTY - 1);
c277df42 3778 }
3ab3c9b4 3779 CACHEsayNO;
c277df42 3780 }
a687059c 3781
a3621e74 3782 DEBUG_EXECUTE_r(
c3464db5
DD
3783 PerlIO_printf(Perl_debug_log,
3784 "%*s trying longer...\n",
5bc10b2c 3785 REPORT_CODE_OFF+depth*2, "")
c277df42 3786 );
a0d0e21e 3787 /* Try scanning more and see if it helps. */
3280af22 3788 PL_reginput = locinput;
b8591aee
DM
3789 cur_curlyx->u.curlyx.cur = n;
3790 cur_curlyx->u.curlyx.lastloc = locinput;
3791 st->u.whilem.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
d8319b27 3792 REGCP_SET(st->u.whilem.lastcp);
b8591aee 3793 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM4);
95b24440
DM
3794 /*** all unsaved local vars undefined at this point */
3795 if (result) {
d8319b27 3796 regcpblow(st->u.whilem.cp);
3298f257 3797 sayYES;
5f05dabc 3798 }
d8319b27 3799 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3800 regcppop(rex);
b8591aee
DM
3801 cur_curlyx->u.curlyx.cur = n - 1;
3802 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3803 CACHEsayNO;
a0d0e21e
LW
3804 }
3805
3806 /* Prefer scan over next for maximal matching. */
3807
b8591aee
DM
3808 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3809 st->u.whilem.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3810 cur_curlyx->u.curlyx.cur = n;
3811 cur_curlyx->u.curlyx.lastloc = locinput;
d8319b27 3812 REGCP_SET(st->u.whilem.lastcp);
b8591aee 3813 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM5);
95b24440
DM
3814 /*** all unsaved local vars undefined at this point */
3815 if (result) {
d8319b27 3816 regcpblow(st->u.whilem.cp);
3298f257 3817 sayYES;
5f05dabc 3818 }
d8319b27 3819 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3820 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3821 PL_reginput = locinput;
a3621e74 3822 DEBUG_EXECUTE_r(
c3464db5
DD
3823 PerlIO_printf(Perl_debug_log,
3824 "%*s failed, try continuation...\n",
5bc10b2c 3825 REPORT_CODE_OFF+depth*2, "")
c277df42
IZ
3826 );
3827 }
9041c2e3 3828 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3829 && !(PL_reg_flags & RF_warned)) {
3280af22 3830 PL_reg_flags |= RF_warned;
9014280d 3831 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3832 "Complex regular subexpression recursion",
3833 REG_INFTY - 1);
a0d0e21e
LW
3834 }
3835
3836 /* Failed deeper matches of scan, so see if this one works. */
b8591aee
DM
3837 st->u.whilem.savecc = cur_curlyx;
3838 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3839 if (cur_curlyx)
3840 st->ln = cur_curlyx->u.curlyx.cur;
d8319b27 3841 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3842 /*** all unsaved local vars undefined at this point */
b8591aee 3843 cur_curlyx = st->u.whilem.savecc;
95b24440 3844 if (result)
3298f257 3845 sayYES;
b8591aee
DM
3846 if (cur_curlyx->u.curlyx.outercc)
3847 cur_curlyx->u.curlyx.outercc->u.curlyx.cur = st->ln;
3848 cur_curlyx->u.curlyx.cur = n - 1;
3849 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3850 CACHEsayNO;
a0d0e21e 3851 }
5f66b61c 3852 /* NOTREACHED */
40a82448
DM
3853
3854#undef ST
3855#define ST st->u.branch
3856
3857 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3858 next = scan + ARG(scan);
3859 if (next == scan)
3860 next = NULL;
40a82448
DM
3861 scan = NEXTOPER(scan);
3862 /* FALL THROUGH */
c277df42 3863
40a82448
DM
3864 case BRANCH: /* /(...|A|...)/ */
3865 scan = NEXTOPER(scan); /* scan now points to inner node */
3866 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3867 /* last branch; skip state push and jump direct to node */
3868 continue;
3869 ST.lastparen = *PL_reglastparen;
3870 ST.next_branch = next;
3871 REGCP_SET(ST.cp);
3872 PL_reginput = locinput;
02db2b7b 3873
40a82448
DM
3874 /* Now go into the branch */
3875 PUSH_STATE_GOTO(BRANCH_next, scan);
3876 /* NOTREACHED */
3877
3878 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3879 REGCP_UNWIND(ST.cp);
3880 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3881 PL_regendp[n] = -1;
3882 *PL_reglastparen = n;
3883 scan = ST.next_branch;
3884 /* no more branches? */
3885 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3886 sayNO;
3887 continue; /* execute next BRANCH[J] op */
3888 /* NOTREACHED */
3889
a0d0e21e 3890 case MINMOD:
5d9a96ca 3891 st->minmod = 1;
a0d0e21e 3892 break;
40a82448
DM
3893
3894#undef ST
3895#define ST st->u.curlym
3896
3897 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3898
3899 /* This is an optimisation of CURLYX that enables us to push
3900 * only a single backtracking state, no matter now many matches
3901 * there are in {m,n}. It relies on the pattern being constant
3902 * length, with no parens to influence future backrefs
3903 */
3904
3905 ST.me = scan;
dc45a647 3906 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
3907
3908 /* if paren positive, emulate an OPEN/CLOSE around A */
3909 if (ST.me->flags) {
3910 I32 paren = ST.me->flags;
3911 if (paren > PL_regsize)
3912 PL_regsize = paren;
3913 if (paren > (I32)*PL_reglastparen)
3914 *PL_reglastparen = paren;
c277df42 3915 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 3916 }
40a82448
DM
3917 ST.A = scan;
3918 ST.B = next;
3919 ST.alen = 0;
3920 ST.count = 0;
3921 ST.minmod = st->minmod;
3922 st->minmod = 0;
3923 ST.c1 = CHRTEST_UNINIT;
3924 REGCP_SET(ST.cp);
6407bf3b 3925
40a82448
DM
3926 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3927 goto curlym_do_B;
3928
3929 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 3930 PL_reginput = locinput;
40a82448
DM
3931 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3932 /* NOTREACHED */
5f80c4cf 3933
40a82448
DM
3934 case CURLYM_A: /* we've just matched an A */
3935 locinput = st->locinput;
3936 nextchr = UCHARAT(locinput);
3937
3938 ST.count++;
3939 /* after first match, determine A's length: u.curlym.alen */
3940 if (ST.count == 1) {
3941 if (PL_reg_match_utf8) {
3942 char *s = locinput;
3943 while (s < PL_reginput) {
3944 ST.alen++;
3945 s += UTF8SKIP(s);
3946 }
3947 }
3948 else {
3949 ST.alen = PL_reginput - locinput;
3950 }
3951 if (ST.alen == 0)
3952 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3953 }
0cadcf80
DM
3954 DEBUG_EXECUTE_r(
3955 PerlIO_printf(Perl_debug_log,
40a82448 3956 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 3957 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 3958 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
3959 );
3960
40a82448
DM
3961 locinput = PL_reginput;
3962 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3963 goto curlym_do_A; /* try to match another A */
3964 goto curlym_do_B; /* try to match B */
5f80c4cf 3965
40a82448
DM
3966 case CURLYM_A_fail: /* just failed to match an A */
3967 REGCP_UNWIND(ST.cp);
3968 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3969 sayNO;
0cadcf80 3970
40a82448
DM
3971 curlym_do_B: /* execute the B in /A{m,n}B/ */
3972 PL_reginput = locinput;
3973 if (ST.c1 == CHRTEST_UNINIT) {
3974 /* calculate c1 and c2 for possible match of 1st char
3975 * following curly */
3976 ST.c1 = ST.c2 = CHRTEST_VOID;
3977 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3978 regnode *text_node = ST.B;
3979 if (! HAS_TEXT(text_node))
3980 FIND_NEXT_IMPT(text_node);
3981 if (HAS_TEXT(text_node)
3982 && PL_regkind[OP(text_node)] != REF)
3983 {
3984 ST.c1 = (U8)*STRING(text_node);
3985 ST.c2 =
3986 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3987 ? PL_fold[ST.c1]
3988 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3989 ? PL_fold_locale[ST.c1]
3990 : ST.c1;
c277df42 3991 }
c277df42 3992 }
40a82448
DM
3993 }
3994
3995 DEBUG_EXECUTE_r(
3996 PerlIO_printf(Perl_debug_log,
3997 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 3998 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
3999 "", (IV)ST.count)
4000 );
4001 if (ST.c1 != CHRTEST_VOID
4002 && UCHARAT(PL_reginput) != ST.c1
4003 && UCHARAT(PL_reginput) != ST.c2)
4004 {
4005 /* simulate B failing */
4006 state_num = CURLYM_B_fail;
4007 goto reenter_switch;
4008 }
4009
4010 if (ST.me->flags) {
4011 /* mark current A as captured */
4012 I32 paren = ST.me->flags;
4013 if (ST.count) {
4014 PL_regstartp[paren]
4015 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4016 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 4017 }
40a82448
DM
4018 else
4019 PL_regendp[paren] = -1;
c277df42 4020 }
40a82448 4021 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4022 /* NOTREACHED */
40a82448
DM
4023
4024 case CURLYM_B_fail: /* just failed to match a B */
4025 REGCP_UNWIND(ST.cp);
4026 if (ST.minmod) {
4027 if (ST.count == ARG2(ST.me) /* max */)
4028 sayNO;
4029 goto curlym_do_A; /* try to match a further A */
4030 }
4031 /* backtrack one A */
4032 if (ST.count == ARG1(ST.me) /* min */)
4033 sayNO;
4034 ST.count--;
4035 locinput = HOPc(locinput, -ST.alen);
4036 goto curlym_do_B; /* try to match B */
4037
c255a977
DM
4038#undef ST
4039#define ST st->u.curly
40a82448 4040
c255a977
DM
4041#define CURLY_SETPAREN(paren, success) \
4042 if (paren) { \
4043 if (success) { \
4044 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4045 PL_regendp[paren] = locinput - PL_bostr; \
4046 } \
4047 else \
4048 PL_regendp[paren] = -1; \
4049 }
4050
4051 case STAR: /* /A*B/ where A is width 1 */
4052 ST.paren = 0;
4053 ST.min = 0;
4054 ST.max = REG_INFTY;
a0d0e21e
LW
4055 scan = NEXTOPER(scan);
4056 goto repeat;
c255a977
DM
4057 case PLUS: /* /A+B/ where A is width 1 */
4058 ST.paren = 0;
4059 ST.min = 1;
4060 ST.max = REG_INFTY;
c277df42 4061 scan = NEXTOPER(scan);
c255a977
DM
4062 goto repeat;
4063 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4064 ST.paren = scan->flags; /* Which paren to set */
4065 if (ST.paren > PL_regsize)
4066 PL_regsize = ST.paren;
4067 if (ST.paren > (I32)*PL_reglastparen)
4068 *PL_reglastparen = ST.paren;
4069 ST.min = ARG1(scan); /* min to match */
4070 ST.max = ARG2(scan); /* max to match */
4071 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4072 goto repeat;
4073 case CURLY: /* /A{m,n}B/ where A is width 1 */
4074 ST.paren = 0;
4075 ST.min = ARG1(scan); /* min to match */
4076 ST.max = ARG2(scan); /* max to match */
4077 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4078 repeat:
a0d0e21e
LW
4079 /*
4080 * Lookahead to avoid useless match attempts
4081 * when we know what character comes next.
c255a977 4082 *
5f80c4cf
JP
4083 * Used to only do .*x and .*?x, but now it allows
4084 * for )'s, ('s and (?{ ... })'s to be in the way
4085 * of the quantifier and the EXACT-like node. -- japhy
4086 */
4087
c255a977
DM
4088 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4089 sayNO;
cca55fe3 4090 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4091 U8 *s;
4092 regnode *text_node = next;
4093
3dab1dad
YO
4094 if (! HAS_TEXT(text_node))
4095 FIND_NEXT_IMPT(text_node);
5f80c4cf 4096
9e137952 4097 if (! HAS_TEXT(text_node))
c255a977 4098 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4099 else {
3dab1dad 4100 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4101 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4102 goto assume_ok_easy;
cca55fe3 4103 }
be8e71aa
YO
4104 else
4105 s = (U8*)STRING(text_node);
5f80c4cf
JP
4106
4107 if (!UTF) {
c255a977 4108 ST.c2 = ST.c1 = *s;
f65d3ee7 4109 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4110 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4111 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4112 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4113 }
5f80c4cf 4114 else { /* UTF */
f65d3ee7 4115 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4116 STRLEN ulen1, ulen2;
89ebb4a3
JH
4117 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4118 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4119
4120 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4121 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4122#ifdef EBCDIC
4123 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4124 ckWARN(WARN_UTF8) ?
4125 0 : UTF8_ALLOW_ANY);
4126 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4127 ckWARN(WARN_UTF8) ?
4128 0 : UTF8_ALLOW_ANY);
4129#else
c255a977 4130 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4131 uniflags);
c255a977 4132 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4133 uniflags);
4134#endif
5f80c4cf
JP
4135 }
4136 else {
c255a977 4137 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4138 uniflags);
5f80c4cf 4139 }
1aa99e6b
IH
4140 }
4141 }
bbce6d69 4142 }
a0d0e21e 4143 else
c255a977 4144 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4145 assume_ok_easy:
c255a977
DM
4146
4147 ST.A = scan;
4148 ST.B = next;
3280af22 4149 PL_reginput = locinput;
5d9a96ca
DM
4150 if (st->minmod) {
4151 st->minmod = 0;
c255a977 4152 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4153 sayNO;
c255a977 4154 ST.count = ST.min;
a0ed51b3 4155 locinput = PL_reginput;
c255a977
DM
4156 REGCP_SET(ST.cp);
4157 if (ST.c1 == CHRTEST_VOID)
4158 goto curly_try_B_min;
4159
4160 ST.oldloc = locinput;
4161
4162 /* set ST.maxpos to the furthest point along the
4163 * string that could possibly match */
4164 if (ST.max == REG_INFTY) {
4165 ST.maxpos = PL_regeol - 1;
4166 if (do_utf8)
4167 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4168 ST.maxpos--;
4169 }
4170 else if (do_utf8) {
4171 int m = ST.max - ST.min;
4172 for (ST.maxpos = locinput;
4173 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4174 ST.maxpos += UTF8SKIP(ST.maxpos);
4175 }
4176 else {
4177 ST.maxpos = locinput + ST.max - ST.min;
4178 if (ST.maxpos >= PL_regeol)
4179 ST.maxpos = PL_regeol - 1;
4180 }
4181 goto curly_try_B_min_known;
4182
4183 }
4184 else {
4185 ST.count = regrepeat(rex, ST.A, ST.max);
4186 locinput = PL_reginput;
4187 if (ST.count < ST.min)
4188 sayNO;
4189 if ((ST.count > ST.min)
4190 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4191 {
4192 /* A{m,n} must come at the end of the string, there's
4193 * no point in backing off ... */
4194 ST.min = ST.count;
4195 /* ...except that $ and \Z can match before *and* after
4196 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4197 We may back off by one in this case. */
4198 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4199 ST.min--;
4200 }
4201 REGCP_SET(ST.cp);
4202 goto curly_try_B_max;
4203 }
4204 /* NOTREACHED */
4205
4206
4207 case CURLY_B_min_known_fail:
4208 /* failed to find B in a non-greedy match where c1,c2 valid */
4209 if (ST.paren && ST.count)
4210 PL_regendp[ST.paren] = -1;
4211
4212 PL_reginput = locinput; /* Could be reset... */
4213 REGCP_UNWIND(ST.cp);
4214 /* Couldn't or didn't -- move forward. */
4215 ST.oldloc = locinput;
4216 if (do_utf8)
4217 locinput += UTF8SKIP(locinput);
4218 else
4219 locinput++;
4220 ST.count++;
4221 curly_try_B_min_known:
4222 /* find the next place where 'B' could work, then call B */
4223 {
4224 int n;
4225 if (do_utf8) {
4226 n = (ST.oldloc == locinput) ? 0 : 1;
4227 if (ST.c1 == ST.c2) {
4228 STRLEN len;
4229 /* set n to utf8_distance(oldloc, locinput) */
4230 while (locinput <= ST.maxpos &&
4231 utf8n_to_uvchr((U8*)locinput,
4232 UTF8_MAXBYTES, &len,
4233 uniflags) != (UV)ST.c1) {
4234 locinput += len;
4235 n++;
4236 }
1aa99e6b
IH
4237 }
4238 else {
c255a977
DM
4239 /* set n to utf8_distance(oldloc, locinput) */
4240 while (locinput <= ST.maxpos) {
4241 STRLEN len;
4242 const UV c = utf8n_to_uvchr((U8*)locinput,
4243 UTF8_MAXBYTES, &len,
4244 uniflags);
4245 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4246 break;
4247 locinput += len;
4248 n++;
1aa99e6b 4249 }
0fe9bf95
IZ
4250 }
4251 }
c255a977
DM
4252 else {
4253 if (ST.c1 == ST.c2) {
4254 while (locinput <= ST.maxpos &&
4255 UCHARAT(locinput) != ST.c1)
4256 locinput++;
bbce6d69 4257 }
c255a977
DM
4258 else {
4259 while (locinput <= ST.maxpos
4260 && UCHARAT(locinput) != ST.c1
4261 && UCHARAT(locinput) != ST.c2)
4262 locinput++;
a0ed51b3 4263 }
c255a977
DM
4264 n = locinput - ST.oldloc;
4265 }
4266 if (locinput > ST.maxpos)
4267 sayNO;
4268 /* PL_reginput == oldloc now */
4269 if (n) {
4270 ST.count += n;
4271 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4272 sayNO;
a0d0e21e 4273 }
c255a977
DM
4274 PL_reginput = locinput;
4275 CURLY_SETPAREN(ST.paren, ST.count);
4276 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4277 }
c255a977
DM
4278 /* NOTREACHED */
4279
4280
4281 case CURLY_B_min_fail:
4282 /* failed to find B in a non-greedy match where c1,c2 invalid */
4283 if (ST.paren && ST.count)
4284 PL_regendp[ST.paren] = -1;
4285
4286 REGCP_UNWIND(ST.cp);
4287 /* failed -- move forward one */
4288 PL_reginput = locinput;
4289 if (regrepeat(rex, ST.A, 1)) {
4290 ST.count++;
a0ed51b3 4291 locinput = PL_reginput;
c255a977
DM
4292 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4293 ST.count > 0)) /* count overflow ? */
15272685 4294 {
c255a977
DM
4295 curly_try_B_min:
4296 CURLY_SETPAREN(ST.paren, ST.count);
4297 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4298 }
4299 }
4633a7c4 4300 sayNO;
c255a977
DM
4301 /* NOTREACHED */
4302
4303
4304 curly_try_B_max:
4305 /* a successful greedy match: now try to match B */
4306 {
4307 UV c = 0;
4308 if (ST.c1 != CHRTEST_VOID)
4309 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4310 UTF8_MAXBYTES, 0, uniflags)
466787eb 4311 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4312 /* If it could work, try it. */
4313 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4314 CURLY_SETPAREN(ST.paren, ST.count);
4315 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4316 /* NOTREACHED */
4317 }
4318 }
4319 /* FALL THROUGH */
4320 case CURLY_B_max_fail:
4321 /* failed to find B in a greedy match */
4322 if (ST.paren && ST.count)
4323 PL_regendp[ST.paren] = -1;
4324
4325 REGCP_UNWIND(ST.cp);
4326 /* back up. */
4327 if (--ST.count < ST.min)
4328 sayNO;
4329 PL_reginput = locinput = HOPc(locinput, -1);
4330 goto curly_try_B_max;
4331
4332#undef ST
4333
4334
a0d0e21e 4335 case END:
faec1544
DM
4336 if (cur_eval) {
4337 /* we've just finished A in /(??{A})B/; now continue with B */
4338 I32 tmpix;
4339
4340
4341 st->u.eval.toggle_reg_flags
4342 = cur_eval->u.eval.toggle_reg_flags;
4343 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4344
4345 st->u.eval.prev_rex = rex; /* inner */
4346 rex = cur_eval->u.eval.prev_rex; /* outer */
4347 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4348 ReREFCNT_inc(rex);
4349 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4350 REGCP_SET(st->u.eval.lastcp);
4351 PL_reginput = locinput;
4352
4353 /* Restore parens of the outer rex without popping the
4354 * savestack */
4355 tmpix = PL_savestack_ix;
4356 PL_savestack_ix = cur_eval->u.eval.lastcp;
4357 regcppop(rex);
4358 PL_savestack_ix = tmpix;
4359
4360 st->u.eval.prev_eval = cur_eval;
4361 cur_eval = cur_eval->u.eval.prev_eval;
4362 DEBUG_EXECUTE_r(
4363 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ...\n",
5bc10b2c 4364 REPORT_CODE_OFF+depth*2, ""););
faec1544
DM
4365 PUSH_YES_STATE_GOTO(EVAL_AB,
4366 st->u.eval.prev_eval->u.eval.B); /* match B */
4367 }
4368
3b0527fe 4369 if (locinput < reginfo->till) {
a3621e74 4370 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4371 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4372 PL_colors[4],
4373 (long)(locinput - PL_reg_starttry),
3b0527fe 4374 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4375 PL_colors[5]));
4376 sayNO_FINAL; /* Cannot match: too short. */
4377 }
4378 PL_reginput = locinput; /* put where regtry can find it */
4379 sayYES_FINAL; /* Success! */
dad79028
DM
4380
4381 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4382 DEBUG_EXECUTE_r(
4383 PerlIO_printf(Perl_debug_log,
4384 "%*s %ssubpattern success...%s\n",
5bc10b2c 4385 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4386 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4387 sayYES_FINAL; /* Success! */
4388
40a82448
DM
4389#undef ST
4390#define ST st->u.ifmatch
4391
4392 case SUSPEND: /* (?>A) */
4393 ST.wanted = 1;
9fe1d20c 4394 PL_reginput = locinput;
9041c2e3 4395 goto do_ifmatch;
dad79028 4396
40a82448
DM
4397 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4398 ST.wanted = 0;
dad79028
DM
4399 goto ifmatch_trivial_fail_test;
4400
40a82448
DM
4401 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4402 ST.wanted = 1;
dad79028 4403 ifmatch_trivial_fail_test:
a0ed51b3 4404 if (scan->flags) {
52657f30 4405 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4406 if (!s) {
4407 /* trivial fail */
4408 if (st->logical) {
4409 st->logical = 0;
40a82448 4410 st->sw = 1 - (bool)ST.wanted;
dad79028 4411 }
40a82448 4412 else if (ST.wanted)
dad79028
DM
4413 sayNO;
4414 next = scan + ARG(scan);
4415 if (next == scan)
4416 next = NULL;
4417 break;
4418 }
efb30f32 4419 PL_reginput = s;
a0ed51b3
LW
4420 }
4421 else
4422 PL_reginput = locinput;
4423
c277df42 4424 do_ifmatch:
40a82448
DM
4425 ST.me = scan;
4426 /* execute body of (?...A) */
4427 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4428 /* NOTREACHED */
4429
4430 case IFMATCH_A_fail: /* body of (?...A) failed */
4431 ST.wanted = !ST.wanted;
4432 /* FALL THROUGH */
4433
4434 case IFMATCH_A: /* body of (?...A) succeeded */
4435 if (st->logical) {
4436 st->logical = 0;
4437 st->sw = (bool)ST.wanted;
4438 }
4439 else if (!ST.wanted)
4440 sayNO;
4441
4442 if (OP(ST.me) == SUSPEND)
4443 locinput = PL_reginput;
4444 else {
4445 locinput = PL_reginput = st->locinput;
4446 nextchr = UCHARAT(locinput);
4447 }
4448 scan = ST.me + ARG(ST.me);
4449 if (scan == ST.me)
4450 scan = NULL;
4451 continue; /* execute B */
4452
4453#undef ST
dad79028 4454
c277df42 4455 case LONGJMP:
c277df42
IZ
4456 next = scan + ARG(scan);
4457 if (next == scan)
4458 next = NULL;
a0d0e21e
LW
4459 break;
4460 default:
b900a521 4461 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4462 PTR2UV(scan), OP(scan));
cea2e8a9 4463 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4464 }
95b24440 4465
a0d0e21e 4466 scan = next;
95b24440
DM
4467 continue;
4468 /* NOTREACHED */
4469
40a82448
DM
4470 push_yes_state:
4471 /* push a state that backtracks on success */
4472 st->u.yes.prev_yes_state = yes_state;
4473 yes_state = st;
4474 /* FALL THROUGH */
4475 push_state:
4476 /* push a new regex state, then continue at scan */
4477 {
4478 regmatch_state *newst;
4479
5bc10b2c 4480 DEBUG_STATE_pp("push");
40a82448 4481 depth++;
40a82448
DM
4482 st->locinput = locinput;
4483 newst = st+1;
4484 if (newst > SLAB_LAST(PL_regmatch_slab))
4485 newst = S_push_slab(aTHX);
4486 PL_regmatch_state = newst;
40a82448
DM
4487 /* XXX probably don't need to initialise these */
4488 newst->minmod = 0;
4489 newst->sw = 0;
4490 newst->logical = 0;
4491
786e8c11 4492
40a82448
DM
4493 locinput = PL_reginput;
4494 nextchr = UCHARAT(locinput);
4495 st = newst;
4496 continue;
4497 /* NOTREACHED */
4498 }
4499
95b24440
DM
4500 /* simulate recursively calling regmatch(), but without actually
4501 * recursing - ie save the current state on the heap rather than on
4502 * the stack, then re-enter the loop. This avoids complex regexes
4503 * blowing the processor stack */
4504
4505 start_recurse:
4506 {
5d9a96ca
DM
4507 /* push new state */
4508 regmatch_state *oldst = st;
4509
5bc10b2c 4510 DEBUG_STATE_pp("push");
5d9a96ca
DM
4511 depth++;
4512
4513 /* grab the next free state slot */
4514 st++;
86545054 4515 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4516 st = S_push_slab(aTHX);
4517 PL_regmatch_state = st;
4518
4519 oldst->next = next;
4520 oldst->n = n;
4521 oldst->locinput = locinput;
5d9a96ca 4522
95b24440
DM
4523 locinput = PL_reginput;
4524 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4525 st->minmod = 0;
4526 st->sw = 0;
4527 st->logical = 0;
786e8c11 4528
95b24440 4529 }
a0d0e21e 4530 }
a687059c 4531
aa283a38
DM
4532
4533
a0d0e21e
LW
4534 /*
4535 * We get here only if there's trouble -- normally "case END" is
4536 * the terminating point.
4537 */
cea2e8a9 4538 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4539 /*NOTREACHED*/
4633a7c4
LW
4540 sayNO;
4541
7821416a 4542yes_final:
77cb431f
DM
4543
4544 if (yes_state) {
4545 /* we have successfully completed a subexpression, but we must now
4546 * pop to the state marked by yes_state and continue from there */
77cb431f 4547 assert(st != yes_state);
5bc10b2c
DM
4548#ifdef DEBUGGING
4549 while (st != yes_state) {
4550 st--;
4551 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4552 PL_regmatch_slab = PL_regmatch_slab->prev;
4553 st = SLAB_LAST(PL_regmatch_slab);
4554 }
4555 DEBUG_STATE_pp("pop (yes)");
4556 depth--;
4557 }
4558#else
77cb431f
DM
4559 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4560 || yes_state > SLAB_LAST(PL_regmatch_slab))
4561 {
4562 /* not in this slab, pop slab */
4563 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4564 PL_regmatch_slab = PL_regmatch_slab->prev;
4565 st = SLAB_LAST(PL_regmatch_slab);
4566 }
4567 depth -= (st - yes_state);
5bc10b2c 4568#endif
77cb431f
DM
4569 st = yes_state;
4570 yes_state = st->u.yes.prev_yes_state;
4571 PL_regmatch_state = st;
4572
4573 switch (st->resume_state) {
faec1544 4574 case EVAL_AB:
40a82448
DM
4575 case IFMATCH_A:
4576 case CURLYM_A:
40a82448
DM
4577 state_num = st->resume_state;
4578 goto reenter_switch;
4579
4580 case CURLYM_B:
166ba7cd
DM
4581 case BRANCH_next:
4582 case TRIE_next:
c255a977 4583 case CURLY_B_max:
77cb431f 4584 default:
40a82448 4585 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4586 }
4587 }
4588
a3621e74 4589 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4590 PL_colors[4], PL_colors[5]));
4633a7c4 4591yes:
02db2b7b 4592
95b24440 4593 result = 1;
aa283a38 4594 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4595 * will disappear when REGFMATCH goes */
aa283a38
DM
4596 if (depth) {
4597 /* restore previous state and re-enter */
40a82448
DM
4598 st--;
4599 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4600 PL_regmatch_slab = PL_regmatch_slab->prev;
4601 st = SLAB_LAST(PL_regmatch_slab);
4602 }
4603 PL_regmatch_state = st;
4604 scan = st->scan;
4605 next = st->next;
4606 n = st->n;
4607 locinput= st->locinput;
4608 nextchr = UCHARAT(locinput);
aa283a38 4609
5bc10b2c
DM
4610 DEBUG_STATE_pp("pop");
4611 depth--;
4612
aa283a38 4613 switch (st->resume_state) {
aa283a38
DM
4614 case resume_CURLYX:
4615 goto resume_point_CURLYX;
4616 case resume_WHILEM1:
4617 goto resume_point_WHILEM1;
4618 case resume_WHILEM2:
4619 goto resume_point_WHILEM2;
4620 case resume_WHILEM3:
4621 goto resume_point_WHILEM3;
4622 case resume_WHILEM4:
4623 goto resume_point_WHILEM4;
4624 case resume_WHILEM5:
4625 goto resume_point_WHILEM5;
4626 case resume_WHILEM6:
4627 goto resume_point_WHILEM6;
77cb431f 4628
166ba7cd 4629 case TRIE_next:
40a82448
DM
4630 case CURLYM_A:
4631 case CURLYM_B:
faec1544 4632 case EVAL_AB:
40a82448
DM
4633 case IFMATCH_A:
4634 case BRANCH_next:
c255a977
DM
4635 case CURLY_B_max:
4636 case CURLY_B_min:
4637 case CURLY_B_min_known:
40a82448
DM
4638 break;
4639
aa283a38
DM
4640 default:
4641 Perl_croak(aTHX_ "regexp resume memory corruption");
4642 }
4643 }
4644 goto final_exit;
4633a7c4
LW
4645
4646no:
a3621e74 4647 DEBUG_EXECUTE_r(
7821416a 4648 PerlIO_printf(Perl_debug_log,
786e8c11 4649 "%*s %sfailed...%s\n",
5bc10b2c 4650 REPORT_CODE_OFF+depth*2, "",
786e8c11 4651 PL_colors[4], PL_colors[5])
7821416a 4652 );
7821416a
IZ
4653no_final:
4654do_no:
aa283a38 4655
95b24440 4656 result = 0;
5d9a96ca 4657
aa283a38
DM
4658 if (depth) {
4659 /* there's a previous state to backtrack to */
40a82448
DM
4660 st--;
4661 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4662 PL_regmatch_slab = PL_regmatch_slab->prev;
4663 st = SLAB_LAST(PL_regmatch_slab);
4664 }
4665 PL_regmatch_state = st;
4666 scan = st->scan;
4667 next = st->next;
4668 n = st->n;
4669 locinput= st->locinput;
4670 nextchr = UCHARAT(locinput);
4671
5bc10b2c
DM
4672 DEBUG_STATE_pp("pop");
4673 depth--;
4674
5d9a96ca 4675 switch (st->resume_state) {
95b24440
DM
4676 case resume_CURLYX:
4677 goto resume_point_CURLYX;
4678 case resume_WHILEM1:
4679 goto resume_point_WHILEM1;
4680 case resume_WHILEM2:
4681 goto resume_point_WHILEM2;
4682 case resume_WHILEM3:
4683 goto resume_point_WHILEM3;
4684 case resume_WHILEM4:
4685 goto resume_point_WHILEM4;
4686 case resume_WHILEM5:
4687 goto resume_point_WHILEM5;
4688 case resume_WHILEM6:
4689 goto resume_point_WHILEM6;
dad79028 4690
166ba7cd 4691 case TRIE_next:
faec1544 4692 case EVAL_AB:
40a82448
DM
4693 case BRANCH_next:
4694 case CURLYM_A:
4695 case CURLYM_B:
4696 case IFMATCH_A:
c255a977
DM
4697 case CURLY_B_max:
4698 case CURLY_B_min:
4699 case CURLY_B_min_known:
40a82448
DM
4700 if (yes_state == st)
4701 yes_state = st->u.yes.prev_yes_state;
4702 state_num = st->resume_state + 1; /* failure = success + 1 */
4703 goto reenter_switch;
dad79028 4704
95b24440
DM
4705 default:
4706 Perl_croak(aTHX_ "regexp resume memory corruption");
4707 }
95b24440 4708 }
aa283a38
DM
4709
4710final_exit:
4711
5d9a96ca
DM
4712 /* restore original high-water mark */
4713 PL_regmatch_slab = orig_slab;
4714 PL_regmatch_state = orig_state;
4715
4716 /* free all slabs above current one */
4717 if (orig_slab->next) {
c4fd8992 4718 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4719 orig_slab->next = NULL;
4720 while (sl) {
c4fd8992 4721 regmatch_slab * const osl = sl;
5d9a96ca 4722 sl = sl->next;
ad65c075 4723 Safefree(osl);
5d9a96ca
DM
4724 }
4725 }
4726
95b24440
DM
4727 return result;
4728
a687059c
LW
4729}
4730
4731/*
4732 - regrepeat - repeatedly match something simple, report how many
4733 */
4734/*
4735 * [This routine now assumes that it will only match on things of length 1.
4736 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4737 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4738 */
76e3520e 4739STATIC I32
32fc9b6a 4740S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4741{
27da23d5 4742 dVAR;
a0d0e21e 4743 register char *scan;
a0d0e21e 4744 register I32 c;
3280af22 4745 register char *loceol = PL_regeol;
a0ed51b3 4746 register I32 hardcount = 0;
53c4c00c 4747 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4748
3280af22 4749 scan = PL_reginput;
faf11cac
HS
4750 if (max == REG_INFTY)
4751 max = I32_MAX;
4752 else if (max < loceol - scan)
7f596f4c 4753 loceol = scan + max;
a0d0e21e 4754 switch (OP(p)) {
22c35a8c 4755 case REG_ANY:
1aa99e6b 4756 if (do_utf8) {
ffc61ed2 4757 loceol = PL_regeol;
1aa99e6b 4758 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4759 scan += UTF8SKIP(scan);
4760 hardcount++;
4761 }
4762 } else {
4763 while (scan < loceol && *scan != '\n')
4764 scan++;
a0ed51b3
LW
4765 }
4766 break;
ffc61ed2 4767 case SANY:
def8e4ea
JH
4768 if (do_utf8) {
4769 loceol = PL_regeol;
a0804c9e 4770 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4771 scan += UTF8SKIP(scan);
4772 hardcount++;
4773 }
4774 }
4775 else
4776 scan = loceol;
a0ed51b3 4777 break;
f33976b4
DB
4778 case CANY:
4779 scan = loceol;
4780 break;
090f7165
JH
4781 case EXACT: /* length of string is 1 */
4782 c = (U8)*STRING(p);
4783 while (scan < loceol && UCHARAT(scan) == c)
4784 scan++;
bbce6d69 4785 break;
4786 case EXACTF: /* length of string is 1 */
cd439c50 4787 c = (U8)*STRING(p);
bbce6d69 4788 while (scan < loceol &&
22c35a8c 4789 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4790 scan++;
4791 break;
4792 case EXACTFL: /* length of string is 1 */
3280af22 4793 PL_reg_flags |= RF_tainted;
cd439c50 4794 c = (U8)*STRING(p);
bbce6d69 4795 while (scan < loceol &&
22c35a8c 4796 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4797 scan++;
4798 break;
4799 case ANYOF:
ffc61ed2
JH
4800 if (do_utf8) {
4801 loceol = PL_regeol;
cfc92286 4802 while (hardcount < max && scan < loceol &&
32fc9b6a 4803 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4804 scan += UTF8SKIP(scan);
4805 hardcount++;
4806 }
4807 } else {
32fc9b6a 4808 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4809 scan++;
4810 }
a0d0e21e
LW
4811 break;
4812 case ALNUM:
1aa99e6b 4813 if (do_utf8) {
ffc61ed2 4814 loceol = PL_regeol;
1a4fad37 4815 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4816 while (hardcount < max && scan < loceol &&
3568d838 4817 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4818 scan += UTF8SKIP(scan);
4819 hardcount++;
4820 }
4821 } else {
4822 while (scan < loceol && isALNUM(*scan))
4823 scan++;
a0ed51b3
LW
4824 }
4825 break;
bbce6d69 4826 case ALNUML:
3280af22 4827 PL_reg_flags |= RF_tainted;
1aa99e6b 4828 if (do_utf8) {
ffc61ed2 4829 loceol = PL_regeol;
1aa99e6b
IH
4830 while (hardcount < max && scan < loceol &&
4831 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4832 scan += UTF8SKIP(scan);
4833 hardcount++;
4834 }
4835 } else {
4836 while (scan < loceol && isALNUM_LC(*scan))
4837 scan++;
a0ed51b3
LW
4838 }
4839 break;
a0d0e21e 4840 case NALNUM:
1aa99e6b 4841 if (do_utf8) {
ffc61ed2 4842 loceol = PL_regeol;
1a4fad37 4843 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4844 while (hardcount < max && scan < loceol &&
3568d838 4845 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4846 scan += UTF8SKIP(scan);
4847 hardcount++;
4848 }
4849 } else {
4850 while (scan < loceol && !isALNUM(*scan))
4851 scan++;
a0ed51b3
LW
4852 }
4853 break;
bbce6d69 4854 case NALNUML:
3280af22 4855 PL_reg_flags |= RF_tainted;
1aa99e6b 4856 if (do_utf8) {
ffc61ed2 4857 loceol = PL_regeol;
1aa99e6b
IH
4858 while (hardcount < max && scan < loceol &&
4859 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4860 scan += UTF8SKIP(scan);
4861 hardcount++;
4862 }
4863 } else {
4864 while (scan < loceol && !isALNUM_LC(*scan))
4865 scan++;
a0ed51b3
LW
4866 }
4867 break;
a0d0e21e 4868 case SPACE:
1aa99e6b 4869 if (do_utf8) {
ffc61ed2 4870 loceol = PL_regeol;
1a4fad37 4871 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4872 while (hardcount < max && scan < loceol &&
3568d838
JH
4873 (*scan == ' ' ||
4874 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4875 scan += UTF8SKIP(scan);
4876 hardcount++;
4877 }
4878 } else {
4879 while (scan < loceol && isSPACE(*scan))
4880 scan++;
a0ed51b3
LW
4881 }
4882 break;
bbce6d69 4883 case SPACEL:
3280af22 4884 PL_reg_flags |= RF_tainted;
1aa99e6b 4885 if (do_utf8) {
ffc61ed2 4886 loceol = PL_regeol;
1aa99e6b 4887 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4888 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4889 scan += UTF8SKIP(scan);
4890 hardcount++;
4891 }
4892 } else {
4893 while (scan < loceol && isSPACE_LC(*scan))
4894 scan++;
a0ed51b3
LW
4895 }
4896 break;
a0d0e21e 4897 case NSPACE:
1aa99e6b 4898 if (do_utf8) {
ffc61ed2 4899 loceol = PL_regeol;
1a4fad37 4900 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4901 while (hardcount < max && scan < loceol &&
3568d838
JH
4902 !(*scan == ' ' ||
4903 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4904 scan += UTF8SKIP(scan);
4905 hardcount++;
4906 }
4907 } else {
4908 while (scan < loceol && !isSPACE(*scan))
4909 scan++;
4910 break;
a0ed51b3 4911 }
bbce6d69 4912 case NSPACEL:
3280af22 4913 PL_reg_flags |= RF_tainted;
1aa99e6b 4914 if (do_utf8) {
ffc61ed2 4915 loceol = PL_regeol;
1aa99e6b 4916 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4917 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4918 scan += UTF8SKIP(scan);
4919 hardcount++;
4920 }
4921 } else {
4922 while (scan < loceol && !isSPACE_LC(*scan))
4923 scan++;
a0ed51b3
LW
4924 }
4925 break;
a0d0e21e 4926 case DIGIT:
1aa99e6b 4927 if (do_utf8) {
ffc61ed2 4928 loceol = PL_regeol;
1a4fad37 4929 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4930 while (hardcount < max && scan < loceol &&
3568d838 4931 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4932 scan += UTF8SKIP(scan);
4933 hardcount++;
4934 }
4935 } else {
4936 while (scan < loceol && isDIGIT(*scan))
4937 scan++;
a0ed51b3
LW
4938 }
4939 break;
a0d0e21e 4940 case NDIGIT:
1aa99e6b 4941 if (do_utf8) {
ffc61ed2 4942 loceol = PL_regeol;
1a4fad37 4943 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4944 while (hardcount < max && scan < loceol &&
3568d838 4945 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4946 scan += UTF8SKIP(scan);
4947 hardcount++;
4948 }
4949 } else {
4950 while (scan < loceol && !isDIGIT(*scan))
4951 scan++;
a0ed51b3
LW
4952 }
4953 break;
a0d0e21e
LW
4954 default: /* Called on something of 0 width. */
4955 break; /* So match right here or not at all. */
4956 }
a687059c 4957
a0ed51b3
LW
4958 if (hardcount)
4959 c = hardcount;
4960 else
4961 c = scan - PL_reginput;
3280af22 4962 PL_reginput = scan;
a687059c 4963
a3621e74 4964 DEBUG_r({
e68ec53f 4965 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4966 DEBUG_EXECUTE_r({
e68ec53f
YO
4967 SV * const prop = sv_newmortal();
4968 regprop(prog, prop, p);
4969 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4970 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4971 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4972 });
be8e71aa 4973 });
9041c2e3 4974
a0d0e21e 4975 return(c);
a687059c
LW
4976}
4977
c277df42 4978
be8e71aa 4979#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4980/*
ffc61ed2
JH
4981- regclass_swash - prepare the utf8 swash
4982*/
4983
4984SV *
32fc9b6a 4985Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4986{
97aff369 4987 dVAR;
9e55ce06
JH
4988 SV *sw = NULL;
4989 SV *si = NULL;
4990 SV *alt = NULL;
3dab1dad 4991 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 4992
4f639d21 4993 if (data && data->count) {
a3b680e6 4994 const U32 n = ARG(node);
ffc61ed2 4995
4f639d21
DM
4996 if (data->what[n] == 's') {
4997 SV * const rv = (SV*)data->data[n];
890ce7af 4998 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4999 SV **const ary = AvARRAY(av);
9e55ce06 5000 SV **a, **b;
9041c2e3 5001
711a919c 5002 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5003 * documentation of these array elements. */
5004
b11f357e 5005 si = *ary;
8f7f7219 5006 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5007 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5008
ffc61ed2
JH
5009 if (a)
5010 sw = *a;
5011 else if (si && doinit) {
5012 sw = swash_init("utf8", "", si, 1, 0);
5013 (void)av_store(av, 1, sw);
5014 }
9e55ce06
JH
5015 if (b)
5016 alt = *b;
ffc61ed2
JH
5017 }
5018 }
5019
9e55ce06
JH
5020 if (listsvp)
5021 *listsvp = si;
5022 if (altsvp)
5023 *altsvp = alt;
ffc61ed2
JH
5024
5025 return sw;
5026}
76234dfb 5027#endif
ffc61ed2
JH
5028
5029/*
ba7b4546 5030 - reginclass - determine if a character falls into a character class
832705d4
JH
5031
5032 The n is the ANYOF regnode, the p is the target string, lenp
5033 is pointer to the maximum length of how far to go in the p
5034 (if the lenp is zero, UTF8SKIP(p) is used),
5035 do_utf8 tells whether the target string is in UTF-8.
5036
bbce6d69 5037 */
5038
76e3520e 5039STATIC bool
32fc9b6a 5040S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5041{
27da23d5 5042 dVAR;
a3b680e6 5043 const char flags = ANYOF_FLAGS(n);
bbce6d69 5044 bool match = FALSE;
cc07378b 5045 UV c = *p;
ae9ddab8 5046 STRLEN len = 0;
9e55ce06 5047 STRLEN plen;
1aa99e6b 5048
19f67299
TS
5049 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5050 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5051 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5052 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5053 if (len == (STRLEN)-1)
5054 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5055 }
bbce6d69 5056
0f0076b4 5057 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5058 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5059 if (lenp)
5060 *lenp = 0;
ffc61ed2 5061 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5062 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5063 match = TRUE;
bbce6d69 5064 }
3568d838 5065 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5066 match = TRUE;
ffc61ed2 5067 if (!match) {
9e55ce06 5068 AV *av;
32fc9b6a 5069 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5070
5071 if (sw) {
3568d838 5072 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5073 match = TRUE;
5074 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5075 if (!match && lenp && av) {
5076 I32 i;
9e55ce06 5077 for (i = 0; i <= av_len(av); i++) {
890ce7af 5078 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5079 STRLEN len;
890ce7af 5080 const char * const s = SvPV_const(sv, len);
9e55ce06 5081
061b10df 5082 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5083 *lenp = len;
5084 match = TRUE;
5085 break;
5086 }
5087 }
5088 }
5089 if (!match) {
89ebb4a3 5090 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5091 STRLEN tmplen;
5092
9e55ce06
JH
5093 to_utf8_fold(p, tmpbuf, &tmplen);
5094 if (swash_fetch(sw, tmpbuf, do_utf8))
5095 match = TRUE;
5096 }
ffc61ed2
JH
5097 }
5098 }
bbce6d69 5099 }
9e55ce06 5100 if (match && lenp && *lenp == 0)
0f0076b4 5101 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5102 }
1aa99e6b 5103 if (!match && c < 256) {
ffc61ed2
JH
5104 if (ANYOF_BITMAP_TEST(n, c))
5105 match = TRUE;
5106 else if (flags & ANYOF_FOLD) {
eb160463 5107 U8 f;
a0ed51b3 5108
ffc61ed2
JH
5109 if (flags & ANYOF_LOCALE) {
5110 PL_reg_flags |= RF_tainted;
5111 f = PL_fold_locale[c];
5112 }
5113 else
5114 f = PL_fold[c];
5115 if (f != c && ANYOF_BITMAP_TEST(n, f))
5116 match = TRUE;
5117 }
5118
5119 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5120 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5121 if (
5122 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5123 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5124 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5125 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5126 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5127 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5128 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5129 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5130 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5131 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5132 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5133 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5134 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5135 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5136 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5137 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5138 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5139 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5140 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5141 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5142 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5143 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5144 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5145 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5146 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5147 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5148 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5149 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5150 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5151 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5152 ) /* How's that for a conditional? */
5153 {
5154 match = TRUE;
5155 }
a0ed51b3 5156 }
a0ed51b3
LW
5157 }
5158
a0ed51b3
LW
5159 return (flags & ANYOF_INVERT) ? !match : match;
5160}
161b471a 5161
dfe13c55 5162STATIC U8 *
0ce71af7 5163S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5164{
97aff369 5165 dVAR;
a0ed51b3 5166 if (off >= 0) {
1aa99e6b 5167 while (off-- && s < lim) {
ffc61ed2 5168 /* XXX could check well-formedness here */
a0ed51b3 5169 s += UTF8SKIP(s);
ffc61ed2 5170 }
a0ed51b3
LW
5171 }
5172 else {
1de06328
YO
5173 while (off++ && s > lim) {
5174 s--;
5175 if (UTF8_IS_CONTINUED(*s)) {
5176 while (s > lim && UTF8_IS_CONTINUATION(*s))
5177 s--;
a0ed51b3 5178 }
1de06328 5179 /* XXX could check well-formedness here */
a0ed51b3
LW
5180 }
5181 }
5182 return s;
5183}
161b471a 5184
f9f4320a
YO
5185#ifdef XXX_dmq
5186/* there are a bunch of places where we use two reghop3's that should
5187 be replaced with this routine. but since thats not done yet
5188 we ifdef it out - dmq
5189*/
dfe13c55 5190STATIC U8 *
1de06328
YO
5191S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5192{
5193 dVAR;
5194 if (off >= 0) {
5195 while (off-- && s < rlim) {
5196 /* XXX could check well-formedness here */
5197 s += UTF8SKIP(s);
5198 }
5199 }
5200 else {
5201 while (off++ && s > llim) {
5202 s--;
5203 if (UTF8_IS_CONTINUED(*s)) {
5204 while (s > llim && UTF8_IS_CONTINUATION(*s))
5205 s--;
5206 }
5207 /* XXX could check well-formedness here */
5208 }
5209 }
5210 return s;
5211}
f9f4320a 5212#endif
1de06328
YO
5213
5214STATIC U8 *
0ce71af7 5215S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5216{
97aff369 5217 dVAR;
a0ed51b3 5218 if (off >= 0) {
1aa99e6b 5219 while (off-- && s < lim) {
ffc61ed2 5220 /* XXX could check well-formedness here */
a0ed51b3 5221 s += UTF8SKIP(s);
ffc61ed2 5222 }
a0ed51b3 5223 if (off >= 0)
3dab1dad 5224 return NULL;
a0ed51b3
LW
5225 }
5226 else {
1de06328
YO
5227 while (off++ && s > lim) {
5228 s--;
5229 if (UTF8_IS_CONTINUED(*s)) {
5230 while (s > lim && UTF8_IS_CONTINUATION(*s))
5231 s--;
a0ed51b3 5232 }
1de06328 5233 /* XXX could check well-formedness here */
a0ed51b3
LW
5234 }
5235 if (off <= 0)
3dab1dad 5236 return NULL;
a0ed51b3
LW
5237 }
5238 return s;
5239}
51371543 5240
51371543 5241static void
acfe0abc 5242restore_pos(pTHX_ void *arg)
51371543 5243{
97aff369 5244 dVAR;
097eb12c 5245 regexp * const rex = (regexp *)arg;
51371543
GS
5246 if (PL_reg_eval_set) {
5247 if (PL_reg_oldsaved) {
4f639d21
DM
5248 rex->subbeg = PL_reg_oldsaved;
5249 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5250#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5251 rex->saved_copy = PL_nrs;
ed252734 5252#endif
4f639d21 5253 RX_MATCH_COPIED_on(rex);
51371543
GS
5254 }
5255 PL_reg_magic->mg_len = PL_reg_oldpos;
5256 PL_reg_eval_set = 0;
5257 PL_curpm = PL_reg_oldcurpm;
5258 }
5259}
33b8afdf
JH
5260
5261STATIC void
5262S_to_utf8_substr(pTHX_ register regexp *prog)
5263{
33b8afdf 5264 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5265 SV* const sv = newSVsv(prog->float_substr);
5266 prog->float_utf8 = sv;
33b8afdf
JH
5267 sv_utf8_upgrade(sv);
5268 if (SvTAIL(prog->float_substr))
5269 SvTAIL_on(sv);
5270 if (prog->float_substr == prog->check_substr)
5271 prog->check_utf8 = sv;
5272 }
5273 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5274 SV* const sv = newSVsv(prog->anchored_substr);
5275 prog->anchored_utf8 = sv;
33b8afdf
JH
5276 sv_utf8_upgrade(sv);
5277 if (SvTAIL(prog->anchored_substr))
5278 SvTAIL_on(sv);
5279 if (prog->anchored_substr == prog->check_substr)
5280 prog->check_utf8 = sv;
5281 }
5282}
5283
5284STATIC void
5285S_to_byte_substr(pTHX_ register regexp *prog)
5286{
97aff369 5287 dVAR;
33b8afdf 5288 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5289 SV* sv = newSVsv(prog->float_utf8);
5290 prog->float_substr = sv;
33b8afdf
JH
5291 if (sv_utf8_downgrade(sv, TRUE)) {
5292 if (SvTAIL(prog->float_utf8))
5293 SvTAIL_on(sv);
5294 } else {
5295 SvREFCNT_dec(sv);
5296 prog->float_substr = sv = &PL_sv_undef;
5297 }
5298 if (prog->float_utf8 == prog->check_utf8)
5299 prog->check_substr = sv;
5300 }
5301 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5302 SV* sv = newSVsv(prog->anchored_utf8);
5303 prog->anchored_substr = sv;
33b8afdf
JH
5304 if (sv_utf8_downgrade(sv, TRUE)) {
5305 if (SvTAIL(prog->anchored_utf8))
5306 SvTAIL_on(sv);
5307 } else {
5308 SvREFCNT_dec(sv);
5309 prog->anchored_substr = sv = &PL_sv_undef;
5310 }
5311 if (prog->anchored_utf8 == prog->check_utf8)
5312 prog->check_substr = sv;
5313 }
5314}
66610fdd
RGS
5315
5316/*
5317 * Local variables:
5318 * c-indentation-style: bsd
5319 * c-basic-offset: 4
5320 * indent-tabs-mode: t
5321 * End:
5322 *
37442d52
RGS
5323 * ex: set ts=8 sts=4 sw=4 noet:
5324 */