This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Revert accidental file deletion
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
ce862d02 80#define RF_evaled 4 /* Did an EVAL with setting? */
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
53c4c00c 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
198#define REGCP_SET(cp) \
199 DEBUG_STATE_r( \
ab3bbdeb 200 PerlIO_printf(Perl_debug_log, \
e4f74956 201 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
202 (IV)PL_savestack_ix)); \
203 cp = PL_savestack_ix
c3464db5 204
ab3bbdeb 205#define REGCP_UNWIND(cp) \
e4f74956 206 DEBUG_STATE_r( \
ab3bbdeb 207 if (cp != PL_savestack_ix) \
e4f74956
YO
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
210 (IV)(cp), (IV)PL_savestack_ix)); \
211 regcpblow(cp)
c277df42 212
76e3520e 213STATIC char *
097eb12c 214S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 215{
97aff369 216 dVAR;
b1ce53c5 217 I32 i;
a0d0e21e 218 char *input;
b1ce53c5 219
a3621e74
YO
220 GET_RE_DEBUG_FLAGS_DECL;
221
b1ce53c5 222 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 223 i = SSPOPINT;
b1ce53c5
JH
224 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
225 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 226 input = (char *) SSPOPPTR;
a01268b5 227 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
228 *PL_reglastparen = SSPOPINT;
229 PL_regsize = SSPOPINT;
b1ce53c5
JH
230
231 /* Now restore the parentheses context. */
41123dfd
JH
232 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
233 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 234 I32 tmps;
097eb12c 235 U32 paren = (U32)SSPOPINT;
3280af22 236 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
237 PL_regstartp[paren] = SSPOPINT;
238 tmps = SSPOPINT;
3280af22
NIS
239 if (paren <= *PL_reglastparen)
240 PL_regendp[paren] = tmps;
a3621e74 241 DEBUG_EXECUTE_r(
c3464db5 242 PerlIO_printf(Perl_debug_log,
b900a521 243 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 244 (UV)paren, (IV)PL_regstartp[paren],
b900a521 245 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 246 (IV)PL_regendp[paren],
3280af22 247 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 248 );
a0d0e21e 249 }
a3621e74 250 DEBUG_EXECUTE_r(
bb7a0f54 251 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 252 PerlIO_printf(Perl_debug_log,
faccc32b 253 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 254 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
255 }
256 );
daf18116 257#if 1
dafc8851
JH
258 /* It would seem that the similar code in regtry()
259 * already takes care of this, and in fact it is in
260 * a better location to since this code can #if 0-ed out
261 * but the code in regtry() is needed or otherwise tests
262 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
263 * (as of patchlevel 7877) will fail. Then again,
264 * this code seems to be necessary or otherwise
265 * building DynaLoader will fail:
266 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
267 * --jhi */
bb7a0f54 268 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
269 if (i > PL_regsize)
270 PL_regstartp[i] = -1;
271 PL_regendp[i] = -1;
a0d0e21e 272 }
dafc8851 273#endif
a0d0e21e
LW
274 return input;
275}
276
02db2b7b 277#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 278
a687059c 279/*
e50aee73 280 * pregexec and friends
a687059c
LW
281 */
282
76234dfb 283#ifndef PERL_IN_XSUB_RE
a687059c 284/*
c277df42 285 - pregexec - match a regexp against a string
a687059c 286 */
c277df42 287I32
864dbfa3 288Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 289 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
290/* strend: pointer to null at end of string */
291/* strbeg: real beginning of string */
292/* minend: end of match must be >=minend after stringarg. */
293/* nosave: For optimizations. */
294{
295 return
9041c2e3 296 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
297 nosave ? 0 : REXEC_COPY_STR);
298}
76234dfb 299#endif
22e551b9 300
9041c2e3 301/*
cad2e5aa
JH
302 * Need to implement the following flags for reg_anch:
303 *
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * USE_INTUIT_ML
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
309 * INTUIT_ONCE_ML
310 *
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
313 */
314
315/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316
3f7c398e 317/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
319
320/* XXXX We assume that strpos is strbeg unless sv. */
321
6eb5f6b9
JH
322/* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
328
2c2d71f5
JH
329/* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
30944b6d
IZ
332 regtry() should be needed.
333
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
336 b) Fixed substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 339 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
340 string which does not contradict any of them.
341 */
2c2d71f5 342
6eb5f6b9
JH
343/* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
346
cad2e5aa
JH
347char *
348Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
350{
97aff369 351 dVAR;
b7953727 352 register I32 start_shift = 0;
cad2e5aa 353 /* Should be nonnegative! */
b7953727 354 register I32 end_shift = 0;
2c2d71f5
JH
355 register char *s;
356 register SV *check;
a1933d95 357 char *strbeg;
cad2e5aa 358 char *t;
a3b680e6 359 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 360 I32 ml_anch;
bd61b366
SS
361 register char *other_last = NULL; /* other substr checked before this */
362 char *check_at = NULL; /* check substr found at this pos */
1df70142 363 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 364#ifdef DEBUGGING
890ce7af 365 const char * const i_strpos = strpos;
30944b6d 366#endif
a3621e74
YO
367
368 GET_RE_DEBUG_FLAGS_DECL;
369
a30b2f1f 370 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 371
b8d68ded 372 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
373 PL_reg_flags |= RF_utf8;
374 }
ab3bbdeb
YO
375 DEBUG_EXECUTE_r(
376 debug_start_match(prog, do_utf8, strpos, strend,
377 "Guessing start of match for");
2a782b5b 378 );
cad2e5aa 379
c344f387
JH
380 /* CHR_DIST() would be more correct here but it makes things slow. */
381 if (prog->minlen > strend - strpos) {
a3621e74 382 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 383 "String too short... [re_intuit_start]\n"));
cad2e5aa 384 goto fail;
2c2d71f5 385 }
a1933d95 386 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 387 PL_regeol = strend;
33b8afdf
JH
388 if (do_utf8) {
389 if (!prog->check_utf8 && prog->check_substr)
390 to_utf8_substr(prog);
391 check = prog->check_utf8;
392 } else {
393 if (!prog->check_substr && prog->check_utf8)
394 to_byte_substr(prog);
395 check = prog->check_substr;
396 }
397 if (check == &PL_sv_undef) {
a3621e74 398 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
399 "Non-utf string cannot match utf check string\n"));
400 goto fail;
401 }
2c2d71f5 402 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
403 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
404 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 405 && !multiline ) ); /* Check after \n? */
cad2e5aa 406
7e25d62c
JH
407 if (!ml_anch) {
408 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
409 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 410 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
411 && sv && !SvROK(sv)
412 && (strpos != strbeg)) {
a3621e74 413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
414 goto fail;
415 }
416 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 417 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 418 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
419 I32 slen;
420
1aa99e6b 421 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
422 if (SvTAIL(check)) {
423 slen = SvCUR(check); /* >= 1 */
cad2e5aa 424
9041c2e3 425 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 426 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 428 goto fail_finish;
cad2e5aa
JH
429 }
430 /* Now should match s[0..slen-2] */
431 slen--;
3f7c398e 432 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 433 || (slen > 1
3f7c398e 434 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 435 report_neq:
a3621e74 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
437 goto fail_finish;
438 }
cad2e5aa 439 }
3f7c398e 440 else if (*SvPVX_const(check) != *s
653099ff 441 || ((slen = SvCUR(check)) > 1
3f7c398e 442 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 443 goto report_neq;
c315bfe8 444 check_at = s;
2c2d71f5 445 goto success_at_start;
7e25d62c 446 }
cad2e5aa 447 }
2c2d71f5 448 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 449 s = strpos;
2c2d71f5 450 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 451 end_shift = prog->minlen - start_shift -
653099ff 452 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 453 if (!ml_anch) {
a3b680e6 454 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 455 - (SvTAIL(check) != 0);
a3b680e6 456 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
457
458 if (end_shift < eshift)
459 end_shift = eshift;
460 }
cad2e5aa 461 }
2c2d71f5 462 else { /* Can match at random position */
cad2e5aa
JH
463 ml_anch = 0;
464 s = strpos;
2c2d71f5
JH
465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
466 /* Should be nonnegative! */
467 end_shift = prog->minlen - start_shift -
653099ff 468 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
469 }
470
2c2d71f5 471#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 472 if (end_shift < 0)
6bbae5e6 473 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
474#endif
475
2c2d71f5
JH
476 restart:
477 /* Find a possible match in the region s..strend by looking for
478 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 479 if (flags & REXEC_SCREAM) {
cad2e5aa 480 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 481 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 482
2c2d71f5
JH
483 if (PL_screamfirst[BmRARE(check)] >= 0
484 || ( BmRARE(check) == '\n'
485 && (BmPREVIOUS(check) == SvCUR(check) - 1)
486 && SvTAIL(check) ))
9041c2e3 487 s = screaminstr(sv, check,
2c2d71f5 488 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 489 else
2c2d71f5 490 goto fail_finish;
4addbd3b
HS
491 /* we may be pointing at the wrong string */
492 if (s && RX_MATCH_COPIED(prog))
3f7c398e 493 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
494 if (data)
495 *data->scream_olds = s;
496 }
f33976b4 497 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
498 s = fbm_instr((U8*)(s + start_shift),
499 (U8*)(strend - end_shift),
7fba1cd6 500 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 501 else
1aa99e6b
IH
502 s = fbm_instr(HOP3(s, start_shift, strend),
503 HOP3(strend, -end_shift, strbeg),
7fba1cd6 504 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
505
506 /* Update the count-of-usability, remove useless subpatterns,
507 unshift s. */
2c2d71f5 508
ab3bbdeb
YO
509 DEBUG_EXECUTE_r({
510 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
511 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
512 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 513 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
514 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
515 ? "anchored" : "floating"),
516 quoted,
517 RE_SV_TAIL(check),
518 (s ? " at offset " : "...\n") );
519 });
2c2d71f5
JH
520
521 if (!s)
522 goto fail_finish;
523
6eb5f6b9
JH
524 check_at = s;
525
2c2d71f5 526 /* Finish the diagnostic message */
a3621e74 527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
528
529 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
530 Start with the other substr.
531 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 532 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
533 *always* match. Probably should be marked during compile...
534 Probably it is right to do no SCREAM here...
535 */
536
33b8afdf 537 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 538 /* Take into account the "other" substring. */
2c2d71f5
JH
539 /* XXXX May be hopelessly wrong for UTF... */
540 if (!other_last)
6eb5f6b9 541 other_last = strpos;
33b8afdf 542 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
543 do_other_anchored:
544 {
890ce7af
AL
545 char * const last = HOP3c(s, -start_shift, strbeg);
546 char *last1, *last2;
be8e71aa 547 char * const saved_s = s;
33b8afdf 548 SV* must;
2c2d71f5 549
2c2d71f5
JH
550 t = s - prog->check_offset_max;
551 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 552 && (!do_utf8
0ce71af7 553 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 554 && t > strpos)))
6f207bd3 555 NOOP;
2c2d71f5
JH
556 else
557 t = strpos;
1aa99e6b 558 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
559 if (t < other_last) /* These positions already checked */
560 t = other_last;
1aa99e6b 561 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
562 if (last < last1)
563 last1 = last;
564 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
565 /* On end-of-str: see comment below. */
33b8afdf
JH
566 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
567 if (must == &PL_sv_undef) {
568 s = (char*)NULL;
a3621e74 569 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
570 }
571 else
572 s = fbm_instr(
573 (unsigned char*)t,
574 HOP3(HOP3(last1, prog->anchored_offset, strend)
575 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
576 must,
7fba1cd6 577 multiline ? FBMrf_MULTILINE : 0
33b8afdf 578 );
ab3bbdeb
YO
579 DEBUG_EXECUTE_r({
580 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
581 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
582 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 583 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
584 quoted, RE_SV_TAIL(must));
585 });
586
587
2c2d71f5
JH
588 if (!s) {
589 if (last1 >= last2) {
a3621e74 590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
591 ", giving up...\n"));
592 goto fail_finish;
593 }
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 595 ", trying floating at offset %ld...\n",
be8e71aa 596 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
597 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
598 s = HOP3c(last, 1, strend);
2c2d71f5
JH
599 goto restart;
600 }
601 else {
a3621e74 602 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 603 (long)(s - i_strpos)));
1aa99e6b
IH
604 t = HOP3c(s, -prog->anchored_offset, strbeg);
605 other_last = HOP3c(s, 1, strend);
be8e71aa 606 s = saved_s;
2c2d71f5
JH
607 if (t == strpos)
608 goto try_at_start;
2c2d71f5
JH
609 goto try_at_offset;
610 }
30944b6d 611 }
2c2d71f5
JH
612 }
613 else { /* Take into account the floating substring. */
33b8afdf 614 char *last, *last1;
be8e71aa 615 char * const saved_s = s;
33b8afdf
JH
616 SV* must;
617
618 t = HOP3c(s, -start_shift, strbeg);
619 last1 = last =
620 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
621 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
622 last = HOP3c(t, prog->float_max_offset, strend);
623 s = HOP3c(t, prog->float_min_offset, strend);
624 if (s < other_last)
625 s = other_last;
2c2d71f5 626 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
627 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
628 /* fbm_instr() takes into account exact value of end-of-str
629 if the check is SvTAIL(ed). Since false positives are OK,
630 and end-of-str is not later than strend we are OK. */
631 if (must == &PL_sv_undef) {
632 s = (char*)NULL;
a3621e74 633 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
634 }
635 else
2c2d71f5 636 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
637 (unsigned char*)last + SvCUR(must)
638 - (SvTAIL(must)!=0),
7fba1cd6 639 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
640 DEBUG_EXECUTE_r({
641 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
642 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
643 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 644 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
645 quoted, RE_SV_TAIL(must));
646 });
33b8afdf
JH
647 if (!s) {
648 if (last1 == last) {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
650 ", giving up...\n"));
651 goto fail_finish;
2c2d71f5 652 }
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 654 ", trying anchored starting at offset %ld...\n",
be8e71aa 655 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
656 other_last = last;
657 s = HOP3c(t, 1, strend);
658 goto restart;
659 }
660 else {
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
662 (long)(s - i_strpos)));
663 other_last = s; /* Fix this later. --Hugo */
be8e71aa 664 s = saved_s;
33b8afdf
JH
665 if (t == strpos)
666 goto try_at_start;
667 goto try_at_offset;
668 }
2c2d71f5 669 }
cad2e5aa 670 }
2c2d71f5
JH
671
672 t = s - prog->check_offset_max;
2c2d71f5 673 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 674 && (!do_utf8
0ce71af7 675 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 676 && t > strpos))) {
2c2d71f5
JH
677 /* Fixed substring is found far enough so that the match
678 cannot start at strpos. */
679 try_at_offset:
cad2e5aa 680 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
681 /* Eventually fbm_*() should handle this, but often
682 anchored_offset is not 0, so this check will not be wasted. */
683 /* XXXX In the code below we prefer to look for "^" even in
684 presence of anchored substrings. And we search even
685 beyond the found float position. These pessimizations
686 are historical artefacts only. */
687 find_anchor:
2c2d71f5 688 while (t < strend - prog->minlen) {
cad2e5aa 689 if (*t == '\n') {
4ee3650e 690 if (t < check_at - prog->check_offset_min) {
33b8afdf 691 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
692 /* Since we moved from the found position,
693 we definitely contradict the found anchored
30944b6d
IZ
694 substr. Due to the above check we do not
695 contradict "check" substr.
696 Thus we can arrive here only if check substr
697 is float. Redo checking for "other"=="fixed".
698 */
9041c2e3 699 strpos = t + 1;
a3621e74 700 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 701 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
702 goto do_other_anchored;
703 }
4ee3650e
GS
704 /* We don't contradict the found floating substring. */
705 /* XXXX Why not check for STCLASS? */
cad2e5aa 706 s = t + 1;
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 708 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
709 goto set_useful;
710 }
4ee3650e
GS
711 /* Position contradicts check-string */
712 /* XXXX probably better to look for check-string
713 than for "\n", so one should lower the limit for t? */
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 715 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 716 other_last = strpos = s = t + 1;
cad2e5aa
JH
717 goto restart;
718 }
719 t++;
720 }
a3621e74 721 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 722 PL_colors[0], PL_colors[1]));
2c2d71f5 723 goto fail_finish;
cad2e5aa 724 }
f5952150 725 else {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 727 PL_colors[0], PL_colors[1]));
f5952150 728 }
cad2e5aa
JH
729 s = t;
730 set_useful:
33b8afdf 731 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
732 }
733 else {
f5952150 734 /* The found string does not prohibit matching at strpos,
2c2d71f5 735 - no optimization of calling REx engine can be performed,
f5952150
GS
736 unless it was an MBOL and we are not after MBOL,
737 or a future STCLASS check will fail this. */
2c2d71f5
JH
738 try_at_start:
739 /* Even in this situation we may use MBOL flag if strpos is offset
740 wrt the start of the string. */
05b4157f 741 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 742 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
743 /* May be due to an implicit anchor of m{.*foo} */
744 && !(prog->reganch & ROPT_IMPLICIT))
745 {
cad2e5aa
JH
746 t = strpos;
747 goto find_anchor;
748 }
a3621e74 749 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 750 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 751 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 752 );
2c2d71f5 753 success_at_start:
30944b6d 754 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
755 && (do_utf8 ? (
756 prog->check_utf8 /* Could be deleted already */
757 && --BmUSEFUL(prog->check_utf8) < 0
758 && (prog->check_utf8 == prog->float_utf8)
759 ) : (
760 prog->check_substr /* Could be deleted already */
761 && --BmUSEFUL(prog->check_substr) < 0
762 && (prog->check_substr == prog->float_substr)
763 )))
66e933ab 764 {
cad2e5aa 765 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
767 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
768 if (do_utf8 ? prog->check_substr : prog->check_utf8)
769 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
770 prog->check_substr = prog->check_utf8 = NULL; /* disable */
771 prog->float_substr = prog->float_utf8 = NULL; /* clear */
772 check = NULL; /* abort */
cad2e5aa 773 s = strpos;
3cf5c195
IZ
774 /* XXXX This is a remnant of the old implementation. It
775 looks wasteful, since now INTUIT can use many
6eb5f6b9 776 other heuristics. */
cad2e5aa
JH
777 prog->reganch &= ~RE_USE_INTUIT;
778 }
779 else
780 s = strpos;
781 }
782
6eb5f6b9
JH
783 /* Last resort... */
784 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
786e8c11 785 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
6eb5f6b9
JH
786 /* minlen == 0 is possible if regstclass is \b or \B,
787 and the fixed substr is ''$.
788 Since minlen is already taken into account, s+1 is before strend;
789 accidentally, minlen >= 1 guaranties no false positives at s + 1
790 even for \b or \B. But (minlen? 1 : 0) below assumes that
791 regstclass does not come from lookahead... */
792 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
793 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 794 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 795 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 796 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 797 : 1);
07be1b83 798 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 799 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 800 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
801 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
802 cl_l, strend)
803 : strend);
07be1b83
YO
804 /*if (OP(prog->regstclass) == TRIE)
805 endpos++;*/
6eb5f6b9 806 t = s;
3b0527fe 807 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
808 if (!s) {
809#ifdef DEBUGGING
cbbf8932 810 const char *what = NULL;
6eb5f6b9
JH
811#endif
812 if (endpos == strend) {
a3621e74 813 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
814 "Could not match STCLASS...\n") );
815 goto fail;
816 }
a3621e74 817 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 818 "This position contradicts STCLASS...\n") );
653099ff
GS
819 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
820 goto fail;
6eb5f6b9 821 /* Contradict one of substrings */
33b8afdf
JH
822 if (prog->anchored_substr || prog->anchored_utf8) {
823 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 824 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 825 hop_and_restart:
1aa99e6b 826 s = HOP3c(t, 1, strend);
66e933ab
GS
827 if (s + start_shift + end_shift > strend) {
828 /* XXXX Should be taken into account earlier? */
a3621e74 829 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
830 "Could not match STCLASS...\n") );
831 goto fail;
832 }
5e39e1e5
HS
833 if (!check)
834 goto giveup;
a3621e74 835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 836 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
837 what, (long)(s + start_shift - i_strpos)) );
838 goto restart;
839 }
66e933ab 840 /* Have both, check_string is floating */
6eb5f6b9
JH
841 if (t + start_shift >= check_at) /* Contradicts floating=check */
842 goto retry_floating_check;
843 /* Recheck anchored substring, but not floating... */
9041c2e3 844 s = check_at;
5e39e1e5
HS
845 if (!check)
846 goto giveup;
a3621e74 847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 848 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
849 (long)(other_last - i_strpos)) );
850 goto do_other_anchored;
851 }
60e71179
GS
852 /* Another way we could have checked stclass at the
853 current position only: */
854 if (ml_anch) {
855 s = t = t + 1;
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 860 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 861 goto try_at_offset;
66e933ab 862 }
33b8afdf 863 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 864 goto fail;
6eb5f6b9
JH
865 /* Check is floating subtring. */
866 retry_floating_check:
867 t = check_at - start_shift;
a3621e74 868 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
869 goto hop_and_restart;
870 }
b7953727 871 if (t != s) {
a3621e74 872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 873 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
874 (long)(t - i_strpos), (long)(s - i_strpos))
875 );
876 }
877 else {
a3621e74 878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
879 "Does not contradict STCLASS...\n");
880 );
881 }
6eb5f6b9 882 }
5e39e1e5 883 giveup:
a3621e74 884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
885 PL_colors[4], (check ? "Guessed" : "Giving up"),
886 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 887 return s;
2c2d71f5
JH
888
889 fail_finish: /* Substring not found */
33b8afdf
JH
890 if (prog->check_substr || prog->check_utf8) /* could be removed already */
891 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 892 fail:
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 894 PL_colors[4], PL_colors[5]));
bd61b366 895 return NULL;
cad2e5aa 896}
9661b544 897
786e8c11 898
3b0527fe 899
4cadc6a9
YO
900#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
901foldlen, foldbuf, uniflags) STMT_START { \
902 switch (trie_type) { \
903 case trie_utf8_fold: \
904 if ( foldlen>0 ) { \
905 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
906 foldlen -= len; \
907 uscan += len; \
908 len=0; \
909 } else { \
910 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
911 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
912 foldlen -= UNISKIP( uvc ); \
913 uscan = foldbuf + UNISKIP( uvc ); \
914 } \
915 break; \
916 case trie_utf8: \
917 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
918 break; \
919 case trie_plain: \
920 uvc = (UV)*uc; \
921 len = 1; \
922 } \
923 \
924 if (uvc < 256) { \
925 charid = trie->charmap[ uvc ]; \
926 } \
927 else { \
928 charid = 0; \
929 if (trie->widecharmap) { \
930 SV** const svpp = hv_fetch(trie->widecharmap, \
931 (char*)&uvc, sizeof(UV), 0); \
932 if (svpp) \
933 charid = (U16)SvIV(*svpp); \
934 } \
935 } \
936} STMT_END
937
938#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
939 if ( (CoNd) \
940 && (ln == len || \
941 ibcmp_utf8(s, NULL, 0, do_utf8, \
942 m, NULL, ln, (bool)UTF)) \
943 && (!reginfo || regtry(reginfo, s)) ) \
944 goto got_it; \
945 else { \
946 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
947 uvchr_to_utf8(tmpbuf, c); \
948 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
949 if ( f != c \
950 && (f == c1 || f == c2) \
951 && (ln == foldlen || \
952 !ibcmp_utf8((char *) foldbuf, \
953 NULL, foldlen, do_utf8, \
954 m, \
955 NULL, ln, (bool)UTF)) \
956 && (!reginfo || regtry(reginfo, s)) ) \
957 goto got_it; \
958 } \
959 s += len
960
961#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
962STMT_START { \
963 while (s <= e) { \
964 if ( (CoNd) \
965 && (ln == 1 || !(OP(c) == EXACTF \
966 ? ibcmp(s, m, ln) \
967 : ibcmp_locale(s, m, ln))) \
968 && (!reginfo || regtry(reginfo, s)) ) \
969 goto got_it; \
970 s++; \
971 } \
972} STMT_END
973
974#define REXEC_FBC_UTF8_SCAN(CoDe) \
975STMT_START { \
976 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
977 CoDe \
978 s += uskip; \
979 } \
980} STMT_END
981
982#define REXEC_FBC_SCAN(CoDe) \
983STMT_START { \
984 while (s < strend) { \
985 CoDe \
986 s++; \
987 } \
988} STMT_END
989
990#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
991REXEC_FBC_UTF8_SCAN( \
992 if (CoNd) { \
993 if (tmp && (!reginfo || regtry(reginfo, s))) \
994 goto got_it; \
995 else \
996 tmp = doevery; \
997 } \
998 else \
999 tmp = 1; \
1000)
1001
1002#define REXEC_FBC_CLASS_SCAN(CoNd) \
1003REXEC_FBC_SCAN( \
1004 if (CoNd) { \
1005 if (tmp && (!reginfo || regtry(reginfo, s))) \
1006 goto got_it; \
1007 else \
1008 tmp = doevery; \
1009 } \
1010 else \
1011 tmp = 1; \
1012)
1013
1014#define REXEC_FBC_TRYIT \
1015if ((!reginfo || regtry(reginfo, s))) \
1016 goto got_it
1017
1018#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1019 if (do_utf8) { \
1020 UtFpReLoAd; \
1021 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1022 } \
1023 else { \
1024 REXEC_FBC_CLASS_SCAN(CoNd); \
1025 } \
1026 break
1027
1028#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1029 PL_reg_flags |= RF_tainted; \
1030 if (do_utf8) { \
1031 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1032 } \
1033 else { \
1034 REXEC_FBC_CLASS_SCAN(CoNd); \
1035 } \
1036 break
1037
786e8c11
YO
1038#define DUMP_EXEC_POS(li,s,doutf8) \
1039 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1040
1041/* We know what class REx starts with. Try to find this position... */
1042/* if reginfo is NULL, its a dryrun */
1043/* annoyingly all the vars in this routine have different names from their counterparts
1044 in regmatch. /grrr */
1045
3c3eec57 1046STATIC char *
07be1b83
YO
1047S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1048 const char *strend, const regmatch_info *reginfo)
a687059c 1049{
27da23d5 1050 dVAR;
1df70142 1051 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1052 char *m;
d8093b23 1053 STRLEN ln;
5dab1207 1054 STRLEN lnc;
078c425b 1055 register STRLEN uskip;
d8093b23
G
1056 unsigned int c1;
1057 unsigned int c2;
6eb5f6b9
JH
1058 char *e;
1059 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1060 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1061
6eb5f6b9
JH
1062 /* We know what class it must start with. */
1063 switch (OP(c)) {
6eb5f6b9 1064 case ANYOF:
388cc4de 1065 if (do_utf8) {
4cadc6a9 1066 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1067 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1068 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1069 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1070 }
1071 else {
1072 while (s < strend) {
1073 STRLEN skip = 1;
1074
32fc9b6a 1075 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1076 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1077 /* The assignment of 2 is intentional:
1078 * for the folded sharp s, the skip is 2. */
1079 (skip = SHARP_S_SKIP))) {
3b0527fe 1080 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1081 goto got_it;
1082 else
1083 tmp = doevery;
1084 }
1085 else
1086 tmp = 1;
1087 s += skip;
1088 }
a0d0e21e 1089 }
6eb5f6b9 1090 break;
f33976b4 1091 case CANY:
4cadc6a9 1092 REXEC_FBC_SCAN(
3b0527fe 1093 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1094 goto got_it;
1095 else
1096 tmp = doevery;
4cadc6a9 1097 );
f33976b4 1098 break;
6eb5f6b9 1099 case EXACTF:
5dab1207
NIS
1100 m = STRING(c);
1101 ln = STR_LEN(c); /* length to match in octets/bytes */
1102 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1103 if (UTF) {
a2a2844f 1104 STRLEN ulen1, ulen2;
5dab1207 1105 U8 *sm = (U8 *) m;
89ebb4a3
JH
1106 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1107 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1108 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1109
1110 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1111 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1112
89ebb4a3 1113 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1114 0, uniflags);
89ebb4a3 1115 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1116 0, uniflags);
5dab1207
NIS
1117 lnc = 0;
1118 while (sm < ((U8 *) m + ln)) {
1119 lnc++;
1120 sm += UTF8SKIP(sm);
1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 c1 = *(U8*)m;
1125 c2 = PL_fold[c1];
1126 }
6eb5f6b9
JH
1127 goto do_exactf;
1128 case EXACTFL:
5dab1207
NIS
1129 m = STRING(c);
1130 ln = STR_LEN(c);
1131 lnc = (I32) ln;
d8093b23 1132 c1 = *(U8*)m;
6eb5f6b9
JH
1133 c2 = PL_fold_locale[c1];
1134 do_exactf:
db12adc6 1135 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1136
3b0527fe 1137 if (!reginfo && e < s)
6eb5f6b9 1138 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1139
60a8b682
JH
1140 /* The idea in the EXACTF* cases is to first find the
1141 * first character of the EXACTF* node and then, if
1142 * necessary, case-insensitively compare the full
1143 * text of the node. The c1 and c2 are the first
1144 * characters (though in Unicode it gets a bit
1145 * more complicated because there are more cases
7f16dd3d
JH
1146 * than just upper and lower: one needs to use
1147 * the so-called folding case for case-insensitive
1148 * matching (called "loose matching" in Unicode).
1149 * ibcmp_utf8() will do just that. */
60a8b682 1150
1aa99e6b 1151 if (do_utf8) {
575cac57 1152 UV c, f;
89ebb4a3 1153 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1154 STRLEN len, foldlen;
4ad0818d 1155 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1156 if (c1 == c2) {
5dab1207
NIS
1157 /* Upper and lower of 1st char are equal -
1158 * probably not a "letter". */
1aa99e6b 1159 while (s <= e) {
89ebb4a3 1160 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1161 uniflags);
4cadc6a9 1162 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1163 }
09091399
JH
1164 }
1165 else {
1aa99e6b 1166 while (s <= e) {
89ebb4a3 1167 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1168 uniflags);
80aecb99 1169
60a8b682 1170 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1171 * Note that not all the possible combinations
1172 * are handled here: some of them are handled
1173 * by the standard folding rules, and some of
1174 * them (the character class or ANYOF cases)
1175 * are handled during compiletime in
1176 * regexec.c:S_regclass(). */
880bd946
JH
1177 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1178 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1179 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1180
4cadc6a9 1181 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1182 }
09091399 1183 }
1aa99e6b
IH
1184 }
1185 else {
1186 if (c1 == c2)
4cadc6a9 1187 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1188 else
4cadc6a9 1189 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1190 }
1191 break;
bbce6d69 1192 case BOUNDL:
3280af22 1193 PL_reg_flags |= RF_tainted;
bbce6d69 1194 /* FALL THROUGH */
a0d0e21e 1195 case BOUND:
ffc61ed2 1196 if (do_utf8) {
12d33761 1197 if (s == PL_bostr)
ffc61ed2
JH
1198 tmp = '\n';
1199 else {
6136c704 1200 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1201 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1202 }
1203 tmp = ((OP(c) == BOUND ?
9041c2e3 1204 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1205 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1206 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1207 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1208 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1209 isALNUM_LC_utf8((U8*)s)))
1210 {
1211 tmp = !tmp;
4cadc6a9 1212 REXEC_FBC_TRYIT;
a687059c 1213 }
4cadc6a9 1214 );
a0d0e21e 1215 }
667bb95a 1216 else {
12d33761 1217 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1218 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1219 REXEC_FBC_SCAN(
ffc61ed2
JH
1220 if (tmp ==
1221 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1222 tmp = !tmp;
4cadc6a9 1223 REXEC_FBC_TRYIT;
a0ed51b3 1224 }
4cadc6a9 1225 );
a0ed51b3 1226 }
3b0527fe 1227 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1228 goto got_it;
1229 break;
bbce6d69 1230 case NBOUNDL:
3280af22 1231 PL_reg_flags |= RF_tainted;
bbce6d69 1232 /* FALL THROUGH */
a0d0e21e 1233 case NBOUND:
ffc61ed2 1234 if (do_utf8) {
12d33761 1235 if (s == PL_bostr)
ffc61ed2
JH
1236 tmp = '\n';
1237 else {
6136c704 1238 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1239 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1240 }
1241 tmp = ((OP(c) == NBOUND ?
9041c2e3 1242 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1243 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1244 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1245 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1246 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1247 isALNUM_LC_utf8((U8*)s)))
1248 tmp = !tmp;
4cadc6a9
YO
1249 else REXEC_FBC_TRYIT;
1250 );
a0d0e21e 1251 }
667bb95a 1252 else {
12d33761 1253 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1254 tmp = ((OP(c) == NBOUND ?
1255 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1256 REXEC_FBC_SCAN(
ffc61ed2
JH
1257 if (tmp ==
1258 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1259 tmp = !tmp;
4cadc6a9
YO
1260 else REXEC_FBC_TRYIT;
1261 );
a0ed51b3 1262 }
3b0527fe 1263 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1264 goto got_it;
1265 break;
a0d0e21e 1266 case ALNUM:
4cadc6a9
YO
1267 REXEC_FBC_CSCAN_PRELOAD(
1268 LOAD_UTF8_CHARCLASS_ALNUM(),
1269 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1270 isALNUM(*s)
1271 );
bbce6d69 1272 case ALNUML:
4cadc6a9
YO
1273 REXEC_FBC_CSCAN_TAINT(
1274 isALNUM_LC_utf8((U8*)s),
1275 isALNUM_LC(*s)
1276 );
a0d0e21e 1277 case NALNUM:
4cadc6a9
YO
1278 REXEC_FBC_CSCAN_PRELOAD(
1279 LOAD_UTF8_CHARCLASS_ALNUM(),
1280 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1281 !isALNUM(*s)
1282 );
bbce6d69 1283 case NALNUML:
4cadc6a9
YO
1284 REXEC_FBC_CSCAN_TAINT(
1285 !isALNUM_LC_utf8((U8*)s),
1286 !isALNUM_LC(*s)
1287 );
a0d0e21e 1288 case SPACE:
4cadc6a9
YO
1289 REXEC_FBC_CSCAN_PRELOAD(
1290 LOAD_UTF8_CHARCLASS_SPACE(),
1291 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1292 isSPACE(*s)
1293 );
bbce6d69 1294 case SPACEL:
4cadc6a9
YO
1295 REXEC_FBC_CSCAN_TAINT(
1296 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1297 isSPACE_LC(*s)
1298 );
a0d0e21e 1299 case NSPACE:
4cadc6a9
YO
1300 REXEC_FBC_CSCAN_PRELOAD(
1301 LOAD_UTF8_CHARCLASS_SPACE(),
1302 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1303 !isSPACE(*s)
1304 );
bbce6d69 1305 case NSPACEL:
4cadc6a9
YO
1306 REXEC_FBC_CSCAN_TAINT(
1307 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1308 !isSPACE_LC(*s)
1309 );
a0d0e21e 1310 case DIGIT:
4cadc6a9
YO
1311 REXEC_FBC_CSCAN_PRELOAD(
1312 LOAD_UTF8_CHARCLASS_DIGIT(),
1313 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1314 isDIGIT(*s)
1315 );
b8c5462f 1316 case DIGITL:
4cadc6a9
YO
1317 REXEC_FBC_CSCAN_TAINT(
1318 isDIGIT_LC_utf8((U8*)s),
1319 isDIGIT_LC(*s)
1320 );
a0d0e21e 1321 case NDIGIT:
4cadc6a9
YO
1322 REXEC_FBC_CSCAN_PRELOAD(
1323 LOAD_UTF8_CHARCLASS_DIGIT(),
1324 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1325 !isDIGIT(*s)
1326 );
b8c5462f 1327 case NDIGITL:
4cadc6a9
YO
1328 REXEC_FBC_CSCAN_TAINT(
1329 !isDIGIT_LC_utf8((U8*)s),
1330 !isDIGIT_LC(*s)
1331 );
786e8c11 1332 case TRIEC:
07be1b83 1333 case TRIE:
07be1b83
YO
1334 {
1335 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1336 trie_type = do_utf8 ?
1337 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1338 : trie_plain;
1339 /* what trie are we using right now */
1340 reg_ac_data *aho
1341 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1342 reg_trie_data *trie=aho->trie;
1343
1344 const char *last_start = strend - trie->minlen;
6148ee25 1345#ifdef DEBUGGING
07be1b83 1346 const char *real_start = s;
6148ee25 1347#endif
07be1b83 1348 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1349 SV *sv_points;
1350 U8 **points; /* map of where we were in the input string
786e8c11 1351 when reading a given char. For ASCII this
be8e71aa
YO
1352 is unnecessary overhead as the relationship
1353 is always 1:1, but for unicode, especially
1354 case folded unicode this is not true. */
f9e705e8 1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1356 U8 *bitmap=NULL;
1357
07be1b83
YO
1358
1359 GET_RE_DEBUG_FLAGS_DECL;
1360
be8e71aa
YO
1361 /* We can't just allocate points here. We need to wrap it in
1362 * an SV so it gets freed properly if there is a croak while
1363 * running the match */
1364 ENTER;
1365 SAVETMPS;
1366 sv_points=newSV(maxlen * sizeof(U8 *));
1367 SvCUR_set(sv_points,
1368 maxlen * sizeof(U8 *));
1369 SvPOK_on(sv_points);
1370 sv_2mortal(sv_points);
1371 points=(U8**)SvPV_nolen(sv_points );
786e8c11
YO
1372 if ( trie_type != trie_utf8_fold && (trie->bitmap || OP(c)==TRIEC) ) {
1373 if (trie->bitmap)
1374 bitmap=(U8*)trie->bitmap;
1375 else
1376 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1377 }
786e8c11
YO
1378 /* this is the Aho-Corasick algorithm modified a touch
1379 to include special handling for long "unknown char"
1380 sequences. The basic idea being that we use AC as long
1381 as we are dealing with a possible matching char, when
1382 we encounter an unknown char (and we have not encountered
1383 an accepting state) we scan forward until we find a legal
1384 starting char.
1385 AC matching is basically that of trie matching, except
1386 that when we encounter a failing transition, we fall back
1387 to the current states "fail state", and try the current char
1388 again, a process we repeat until we reach the root state,
1389 state 1, or a legal transition. If we fail on the root state
1390 then we can either terminate if we have reached an accepting
1391 state previously, or restart the entire process from the beginning
1392 if we have not.
1393
1394 */
07be1b83
YO
1395 while (s <= last_start) {
1396 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1397 U8 *uc = (U8*)s;
1398 U16 charid = 0;
1399 U32 base = 1;
1400 U32 state = 1;
1401 UV uvc = 0;
1402 STRLEN len = 0;
1403 STRLEN foldlen = 0;
1404 U8 *uscan = (U8*)NULL;
1405 U8 *leftmost = NULL;
786e8c11
YO
1406#ifdef DEBUGGING
1407 U32 accepted_word= 0;
1408#endif
07be1b83
YO
1409 U32 pointpos = 0;
1410
1411 while ( state && uc <= (U8*)strend ) {
1412 int failed=0;
786e8c11
YO
1413 U32 word = aho->states[ state ].wordnum;
1414
1415 if( state==1 && bitmap ) {
1416 DEBUG_TRIE_EXECUTE_r(
1417 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1418 dump_exec_pos( (char *)uc, c, strend, real_start,
85c3142d 1419 (char*)uc, do_utf8 );
786e8c11
YO
1420 PerlIO_printf( Perl_debug_log,
1421 " Scanning for legal start char...\n");
1422 }
1423 );
1424 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1425 uc++;
1426 }
1427 s= (char *)uc;
1428 if (uc >(U8*)last_start) break;
1429 }
1430
1431 if ( word ) {
1432 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1433 if (!leftmost || lpos < leftmost) {
1434 DEBUG_r(accepted_word=word);
07be1b83 1435 leftmost= lpos;
786e8c11 1436 }
07be1b83 1437 if (base==0) break;
786e8c11 1438
07be1b83
YO
1439 }
1440 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1441 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1442 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1443 DEBUG_TRIE_EXECUTE_r({
1444 dump_exec_pos( (char *)uc, c, strend, real_start,
1445 s, do_utf8 );
07be1b83 1446 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1447 " Charid:%3u CP:%4"UVxf" ",
1448 charid, uvc);
1449 });
07be1b83
YO
1450
1451 do {
6148ee25 1452#ifdef DEBUGGING
786e8c11 1453 word = aho->states[ state ].wordnum;
6148ee25 1454#endif
07be1b83
YO
1455 base = aho->states[ state ].trans.base;
1456
786e8c11
YO
1457 DEBUG_TRIE_EXECUTE_r({
1458 if (failed)
1459 dump_exec_pos( (char *)uc, c, strend, real_start,
1460 s, do_utf8 );
07be1b83 1461 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1462 "%sState: %4"UVxf", word=%"UVxf,
1463 failed ? " Fail transition to " : "",
1464 (UV)state, (UV)word);
1465 });
07be1b83
YO
1466 if ( base ) {
1467 U32 tmp;
1468 if (charid &&
1469 (base + charid > trie->uniquecharcount )
1470 && (base + charid - 1 - trie->uniquecharcount
1471 < trie->lasttrans)
1472 && trie->trans[base + charid - 1 -
1473 trie->uniquecharcount].check == state
1474 && (tmp=trie->trans[base + charid - 1 -
1475 trie->uniquecharcount ].next))
1476 {
786e8c11
YO
1477 DEBUG_TRIE_EXECUTE_r(
1478 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1479 state = tmp;
1480 break;
1481 }
1482 else {
786e8c11
YO
1483 DEBUG_TRIE_EXECUTE_r(
1484 PerlIO_printf( Perl_debug_log," - fail\n"));
1485 failed = 1;
1486 state = aho->fail[state];
07be1b83
YO
1487 }
1488 }
1489 else {
1490 /* we must be accepting here */
786e8c11
YO
1491 DEBUG_TRIE_EXECUTE_r(
1492 PerlIO_printf( Perl_debug_log," - accepting\n"));
1493 failed = 1;
07be1b83
YO
1494 break;
1495 }
1496 } while(state);
786e8c11 1497 uc += len;
07be1b83
YO
1498 if (failed) {
1499 if (leftmost)
1500 break;
786e8c11 1501 if (!state) state = 1;
07be1b83
YO
1502 }
1503 }
1504 if ( aho->states[ state ].wordnum ) {
1505 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1506 if (!leftmost || lpos < leftmost) {
1507 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1508 leftmost = lpos;
786e8c11 1509 }
07be1b83 1510 }
07be1b83
YO
1511 if (leftmost) {
1512 s = (char*)leftmost;
786e8c11
YO
1513 DEBUG_TRIE_EXECUTE_r({
1514 PerlIO_printf(
1515 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1516 (UV)accepted_word, s - real_start
1517 );
1518 });
be8e71aa
YO
1519 if (!reginfo || regtry(reginfo, s)) {
1520 FREETMPS;
1521 LEAVE;
07be1b83 1522 goto got_it;
be8e71aa 1523 }
07be1b83 1524 s = HOPc(s,1);
786e8c11
YO
1525 DEBUG_TRIE_EXECUTE_r({
1526 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1527 });
07be1b83 1528 } else {
786e8c11
YO
1529 DEBUG_TRIE_EXECUTE_r(
1530 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1531 break;
1532 }
1533 }
be8e71aa
YO
1534 FREETMPS;
1535 LEAVE;
07be1b83
YO
1536 }
1537 break;
b3c9acc1 1538 default:
3c3eec57
GS
1539 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1540 break;
d6a28714 1541 }
6eb5f6b9
JH
1542 return 0;
1543 got_it:
1544 return s;
1545}
1546
1547/*
1548 - regexec_flags - match a regexp against a string
1549 */
1550I32
1551Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1552 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1553/* strend: pointer to null at end of string */
1554/* strbeg: real beginning of string */
1555/* minend: end of match must be >=minend after stringarg. */
1556/* data: May be used for some additional optimizations. */
1557/* nosave: For optimizations. */
1558{
97aff369 1559 dVAR;
6eb5f6b9
JH
1560 register char *s;
1561 register regnode *c;
1562 register char *startpos = stringarg;
6eb5f6b9
JH
1563 I32 minlen; /* must match at least this many chars */
1564 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1565 I32 end_shift = 0; /* Same for the end. */ /* CC */
1566 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1567 char *scream_olds = NULL;
3dab1dad 1568 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1569 const bool do_utf8 = DO_UTF8(sv);
2757e526 1570 I32 multiline;
0df25f3d 1571
3b0527fe 1572 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1573
1574 GET_RE_DEBUG_FLAGS_DECL;
1575
9d4ba2ae 1576 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1577
1578 /* Be paranoid... */
1579 if (prog == NULL || startpos == NULL) {
1580 Perl_croak(aTHX_ "NULL regexp parameter");
1581 return 0;
1582 }
1583
2757e526 1584 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1585 reginfo.prog = prog;
2757e526 1586
bac06658
JH
1587 RX_MATCH_UTF8_set(prog, do_utf8);
1588
6eb5f6b9 1589 minlen = prog->minlen;
61a36c01 1590 if (strend - startpos < minlen) {
a3621e74 1591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1592 "String too short [regexec_flags]...\n"));
1593 goto phooey;
1aa99e6b 1594 }
6eb5f6b9 1595
6eb5f6b9
JH
1596 /* Check validity of program. */
1597 if (UCHARAT(prog->program) != REG_MAGIC) {
1598 Perl_croak(aTHX_ "corrupted regexp program");
1599 }
1600
1601 PL_reg_flags = 0;
1602 PL_reg_eval_set = 0;
1603 PL_reg_maxiter = 0;
1604
1605 if (prog->reganch & ROPT_UTF8)
1606 PL_reg_flags |= RF_utf8;
1607
1608 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1609 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1610 PL_bostr = strbeg;
3b0527fe 1611 reginfo.sv = sv;
6eb5f6b9
JH
1612
1613 /* Mark end of line for $ (and such) */
1614 PL_regeol = strend;
1615
1616 /* see how far we have to get to not match where we matched before */
3b0527fe 1617 reginfo.till = startpos+minend;
6eb5f6b9 1618
6eb5f6b9
JH
1619 /* If there is a "must appear" string, look for it. */
1620 s = startpos;
1621
3b0527fe 1622 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1623 MAGIC *mg;
1624
1625 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1626 reginfo.ganch = startpos;
6eb5f6b9
JH
1627 else if (sv && SvTYPE(sv) >= SVt_PVMG
1628 && SvMAGIC(sv)
14befaf4
DM
1629 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1630 && mg->mg_len >= 0) {
3b0527fe 1631 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1632 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1633 if (s > reginfo.ganch)
6eb5f6b9 1634 goto phooey;
3b0527fe 1635 s = reginfo.ganch;
6eb5f6b9
JH
1636 }
1637 }
1638 else /* pos() not defined */
3b0527fe 1639 reginfo.ganch = strbeg;
6eb5f6b9
JH
1640 }
1641
a0714e2c 1642 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1643 re_scream_pos_data d;
1644
1645 d.scream_olds = &scream_olds;
1646 d.scream_pos = &scream_pos;
1647 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1648 if (!s) {
a3621e74 1649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1650 goto phooey; /* not present */
3fa9c3d7 1651 }
6eb5f6b9
JH
1652 }
1653
ab3bbdeb
YO
1654 DEBUG_EXECUTE_r(
1655 debug_start_match(prog, do_utf8, startpos, strend,
1656 "Matching");
2a782b5b 1657 );
6eb5f6b9
JH
1658
1659 /* Simplest case: anchored match need be tried only once. */
1660 /* [unless only anchor is BOL and multiline is set] */
1661 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1662 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1663 goto got_it;
7fba1cd6 1664 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1665 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1666 {
1667 char *end;
1668
1669 if (minlen)
1670 dontbother = minlen - 1;
1aa99e6b 1671 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1672 /* for multiline we only have to try after newlines */
33b8afdf 1673 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1674 if (s == startpos)
1675 goto after_try;
1676 while (1) {
3b0527fe 1677 if (regtry(&reginfo, s))
6eb5f6b9
JH
1678 goto got_it;
1679 after_try:
1680 if (s >= end)
1681 goto phooey;
1682 if (prog->reganch & RE_USE_INTUIT) {
1683 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1684 if (!s)
1685 goto phooey;
1686 }
1687 else
1688 s++;
1689 }
1690 } else {
1691 if (s > startpos)
1692 s--;
1693 while (s < end) {
1694 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1695 if (regtry(&reginfo, s))
6eb5f6b9
JH
1696 goto got_it;
1697 }
1698 }
1699 }
1700 }
1701 goto phooey;
1702 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1703 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1704 goto got_it;
1705 goto phooey;
1706 }
1707
1708 /* Messy cases: unanchored match. */
33b8afdf 1709 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1710 /* we have /x+whatever/ */
1711 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1712 char ch;
bf93d4cc
GS
1713#ifdef DEBUGGING
1714 int did_match = 0;
1715#endif
33b8afdf
JH
1716 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1717 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1718 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1719
1aa99e6b 1720 if (do_utf8) {
4cadc6a9 1721 REXEC_FBC_SCAN(
6eb5f6b9 1722 if (*s == ch) {
a3621e74 1723 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1724 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1725 s += UTF8SKIP(s);
1726 while (s < strend && *s == ch)
1727 s += UTF8SKIP(s);
1728 }
4cadc6a9 1729 );
6eb5f6b9
JH
1730 }
1731 else {
4cadc6a9 1732 REXEC_FBC_SCAN(
6eb5f6b9 1733 if (*s == ch) {
a3621e74 1734 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1735 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1736 s++;
1737 while (s < strend && *s == ch)
1738 s++;
1739 }
4cadc6a9 1740 );
6eb5f6b9 1741 }
a3621e74 1742 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1743 PerlIO_printf(Perl_debug_log,
b7953727
JH
1744 "Did not find anchored character...\n")
1745 );
6eb5f6b9 1746 }
a0714e2c
SS
1747 else if (prog->anchored_substr != NULL
1748 || prog->anchored_utf8 != NULL
1749 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1750 && prog->float_max_offset < strend - s)) {
1751 SV *must;
1752 I32 back_max;
1753 I32 back_min;
1754 char *last;
6eb5f6b9 1755 char *last1; /* Last position checked before */
bf93d4cc
GS
1756#ifdef DEBUGGING
1757 int did_match = 0;
1758#endif
33b8afdf
JH
1759 if (prog->anchored_substr || prog->anchored_utf8) {
1760 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1761 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1762 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1763 back_max = back_min = prog->anchored_offset;
1764 } else {
1765 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1766 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1767 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1768 back_max = prog->float_max_offset;
1769 back_min = prog->float_min_offset;
1770 }
1771 if (must == &PL_sv_undef)
1772 /* could not downgrade utf8 check substring, so must fail */
1773 goto phooey;
1774
1775 last = HOP3c(strend, /* Cannot start after this */
1776 -(I32)(CHR_SVLEN(must)
1777 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1778
1779 if (s > PL_bostr)
1780 last1 = HOPc(s, -1);
1781 else
1782 last1 = s - 1; /* bogus */
1783
a0288114 1784 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1785 check_substr==must. */
1786 scream_pos = -1;
1787 dontbother = end_shift;
1788 strend = HOPc(strend, -dontbother);
1789 while ( (s <= last) &&
9041c2e3 1790 ((flags & REXEC_SCREAM)
1aa99e6b 1791 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1792 end_shift, &scream_pos, 0))
1aa99e6b 1793 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1794 (unsigned char*)strend, must,
7fba1cd6 1795 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1796 /* we may be pointing at the wrong string */
1797 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1798 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1799 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1800 if (HOPc(s, -back_max) > last1) {
1801 last1 = HOPc(s, -back_min);
1802 s = HOPc(s, -back_max);
1803 }
1804 else {
52657f30 1805 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1806
1807 last1 = HOPc(s, -back_min);
52657f30 1808 s = t;
6eb5f6b9 1809 }
1aa99e6b 1810 if (do_utf8) {
6eb5f6b9 1811 while (s <= last1) {
3b0527fe 1812 if (regtry(&reginfo, s))
6eb5f6b9
JH
1813 goto got_it;
1814 s += UTF8SKIP(s);
1815 }
1816 }
1817 else {
1818 while (s <= last1) {
3b0527fe 1819 if (regtry(&reginfo, s))
6eb5f6b9
JH
1820 goto got_it;
1821 s++;
1822 }
1823 }
1824 }
ab3bbdeb
YO
1825 DEBUG_EXECUTE_r(if (!did_match) {
1826 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1827 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1828 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1829 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1830 ? "anchored" : "floating"),
ab3bbdeb
YO
1831 quoted, RE_SV_TAIL(must));
1832 });
6eb5f6b9
JH
1833 goto phooey;
1834 }
786e8c11 1835 else if ( (c = prog->regstclass) ) {
f14c76ed 1836 if (minlen) {
be8e71aa 1837 const OPCODE op = OP(prog->regstclass);
66e933ab 1838 /* don't bother with what can't match */
786e8c11 1839 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1840 strend = HOPc(strend, -(minlen - 1));
1841 }
a3621e74 1842 DEBUG_EXECUTE_r({
be8e71aa 1843 SV * const prop = sv_newmortal();
32fc9b6a 1844 regprop(prog, prop, c);
0df25f3d 1845 {
ab3bbdeb
YO
1846 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1847 s,strend-s,60);
0df25f3d 1848 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1849 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1850 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1851 quoted, (int)(strend - s));
0df25f3d 1852 }
ffc61ed2 1853 });
3b0527fe 1854 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1855 goto got_it;
07be1b83 1856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1857 }
1858 else {
1859 dontbother = 0;
a0714e2c 1860 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1861 /* Trim the end. */
d6a28714 1862 char *last;
33b8afdf
JH
1863 SV* float_real;
1864
1865 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1868
1869 if (flags & REXEC_SCREAM) {
33b8afdf 1870 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1871 end_shift, &scream_pos, 1); /* last one */
1872 if (!last)
ffc61ed2 1873 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1874 /* we may be pointing at the wrong string */
1875 else if (RX_MATCH_COPIED(prog))
3f7c398e 1876 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1877 }
d6a28714
JH
1878 else {
1879 STRLEN len;
cfd0369c 1880 const char * const little = SvPV_const(float_real, len);
d6a28714 1881
33b8afdf 1882 if (SvTAIL(float_real)) {
d6a28714
JH
1883 if (memEQ(strend - len + 1, little, len - 1))
1884 last = strend - len + 1;
7fba1cd6 1885 else if (!multiline)
9041c2e3 1886 last = memEQ(strend - len, little, len)
bd61b366 1887 ? strend - len : NULL;
b8c5462f 1888 else
d6a28714
JH
1889 goto find_last;
1890 } else {
1891 find_last:
9041c2e3 1892 if (len)
d6a28714 1893 last = rninstr(s, strend, little, little + len);
b8c5462f 1894 else
a0288114 1895 last = strend; /* matching "$" */
b8c5462f 1896 }
b8c5462f 1897 }
bf93d4cc 1898 if (last == NULL) {
a3621e74 1899 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1900 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1901 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1902 goto phooey; /* Should not happen! */
1903 }
d6a28714
JH
1904 dontbother = strend - last + prog->float_min_offset;
1905 }
1906 if (minlen && (dontbother < minlen))
1907 dontbother = minlen - 1;
1908 strend -= dontbother; /* this one's always in bytes! */
1909 /* We don't know much -- general case. */
1aa99e6b 1910 if (do_utf8) {
d6a28714 1911 for (;;) {
3b0527fe 1912 if (regtry(&reginfo, s))
d6a28714
JH
1913 goto got_it;
1914 if (s >= strend)
1915 break;
b8c5462f 1916 s += UTF8SKIP(s);
d6a28714
JH
1917 };
1918 }
1919 else {
1920 do {
3b0527fe 1921 if (regtry(&reginfo, s))
d6a28714
JH
1922 goto got_it;
1923 } while (s++ < strend);
1924 }
1925 }
1926
1927 /* Failure. */
1928 goto phooey;
1929
1930got_it:
1931 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1932
1933 if (PL_reg_eval_set) {
1934 /* Preserve the current value of $^R */
1935 if (oreplsv != GvSV(PL_replgv))
1936 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1937 restored, the value remains
1938 the same. */
4f639d21 1939 restore_pos(aTHX_ prog);
d6a28714
JH
1940 }
1941
1942 /* make sure $`, $&, $', and $digit will work later */
1943 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 1944 RX_MATCH_COPY_FREE(prog);
d6a28714 1945 if (flags & REXEC_COPY_STR) {
be8e71aa 1946 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 1947#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
1948 if ((SvIsCOW(sv)
1949 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1950 if (DEBUG_C_TEST) {
1951 PerlIO_printf(Perl_debug_log,
1952 "Copy on write: regexp capture, type %d\n",
1953 (int) SvTYPE(sv));
1954 }
1955 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 1956 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
1957 assert (SvPOKp(prog->saved_copy));
1958 } else
1959#endif
1960 {
1961 RX_MATCH_COPIED_on(prog);
1962 s = savepvn(strbeg, i);
1963 prog->subbeg = s;
1964 }
d6a28714 1965 prog->sublen = i;
d6a28714
JH
1966 }
1967 else {
1968 prog->subbeg = strbeg;
1969 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1970 }
1971 }
9041c2e3 1972
d6a28714
JH
1973 return 1;
1974
1975phooey:
a3621e74 1976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 1977 PL_colors[4], PL_colors[5]));
d6a28714 1978 if (PL_reg_eval_set)
4f639d21 1979 restore_pos(aTHX_ prog);
d6a28714
JH
1980 return 0;
1981}
1982
1983/*
1984 - regtry - try match at specific point
1985 */
1986STATIC I32 /* 0 failure, 1 success */
3b0527fe 1987S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 1988{
97aff369 1989 dVAR;
d6a28714
JH
1990 register I32 *sp;
1991 register I32 *ep;
1992 CHECKPOINT lastcp;
3b0527fe 1993 regexp *prog = reginfo->prog;
a3621e74 1994 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 1995
02db2b7b
IZ
1996#ifdef DEBUGGING
1997 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1998#endif
d6a28714
JH
1999 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2000 MAGIC *mg;
2001
2002 PL_reg_eval_set = RS_init;
a3621e74 2003 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2004 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2005 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2006 ));
e8347627 2007 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2008 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2009 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2010 SAVETMPS;
2011 /* Apparently this is not needed, judging by wantarray. */
e8347627 2012 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2013 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2014
3b0527fe 2015 if (reginfo->sv) {
d6a28714 2016 /* Make $_ available to executed code. */
3b0527fe 2017 if (reginfo->sv != DEFSV) {
59f00321 2018 SAVE_DEFSV;
3b0527fe 2019 DEFSV = reginfo->sv;
b8c5462f 2020 }
d6a28714 2021
3b0527fe
DM
2022 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2023 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2024 /* prepare for quick setting of pos */
d300d9fa
NC
2025#ifdef PERL_OLD_COPY_ON_WRITE
2026 if (SvIsCOW(sv))
2027 sv_force_normal_flags(sv, 0);
2028#endif
3dab1dad 2029 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2030 &PL_vtbl_mglob, NULL, 0);
d6a28714 2031 mg->mg_len = -1;
b8c5462f 2032 }
d6a28714
JH
2033 PL_reg_magic = mg;
2034 PL_reg_oldpos = mg->mg_len;
4f639d21 2035 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2036 }
09687e5a 2037 if (!PL_reg_curpm) {
a02a5408 2038 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2039#ifdef USE_ITHREADS
2040 {
be8e71aa 2041 SV* const repointer = newSViv(0);
577e12cc 2042 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2043 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2044 av_push(PL_regex_padav,repointer);
2045 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2046 PL_regex_pad = AvARRAY(PL_regex_padav);
2047 }
2048#endif
2049 }
aaa362c4 2050 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2051 PL_reg_oldcurpm = PL_curpm;
2052 PL_curpm = PL_reg_curpm;
2053 if (RX_MATCH_COPIED(prog)) {
2054 /* Here is a serious problem: we cannot rewrite subbeg,
2055 since it may be needed if this match fails. Thus
2056 $` inside (?{}) could fail... */
2057 PL_reg_oldsaved = prog->subbeg;
2058 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2059#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2060 PL_nrs = prog->saved_copy;
2061#endif
d6a28714
JH
2062 RX_MATCH_COPIED_off(prog);
2063 }
2064 else
bd61b366 2065 PL_reg_oldsaved = NULL;
d6a28714
JH
2066 prog->subbeg = PL_bostr;
2067 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2068 }
973dddac 2069 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2070 PL_reginput = startpos;
2071 PL_regstartp = prog->startp;
2072 PL_regendp = prog->endp;
2073 PL_reglastparen = &prog->lastparen;
a01268b5 2074 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2075 prog->lastparen = 0;
03994de8 2076 prog->lastcloseparen = 0;
d6a28714 2077 PL_regsize = 0;
a3621e74 2078 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2079 if (PL_reg_start_tmpl <= prog->nparens) {
2080 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2081 if(PL_reg_start_tmp)
2082 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2083 else
a02a5408 2084 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2085 }
2086
2087 /* XXXX What this code is doing here?!!! There should be no need
2088 to do this again and again, PL_reglastparen should take care of
3dd2943c 2089 this! --ilya*/
dafc8851
JH
2090
2091 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2092 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2093 * PL_reglastparen), is not needed at all by the test suite
2094 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2095 * enough, for building DynaLoader, or otherwise this
2096 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2097 * will happen. Meanwhile, this code *is* needed for the
2098 * above-mentioned test suite tests to succeed. The common theme
2099 * on those tests seems to be returning null fields from matches.
2100 * --jhi */
dafc8851 2101#if 1
d6a28714
JH
2102 sp = prog->startp;
2103 ep = prog->endp;
2104 if (prog->nparens) {
097eb12c 2105 register I32 i;
eb160463 2106 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2107 *++sp = -1;
2108 *++ep = -1;
2109 }
2110 }
dafc8851 2111#endif
02db2b7b 2112 REGCP_SET(lastcp);
3b0527fe 2113 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2114 prog->endp[0] = PL_reginput - PL_bostr;
2115 return 1;
2116 }
02db2b7b 2117 REGCP_UNWIND(lastcp);
d6a28714
JH
2118 return 0;
2119}
2120
02db2b7b 2121
8ba1375e
MJD
2122#define sayYES goto yes
2123#define sayNO goto no
e0f9d4a8 2124#define sayNO_ANYOF goto no_anyof
8ba1375e 2125#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2126#define sayNO_FINAL goto no_final
2127#define sayNO_SILENT goto do_no
2128#define saySAME(x) if (x) goto yes; else goto no
2129
3ab3c9b4 2130#define CACHEsayNO STMT_START { \
3298f257
DM
2131 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2132 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2133 (1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2134 sayNO; \
2135} STMT_END
2136
3298f257 2137
a3621e74
YO
2138/* this is used to determine how far from the left messages like
2139 'failed...' are printed. Currently 29 makes these messages line
2140 up with the opcode they refer to. Earlier perls used 25 which
2141 left these messages outdented making reviewing a debug output
2142 quite difficult.
2143*/
2144#define REPORT_CODE_OFF 29
2145
2146
2147/* Make sure there is a test for this +1 options in re_tests */
2148#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2149
40a82448
DM
2150#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2151#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2152
86545054
DM
2153#define SLAB_FIRST(s) (&(s)->states[0])
2154#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2155
5d9a96ca
DM
2156/* grab a new slab and return the first slot in it */
2157
2158STATIC regmatch_state *
2159S_push_slab(pTHX)
2160{
54df2634
NC
2161#if PERL_VERSION < 9
2162 dMY_CXT;
2163#endif
5d9a96ca
DM
2164 regmatch_slab *s = PL_regmatch_slab->next;
2165 if (!s) {
2166 Newx(s, 1, regmatch_slab);
2167 s->prev = PL_regmatch_slab;
2168 s->next = NULL;
2169 PL_regmatch_slab->next = s;
2170 }
2171 PL_regmatch_slab = s;
86545054 2172 return SLAB_FIRST(s);
5d9a96ca 2173}
5b47454d 2174
95b24440
DM
2175/* simulate a recursive call to regmatch */
2176
2177#define REGMATCH(ns, where) \
5d9a96ca
DM
2178 st->scan = scan; \
2179 scan = (ns); \
2180 st->resume_state = resume_##where; \
95b24440
DM
2181 goto start_recurse; \
2182 resume_point_##where:
2183
40a82448
DM
2184/* push a new state then goto it */
2185
2186#define PUSH_STATE_GOTO(state, node) \
2187 scan = node; \
2188 st->resume_state = state; \
2189 goto push_state;
2190
2191/* push a new state with success backtracking, then goto it */
2192
2193#define PUSH_YES_STATE_GOTO(state, node) \
2194 scan = node; \
2195 st->resume_state = state; \
2196 goto push_yes_state;
2197
aa283a38 2198
aa283a38 2199
d6a28714
JH
2200/*
2201 - regmatch - main matching routine
2202 *
2203 * Conceptually the strategy is simple: check to see whether the current
2204 * node matches, call self recursively to see whether the rest matches,
2205 * and then act accordingly. In practice we make some effort to avoid
2206 * recursion, in particular by going through "ordinary" nodes (that don't
2207 * need to know whether the rest of the match failed) by a loop instead of
2208 * by recursion.
2209 */
2210/* [lwall] I've hoisted the register declarations to the outer block in order to
2211 * maybe save a little bit of pushing and popping on the stack. It also takes
2212 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2213 *
2214 * This function used to be heavily recursive, but since this had the
2215 * effect of blowing the CPU stack on complex regexes, it has been
2216 * restructured to be iterative, and to save state onto the heap rather
2217 * than the stack. Essentially whereever regmatch() used to be called, it
2218 * pushes the current state, notes where to return, then jumps back into
2219 * the main loop.
2220 *
2221 * Originally the structure of this function used to look something like
2222
2223 S_regmatch() {
2224 int a = 1, b = 2;
2225 ...
2226 while (scan != NULL) {
5d9a96ca 2227 a++; // do stuff with a and b
95b24440
DM
2228 ...
2229 switch (OP(scan)) {
2230 case FOO: {
2231 int local = 3;
2232 ...
2233 if (regmatch(...)) // recurse
2234 goto yes;
2235 }
2236 ...
2237 }
2238 }
2239 yes:
2240 return 1;
2241 }
2242
2243 * Now it looks something like this:
2244
5d9a96ca 2245 typedef struct {
95b24440
DM
2246 int a, b, local;
2247 int resume_state;
5d9a96ca 2248 } regmatch_state;
95b24440
DM
2249
2250 S_regmatch() {
5d9a96ca
DM
2251 regmatch_state *st = new();
2252 int depth=0;
2253 st->a++; // do stuff with a and b
95b24440
DM
2254 ...
2255 while (scan != NULL) {
2256 ...
2257 switch (OP(scan)) {
2258 case FOO: {
5d9a96ca 2259 st->local = 3;
95b24440 2260 ...
5d9a96ca
DM
2261 st->scan = scan;
2262 scan = ...;
2263 st->resume_state = resume_FOO;
2264 goto start_recurse; // recurse
95b24440 2265
5d9a96ca
DM
2266 resume_point_FOO:
2267 if (result)
95b24440
DM
2268 goto yes;
2269 }
2270 ...
2271 }
5d9a96ca
DM
2272 start_recurse:
2273 st = new(); push a new state
2274 st->a = 1; st->b = 2;
2275 depth++;
95b24440 2276 }
5d9a96ca 2277 yes:
95b24440 2278 result = 1;
5d9a96ca
DM
2279 if (depth--) {
2280 st = pop();
95b24440
DM
2281 switch (resume_state) {
2282 case resume_FOO:
2283 goto resume_point_FOO;
2284 ...
2285 }
2286 }
2287 return result
2288 }
2289
2290 * WARNING: this means that any line in this function that contains a
2291 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2292 * regmatch() using gotos instead. Thus the values of any local variables
2293 * not saved in the regmatch_state structure will have been lost when
2294 * execution resumes on the next line .
5d9a96ca
DM
2295 *
2296 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2297 * PL_regmatch_state always points to the currently active state, and
2298 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2299 * The first time regmatch is called, the first slab is allocated, and is
2300 * never freed until interpreter desctruction. When the slab is full,
2301 * a new one is allocated chained to the end. At exit from regmatch, slabs
2302 * allocated since entry are freed.
d6a28714 2303 */
95b24440 2304
40a82448 2305/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2306#define TRIE_next (REGNODE_MAX+1)
2307#define TRIE_next_fail (REGNODE_MAX+2)
2308#define EVAL_A (REGNODE_MAX+3)
2309#define EVAL_A_fail (REGNODE_MAX+4)
2310#define resume_CURLYX (REGNODE_MAX+5)
2311#define resume_WHILEM1 (REGNODE_MAX+6)
2312#define resume_WHILEM2 (REGNODE_MAX+7)
2313#define resume_WHILEM3 (REGNODE_MAX+8)
2314#define resume_WHILEM4 (REGNODE_MAX+9)
2315#define resume_WHILEM5 (REGNODE_MAX+10)
2316#define resume_WHILEM6 (REGNODE_MAX+11)
2317#define BRANCH_next (REGNODE_MAX+12)
2318#define BRANCH_next_fail (REGNODE_MAX+13)
2319#define CURLYM_A (REGNODE_MAX+14)
2320#define CURLYM_A_fail (REGNODE_MAX+15)
2321#define CURLYM_B (REGNODE_MAX+16)
2322#define CURLYM_B_fail (REGNODE_MAX+17)
2323#define IFMATCH_A (REGNODE_MAX+18)
2324#define IFMATCH_A_fail (REGNODE_MAX+19)
2325#define CURLY_B_min_known (REGNODE_MAX+20)
2326#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2327#define CURLY_B_min (REGNODE_MAX+22)
2328#define CURLY_B_min_fail (REGNODE_MAX+23)
2329#define CURLY_B_max (REGNODE_MAX+24)
2330#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2331
2332
3dab1dad 2333#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2334
3df15adc 2335#ifdef DEBUGGING
ab3bbdeb
YO
2336STATIC void
2337S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2338 const char *start, const char *end, const char *blurb)
2339{
2340 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2341 if (!PL_colorset)
2342 reginitcolors();
2343 {
2344 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2345 prog->precomp, prog->prelen, 60);
2346
2347 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2348 start, end - start, 60);
2349
2350 PerlIO_printf(Perl_debug_log,
2351 "%s%s REx%s %s against %s\n",
2352 PL_colors[4], blurb, PL_colors[5], s0, s1);
2353
2354 if (do_utf8||utf8_pat)
2355 PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
2356 !do_utf8 ? "pattern" : !utf8_pat ? "string" :
2357 "pattern and string"
2358 );
2359 }
2360}
3df15adc
YO
2361
2362STATIC void
786e8c11
YO
2363S_dump_exec_pos(pTHX_ const char *locinput,
2364 const regnode *scan,
2365 const char *loc_regeol,
2366 const char *loc_bostr,
2367 const char *loc_reg_starttry,
2368 const bool do_utf8)
07be1b83 2369{
786e8c11 2370 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2371 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2372 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2373 /* The part of the string before starttry has one color
2374 (pref0_len chars), between starttry and current
2375 position another one (pref_len - pref0_len chars),
2376 after the current position the third one.
2377 We assume that pref0_len <= pref_len, otherwise we
2378 decrease pref0_len. */
786e8c11
YO
2379 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2380 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2381 int pref0_len;
2382
2383 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2384 pref_len++;
786e8c11
YO
2385 pref0_len = pref_len - (locinput - loc_reg_starttry);
2386 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2387 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2388 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2389 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2390 l--;
2391 if (pref0_len < 0)
2392 pref0_len = 0;
2393 if (pref0_len > pref_len)
2394 pref0_len = pref_len;
2395 {
3df15adc 2396 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2397
ab3bbdeb 2398 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
786e8c11 2399 (locinput - pref_len),pref0_len, pref0_len, 4, 5);
0df25f3d 2400
ab3bbdeb 2401 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2402 (locinput - pref_len + pref0_len),
786e8c11 2403 pref_len - pref0_len, pref_len - pref0_len, 2, 3);
0df25f3d 2404
ab3bbdeb 2405 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
786e8c11 2406 locinput, loc_regeol - locinput, l, 0, 1);
0df25f3d 2407
3df15adc 2408 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2409 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2410 (IV)(locinput - loc_bostr),
07be1b83 2411 len0, s0,
07be1b83 2412 len1, s1,
07be1b83 2413 (docolor ? "" : "> <"),
07be1b83 2414 len2, s2,
07be1b83
YO
2415 15 - l - pref_len + 1,
2416 "");
2417 }
2418}
3df15adc 2419
07be1b83
YO
2420#endif
2421
d6a28714 2422STATIC I32 /* 0 failure, 1 success */
3b0527fe 2423S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2424{
54df2634
NC
2425#if PERL_VERSION < 9
2426 dMY_CXT;
2427#endif
27da23d5 2428 dVAR;
95b24440 2429 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2430 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2431
3b0527fe
DM
2432 regexp *rex = reginfo->prog;
2433
5d9a96ca
DM
2434 regmatch_slab *orig_slab;
2435 regmatch_state *orig_state;
a3621e74 2436
5d9a96ca
DM
2437 /* the current state. This is a cached copy of PL_regmatch_state */
2438 register regmatch_state *st;
95b24440 2439
5d9a96ca
DM
2440 /* cache heavy used fields of st in registers */
2441 register regnode *scan;
2442 register regnode *next;
2443 register I32 n = 0; /* initialize to shut up compiler warning */
2444 register char *locinput = PL_reginput;
95b24440 2445
5d9a96ca
DM
2446 /* these variables are NOT saved during a recusive RFEGMATCH: */
2447 register I32 nextchr; /* is always set to UCHARAT(locinput) */
b69b0499 2448 bool result = 0; /* return value of S_regmatch */
5d9a96ca 2449 int depth = 0; /* depth of recursion */
77cb431f
DM
2450 regmatch_state *yes_state = NULL; /* state to pop to on success of
2451 subpattern */
40a82448 2452 U32 state_num;
95b24440 2453
10edeb5d
JH
2454 I32 parenfloor = 0;
2455
95b24440 2456#ifdef DEBUGGING
e68ec53f 2457 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2458 PL_regindent++;
2459#endif
2460
5d9a96ca
DM
2461 /* on first ever call to regmatch, allocate first slab */
2462 if (!PL_regmatch_slab) {
2463 Newx(PL_regmatch_slab, 1, regmatch_slab);
2464 PL_regmatch_slab->prev = NULL;
2465 PL_regmatch_slab->next = NULL;
86545054 2466 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2467 }
2468
2469 /* remember current high-water mark for exit */
2470 /* XXX this should be done with SAVE* instead */
2471 orig_slab = PL_regmatch_slab;
2472 orig_state = PL_regmatch_state;
2473
2474 /* grab next free state slot */
2475 st = ++PL_regmatch_state;
86545054 2476 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2477 st = PL_regmatch_state = S_push_slab(aTHX);
2478
2479 st->minmod = 0;
2480 st->sw = 0;
2481 st->logical = 0;
5d9a96ca 2482 st->cc = NULL;
786e8c11 2483
d6a28714
JH
2484 /* Note that nextchr is a byte even in UTF */
2485 nextchr = UCHARAT(locinput);
2486 scan = prog;
2487 while (scan != NULL) {
8ba1375e 2488
a3621e74 2489 DEBUG_EXECUTE_r( {
6136c704 2490 SV * const prop = sv_newmortal();
786e8c11 2491 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2492 regprop(rex, prop, scan);
07be1b83
YO
2493
2494 PerlIO_printf(Perl_debug_log,
2495 "%3"IVdf":%*s%s(%"IVdf")\n",
2496 (IV)(scan - rex->program), PL_regindent*2, "",
2497 SvPVX_const(prop),
2498 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2499 });
d6a28714
JH
2500
2501 next = scan + NEXT_OFF(scan);
2502 if (next == scan)
2503 next = NULL;
40a82448 2504 state_num = OP(scan);
d6a28714 2505
40a82448
DM
2506 reenter_switch:
2507 switch (state_num) {
d6a28714 2508 case BOL:
7fba1cd6 2509 if (locinput == PL_bostr)
d6a28714 2510 {
3b0527fe 2511 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2512 break;
2513 }
d6a28714
JH
2514 sayNO;
2515 case MBOL:
12d33761
HS
2516 if (locinput == PL_bostr ||
2517 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2518 {
b8c5462f
JH
2519 break;
2520 }
d6a28714
JH
2521 sayNO;
2522 case SBOL:
c2a73568 2523 if (locinput == PL_bostr)
b8c5462f 2524 break;
d6a28714
JH
2525 sayNO;
2526 case GPOS:
3b0527fe 2527 if (locinput == reginfo->ganch)
d6a28714
JH
2528 break;
2529 sayNO;
2530 case EOL:
d6a28714
JH
2531 goto seol;
2532 case MEOL:
d6a28714 2533 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2534 sayNO;
b8c5462f 2535 break;
d6a28714
JH
2536 case SEOL:
2537 seol:
2538 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2539 sayNO;
d6a28714 2540 if (PL_regeol - locinput > 1)
b8c5462f 2541 sayNO;
b8c5462f 2542 break;
d6a28714
JH
2543 case EOS:
2544 if (PL_regeol != locinput)
b8c5462f 2545 sayNO;
d6a28714 2546 break;
ffc61ed2 2547 case SANY:
d6a28714 2548 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2549 sayNO;
f33976b4
DB
2550 if (do_utf8) {
2551 locinput += PL_utf8skip[nextchr];
2552 if (locinput > PL_regeol)
2553 sayNO;
2554 nextchr = UCHARAT(locinput);
2555 }
2556 else
2557 nextchr = UCHARAT(++locinput);
2558 break;
2559 case CANY:
2560 if (!nextchr && locinput >= PL_regeol)
2561 sayNO;
b8c5462f 2562 nextchr = UCHARAT(++locinput);
a0d0e21e 2563 break;
ffc61ed2 2564 case REG_ANY:
1aa99e6b
IH
2565 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2566 sayNO;
2567 if (do_utf8) {
b8c5462f 2568 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2569 if (locinput > PL_regeol)
2570 sayNO;
a0ed51b3 2571 nextchr = UCHARAT(locinput);
a0ed51b3 2572 }
1aa99e6b
IH
2573 else
2574 nextchr = UCHARAT(++locinput);
a0ed51b3 2575 break;
166ba7cd
DM
2576
2577#undef ST
2578#define ST st->u.trie
786e8c11
YO
2579 case TRIEC:
2580 /* In this case the charclass data is available inline so
2581 we can fail fast without a lot of extra overhead.
2582 */
2583 if (scan->flags == EXACT || !do_utf8) {
2584 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2585 DEBUG_EXECUTE_r(
2586 PerlIO_printf(Perl_debug_log,
2587 "%*s %sfailed to match trie start class...%s\n",
2588 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2589 );
2590 sayNO_SILENT;
2591 /* NOTREACHED */
2592 }
2593 }
2594 /* FALL THROUGH */
5b47454d 2595 case TRIE:
3dab1dad 2596 {
07be1b83 2597 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2598 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2599 trie_type = do_utf8 ?
2600 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2601 : trie_plain;
2602
2603 /* what trie are we using right now */
be8e71aa 2604 reg_trie_data * const trie
3dab1dad
YO
2605 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2606 U32 state = trie->startstate;
166ba7cd 2607
3dab1dad
YO
2608 if (trie->bitmap && trie_type != trie_utf8_fold &&
2609 !TRIE_BITMAP_TEST(trie,*locinput)
2610 ) {
2611 if (trie->states[ state ].wordnum) {
2612 DEBUG_EXECUTE_r(
2613 PerlIO_printf(Perl_debug_log,
2614 "%*s %smatched empty string...%s\n",
2615 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2616 );
2617 break;
2618 } else {
2619 DEBUG_EXECUTE_r(
2620 PerlIO_printf(Perl_debug_log,
786e8c11 2621 "%*s %sfailed to match trie start class...%s\n",
3dab1dad
YO
2622 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2623 );
2624 sayNO_SILENT;
2625 }
2626 }
166ba7cd 2627
786e8c11
YO
2628 {
2629 U8 *uc = ( U8* )locinput;
2630
2631 STRLEN len = 0;
2632 STRLEN foldlen = 0;
2633 U8 *uscan = (U8*)NULL;
2634 STRLEN bufflen=0;
2635 SV *sv_accept_buff = NULL;
2636 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2637
2638 ST.accepted = 0; /* how many accepting states we have seen */
2639 ST.B = next;
2640 ST.jump = trie->jump;
2641
2642#ifdef DEBUGGING
2643 ST.me = scan;
2644#endif
2645
2646
2647
07be1b83
YO
2648 /*
2649 traverse the TRIE keeping track of all accepting states
2650 we transition through until we get to a failing node.
2651 */
2652
a3621e74 2653 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11
YO
2654 U32 base = trie->states[ state ].trans.base;
2655 UV uvc;
2656 U16 charid;
2657 /* We use charid to hold the wordnum as we don't use it
2658 for charid until after we have done the wordnum logic.
2659 We define an alias just so that the wordnum logic reads
2660 more naturally. */
2661
2662#define got_wordnum charid
2663 got_wordnum = trie->states[ state ].wordnum;
2664
2665 if ( got_wordnum ) {
2666 if ( ! ST.accepted ) {
5b47454d
DM
2667 ENTER;
2668 SAVETMPS;
2669 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2670 sv_accept_buff=newSV(bufflen *
2671 sizeof(reg_trie_accepted) - 1);
786e8c11 2672 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2673 SvPOK_on(sv_accept_buff);
2674 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2675 SAVETMPS;
2676 ST.accept_buff =
5b47454d
DM
2677 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2678 }
786e8c11 2679 do {
166ba7cd 2680 if (ST.accepted >= bufflen) {
5b47454d 2681 bufflen *= 2;
166ba7cd 2682 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2683 SvGROW(sv_accept_buff,
2684 bufflen * sizeof(reg_trie_accepted));
2685 }
2686 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2687 + sizeof(reg_trie_accepted));
a3621e74 2688
786e8c11
YO
2689
2690 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2691 ST.accept_buff[ST.accepted].endpos = uc;
2692 ++ST.accepted;
2693 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2694 }
2695#undef got_wordnum
a3621e74 2696
07be1b83 2697 DEBUG_TRIE_EXECUTE_r({
786e8c11 2698 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2699 PerlIO_printf( Perl_debug_log,
786e8c11 2700 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
07be1b83 2701 2+PL_regindent * 2, "", PL_colors[4],
786e8c11 2702 (UV)state, (UV)ST.accepted );
07be1b83 2703 });
a3621e74
YO
2704
2705 if ( base ) {
4cadc6a9
YO
2706 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2707 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2708
5b47454d
DM
2709 if (charid &&
2710 (base + charid > trie->uniquecharcount )
2711 && (base + charid - 1 - trie->uniquecharcount
2712 < trie->lasttrans)
2713 && trie->trans[base + charid - 1 -
2714 trie->uniquecharcount].check == state)
2715 {
2716 state = trie->trans[base + charid - 1 -
2717 trie->uniquecharcount ].next;
2718 }
2719 else {
2720 state = 0;
2721 }
2722 uc += len;
2723
2724 }
2725 else {
a3621e74
YO
2726 state = 0;
2727 }
2728 DEBUG_TRIE_EXECUTE_r(
e4584336 2729 PerlIO_printf( Perl_debug_log,
786e8c11 2730 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2731 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2732 );
2733 }
166ba7cd 2734 if (!ST.accepted )
a3621e74 2735 sayNO;
a3621e74 2736
166ba7cd
DM
2737 DEBUG_EXECUTE_r(
2738 PerlIO_printf( Perl_debug_log,
2739 "%*s %sgot %"IVdf" possible matches%s\n",
2740 REPORT_CODE_OFF + PL_regindent * 2, "",
2741 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2742 );
786e8c11 2743 }}
166ba7cd
DM
2744
2745 /* FALL THROUGH */
2746
2747 case TRIE_next_fail: /* we failed - try next alterative */
2748
2749 if ( ST.accepted == 1 ) {
2750 /* only one choice left - just continue */
2751 DEBUG_EXECUTE_r({
2752 reg_trie_data * const trie
2753 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2754 SV ** const tmp = RX_DEBUG(reginfo->prog)
2755 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2756 : NULL;
2757 PerlIO_printf( Perl_debug_log,
2758 "%*s %sonly one match left: #%d <%s>%s\n",
2759 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2760 ST.accept_buff[ 0 ].wordnum,
2761 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2762 PL_colors[5] );
2763 });
2764 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2765 /* in this case we free tmps/leave before we call regmatch
2766 as we wont be using accept_buff again. */
2767 FREETMPS;
2768 LEAVE;
2769 locinput = PL_reginput;
2770 nextchr = UCHARAT(locinput);
786e8c11
YO
2771
2772 if ( !ST.jump )
2773 scan = ST.B;
2774 else
2775 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2776
166ba7cd
DM
2777 continue; /* execute rest of RE */
2778 }
2779
2780 if (!ST.accepted-- ) {
2781 FREETMPS;
2782 LEAVE;
2783 sayNO;
2784 }
2785
a3621e74 2786 /*
166ba7cd
DM
2787 There are at least two accepting states left. Presumably
2788 the number of accepting states is going to be low,
2789 typically two. So we simply scan through to find the one
2790 with lowest wordnum. Once we find it, we swap the last
2791 state into its place and decrement the size. We then try to
2792 match the rest of the pattern at the point where the word
2793 ends. If we succeed, control just continues along the
2794 regex; if we fail we return here to try the next accepting
2795 state
2796 */
a3621e74 2797
166ba7cd
DM
2798 {
2799 U32 best = 0;
2800 U32 cur;
2801 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2802 DEBUG_TRIE_EXECUTE_r(
f2278c82 2803 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
2804 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2805 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2806 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2807 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2808 );
2809
2810 if (ST.accept_buff[cur].wordnum <
2811 ST.accept_buff[best].wordnum)
2812 best = cur;
a3621e74 2813 }
166ba7cd
DM
2814
2815 DEBUG_EXECUTE_r({
2816 reg_trie_data * const trie
2817 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2818 SV ** const tmp = RX_DEBUG(reginfo->prog)
2819 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2820 : NULL;
2821 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2822 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2823 ST.accept_buff[best].wordnum,
2824 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2825 PL_colors[5] );
2826 });
2827
2828 if ( best<ST.accepted ) {
2829 reg_trie_accepted tmp = ST.accept_buff[ best ];
2830 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2831 ST.accept_buff[ ST.accepted ] = tmp;
2832 best = ST.accepted;
a3621e74 2833 }
166ba7cd 2834 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
786e8c11
YO
2835 if ( !ST.jump ) {
2836 PUSH_STATE_GOTO(TRIE_next, ST.B);
2837 /* NOTREACHED */
2838 } else {
2839 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2840 /* NOTREACHED */
2841 }
2842 /* NOTREACHED */
166ba7cd 2843 }
166ba7cd
DM
2844 /* NOTREACHED */
2845
2846#undef ST
2847
95b24440
DM
2848 case EXACT: {
2849 char *s = STRING(scan);
5d9a96ca 2850 st->ln = STR_LEN(scan);
eb160463 2851 if (do_utf8 != UTF) {
bc517b45 2852 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2853 char *l = locinput;
be8e71aa 2854 const char * const e = s + st->ln;
a72c7584 2855
5ff6fc6d
JH
2856 if (do_utf8) {
2857 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2858 while (s < e) {
a3b680e6 2859 STRLEN ulen;
1aa99e6b 2860 if (l >= PL_regeol)
5ff6fc6d
JH
2861 sayNO;
2862 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2863 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2864 uniflags))
5ff6fc6d 2865 sayNO;
bc517b45 2866 l += ulen;
5ff6fc6d 2867 s ++;
1aa99e6b 2868 }
5ff6fc6d
JH
2869 }
2870 else {
2871 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2872 while (s < e) {
a3b680e6 2873 STRLEN ulen;
1aa99e6b
IH
2874 if (l >= PL_regeol)
2875 sayNO;
5ff6fc6d 2876 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2877 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2878 uniflags))
1aa99e6b 2879 sayNO;
bc517b45 2880 s += ulen;
a72c7584 2881 l ++;
1aa99e6b 2882 }
5ff6fc6d 2883 }
1aa99e6b
IH
2884 locinput = l;
2885 nextchr = UCHARAT(locinput);
2886 break;
2887 }
bc517b45 2888 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2889 /* Inline the first character, for speed. */
2890 if (UCHARAT(s) != nextchr)
2891 sayNO;
5d9a96ca 2892 if (PL_regeol - locinput < st->ln)
d6a28714 2893 sayNO;
5d9a96ca 2894 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2895 sayNO;
5d9a96ca 2896 locinput += st->ln;
d6a28714
JH
2897 nextchr = UCHARAT(locinput);
2898 break;
95b24440 2899 }
d6a28714 2900 case EXACTFL:
b8c5462f
JH
2901 PL_reg_flags |= RF_tainted;
2902 /* FALL THROUGH */
95b24440 2903 case EXACTF: {
be8e71aa 2904 char * const s = STRING(scan);
5d9a96ca 2905 st->ln = STR_LEN(scan);
d6a28714 2906
d07ddd77
JH
2907 if (do_utf8 || UTF) {
2908 /* Either target or the pattern are utf8. */
be8e71aa 2909 const char * const l = locinput;
d07ddd77 2910 char *e = PL_regeol;
bc517b45 2911
5d9a96ca 2912 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2913 l, &e, 0, do_utf8)) {
5486206c
JH
2914 /* One more case for the sharp s:
2915 * pack("U0U*", 0xDF) =~ /ss/i,
2916 * the 0xC3 0x9F are the UTF-8
2917 * byte sequence for the U+00DF. */
2918 if (!(do_utf8 &&
2919 toLOWER(s[0]) == 's' &&
5d9a96ca 2920 st->ln >= 2 &&
5486206c
JH
2921 toLOWER(s[1]) == 's' &&
2922 (U8)l[0] == 0xC3 &&
2923 e - l >= 2 &&
2924 (U8)l[1] == 0x9F))
2925 sayNO;
2926 }
d07ddd77
JH
2927 locinput = e;
2928 nextchr = UCHARAT(locinput);
2929 break;
a0ed51b3 2930 }
d6a28714 2931
bc517b45
JH
2932 /* Neither the target and the pattern are utf8. */
2933
d6a28714
JH
2934 /* Inline the first character, for speed. */
2935 if (UCHARAT(s) != nextchr &&
2936 UCHARAT(s) != ((OP(scan) == EXACTF)
2937 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2938 sayNO;
5d9a96ca 2939 if (PL_regeol - locinput < st->ln)
b8c5462f 2940 sayNO;
5d9a96ca
DM
2941 if (st->ln > 1 && (OP(scan) == EXACTF
2942 ? ibcmp(s, locinput, st->ln)
2943 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2944 sayNO;
5d9a96ca 2945 locinput += st->ln;
d6a28714 2946 nextchr = UCHARAT(locinput);
a0d0e21e 2947 break;
95b24440 2948 }
d6a28714 2949 case ANYOF:
ffc61ed2 2950 if (do_utf8) {
9e55ce06
JH
2951 STRLEN inclasslen = PL_regeol - locinput;
2952
32fc9b6a 2953 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2954 sayNO_ANYOF;
ffc61ed2
JH
2955 if (locinput >= PL_regeol)
2956 sayNO;
0f0076b4 2957 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2958 nextchr = UCHARAT(locinput);
e0f9d4a8 2959 break;
ffc61ed2
JH
2960 }
2961 else {
2962 if (nextchr < 0)
2963 nextchr = UCHARAT(locinput);
32fc9b6a 2964 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 2965 sayNO_ANYOF;
ffc61ed2
JH
2966 if (!nextchr && locinput >= PL_regeol)
2967 sayNO;
2968 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2969 break;
2970 }
2971 no_anyof:
2972 /* If we might have the case of the German sharp s
2973 * in a casefolding Unicode character class. */
2974
ebc501f0
JH
2975 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2976 locinput += SHARP_S_SKIP;
e0f9d4a8 2977 nextchr = UCHARAT(locinput);
ffc61ed2 2978 }
e0f9d4a8
JH
2979 else
2980 sayNO;
b8c5462f 2981 break;
d6a28714 2982 case ALNUML:
b8c5462f
JH
2983 PL_reg_flags |= RF_tainted;
2984 /* FALL THROUGH */
d6a28714 2985 case ALNUM:
b8c5462f 2986 if (!nextchr)
4633a7c4 2987 sayNO;
ffc61ed2 2988 if (do_utf8) {
1a4fad37 2989 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2990 if (!(OP(scan) == ALNUM
bb7a0f54 2991 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2992 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2993 {
2994 sayNO;
a0ed51b3 2995 }
b8c5462f 2996 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2997 nextchr = UCHARAT(locinput);
2998 break;
2999 }
ffc61ed2 3000 if (!(OP(scan) == ALNUM
d6a28714 3001 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3002 sayNO;
b8c5462f 3003 nextchr = UCHARAT(++locinput);
a0d0e21e 3004 break;
d6a28714 3005 case NALNUML:
b8c5462f
JH
3006 PL_reg_flags |= RF_tainted;
3007 /* FALL THROUGH */
d6a28714
JH
3008 case NALNUM:
3009 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3010 sayNO;
ffc61ed2 3011 if (do_utf8) {
1a4fad37 3012 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3013 if (OP(scan) == NALNUM
bb7a0f54 3014 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3015 : isALNUM_LC_utf8((U8*)locinput))
3016 {
b8c5462f 3017 sayNO;
d6a28714 3018 }
b8c5462f
JH
3019 locinput += PL_utf8skip[nextchr];
3020 nextchr = UCHARAT(locinput);
3021 break;
3022 }
ffc61ed2 3023 if (OP(scan) == NALNUM
d6a28714 3024 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3025 sayNO;
76e3520e 3026 nextchr = UCHARAT(++locinput);
a0d0e21e 3027 break;
d6a28714
JH
3028 case BOUNDL:
3029 case NBOUNDL:
3280af22 3030 PL_reg_flags |= RF_tainted;
bbce6d69 3031 /* FALL THROUGH */
d6a28714
JH
3032 case BOUND:
3033 case NBOUND:
3034 /* was last char in word? */
ffc61ed2 3035 if (do_utf8) {
12d33761 3036 if (locinput == PL_bostr)
5d9a96ca 3037 st->ln = '\n';
ffc61ed2 3038 else {
a3b680e6 3039 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3040
4ad0818d 3041 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3042 }
3043 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3044 st->ln = isALNUM_uni(st->ln);
1a4fad37 3045 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3046 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3047 }
3048 else {
5d9a96ca 3049 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3050 n = isALNUM_LC_utf8((U8*)locinput);
3051 }
a0ed51b3 3052 }
d6a28714 3053 else {
5d9a96ca 3054 st->ln = (locinput != PL_bostr) ?
12d33761 3055 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3056 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3057 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3058 n = isALNUM(nextchr);
3059 }
3060 else {
5d9a96ca 3061 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3062 n = isALNUM_LC(nextchr);
3063 }
d6a28714 3064 }
5d9a96ca 3065 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3066 OP(scan) == BOUNDL))
3067 sayNO;
a0ed51b3 3068 break;
d6a28714 3069 case SPACEL:
3280af22 3070 PL_reg_flags |= RF_tainted;
bbce6d69 3071 /* FALL THROUGH */
d6a28714 3072 case SPACE:
9442cb0e 3073 if (!nextchr)
4633a7c4 3074 sayNO;
1aa99e6b 3075 if (do_utf8) {
fd400ab9 3076 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3077 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3078 if (!(OP(scan) == SPACE
bb7a0f54 3079 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3080 : isSPACE_LC_utf8((U8*)locinput)))
3081 {
3082 sayNO;
3083 }
3084 locinput += PL_utf8skip[nextchr];
3085 nextchr = UCHARAT(locinput);
3086 break;
d6a28714 3087 }
ffc61ed2
JH
3088 if (!(OP(scan) == SPACE
3089 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3090 sayNO;
3091 nextchr = UCHARAT(++locinput);
3092 }
3093 else {
3094 if (!(OP(scan) == SPACE
3095 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3096 sayNO;
3097 nextchr = UCHARAT(++locinput);
a0ed51b3 3098 }
a0ed51b3 3099 break;
d6a28714 3100 case NSPACEL:
3280af22 3101 PL_reg_flags |= RF_tainted;
bbce6d69 3102 /* FALL THROUGH */
d6a28714 3103 case NSPACE:
9442cb0e 3104 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3105 sayNO;
1aa99e6b 3106 if (do_utf8) {
1a4fad37 3107 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3108 if (OP(scan) == NSPACE
bb7a0f54 3109 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3110 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3111 {
3112 sayNO;
3113 }
3114 locinput += PL_utf8skip[nextchr];
3115 nextchr = UCHARAT(locinput);
3116 break;
a0ed51b3 3117 }
ffc61ed2 3118 if (OP(scan) == NSPACE
d6a28714 3119 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3120 sayNO;
76e3520e 3121 nextchr = UCHARAT(++locinput);
a0d0e21e 3122 break;
d6a28714 3123 case DIGITL:
a0ed51b3
LW
3124 PL_reg_flags |= RF_tainted;
3125 /* FALL THROUGH */
d6a28714 3126 case DIGIT:
9442cb0e 3127 if (!nextchr)
a0ed51b3 3128 sayNO;
1aa99e6b 3129 if (do_utf8) {
1a4fad37 3130 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3131 if (!(OP(scan) == DIGIT
bb7a0f54 3132 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3133 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3134 {
a0ed51b3 3135 sayNO;
dfe13c55 3136 }
6f06b55f 3137 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3138 nextchr = UCHARAT(locinput);
3139 break;
3140 }
ffc61ed2 3141 if (!(OP(scan) == DIGIT
9442cb0e 3142 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3143 sayNO;
3144 nextchr = UCHARAT(++locinput);
3145 break;
d6a28714 3146 case NDIGITL:
b8c5462f
JH
3147 PL_reg_flags |= RF_tainted;
3148 /* FALL THROUGH */
d6a28714 3149 case NDIGIT:
9442cb0e 3150 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3151 sayNO;
1aa99e6b 3152 if (do_utf8) {
1a4fad37 3153 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3154 if (OP(scan) == NDIGIT
bb7a0f54 3155 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3156 : isDIGIT_LC_utf8((U8*)locinput))
3157 {
a0ed51b3 3158 sayNO;
9442cb0e 3159 }
6f06b55f 3160 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3161 nextchr = UCHARAT(locinput);
3162 break;
3163 }
ffc61ed2 3164 if (OP(scan) == NDIGIT
9442cb0e 3165 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3166 sayNO;
3167 nextchr = UCHARAT(++locinput);
3168 break;
3169 case CLUMP:
b7c83a7e 3170 if (locinput >= PL_regeol)
a0ed51b3 3171 sayNO;
b7c83a7e 3172 if (do_utf8) {
1a4fad37 3173 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3174 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3175 sayNO;
3176 locinput += PL_utf8skip[nextchr];
3177 while (locinput < PL_regeol &&
3178 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3179 locinput += UTF8SKIP(locinput);
3180 if (locinput > PL_regeol)
3181 sayNO;
eb08e2da
JH
3182 }
3183 else
3184 locinput++;
a0ed51b3
LW
3185 nextchr = UCHARAT(locinput);
3186 break;
c8756f30 3187 case REFFL:
3280af22 3188 PL_reg_flags |= RF_tainted;
c8756f30 3189 /* FALL THROUGH */
c277df42 3190 case REF:
95b24440
DM
3191 case REFF: {
3192 char *s;
c277df42 3193 n = ARG(scan); /* which paren pair */
5d9a96ca 3194 st->ln = PL_regstartp[n];
2c2d71f5 3195 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3196 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3197 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3198 if (st->ln == PL_regendp[n])
a0d0e21e 3199 break;
a0ed51b3 3200
5d9a96ca 3201 s = PL_bostr + st->ln;
1aa99e6b 3202 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3203 char *l = locinput;
a3b680e6 3204 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3205 /*
3206 * Note that we can't do the "other character" lookup trick as
3207 * in the 8-bit case (no pun intended) because in Unicode we
3208 * have to map both upper and title case to lower case.
3209 */
3210 if (OP(scan) == REFF) {
3211 while (s < e) {
a3b680e6
AL
3212 STRLEN ulen1, ulen2;
3213 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3214 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3215
a0ed51b3
LW
3216 if (l >= PL_regeol)
3217 sayNO;
a2a2844f
JH
3218 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3219 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3220 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3221 sayNO;
a2a2844f
JH
3222 s += ulen1;
3223 l += ulen2;
a0ed51b3
LW
3224 }
3225 }
3226 locinput = l;
3227 nextchr = UCHARAT(locinput);
3228 break;
3229 }
3230
a0d0e21e 3231 /* Inline the first character, for speed. */
76e3520e 3232 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3233 (OP(scan) == REF ||
3234 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3235 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3236 sayNO;
5d9a96ca
DM
3237 st->ln = PL_regendp[n] - st->ln;
3238 if (locinput + st->ln > PL_regeol)
4633a7c4 3239 sayNO;
5d9a96ca
DM
3240 if (st->ln > 1 && (OP(scan) == REF
3241 ? memNE(s, locinput, st->ln)
c8756f30 3242 : (OP(scan) == REFF
5d9a96ca
DM
3243 ? ibcmp(s, locinput, st->ln)
3244 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3245 sayNO;
5d9a96ca 3246 locinput += st->ln;
76e3520e 3247 nextchr = UCHARAT(locinput);
a0d0e21e 3248 break;
95b24440 3249 }
a0d0e21e
LW
3250
3251 case NOTHING:
c277df42 3252 case TAIL:
a0d0e21e
LW
3253 break;
3254 case BACK:
3255 break;
40a82448
DM
3256
3257#undef ST
3258#define ST st->u.eval
3259
3260 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3261 {
c277df42 3262 SV *ret;
8e5e9ebe 3263 {
4aabdb9b
DM
3264 /* execute the code in the {...} */
3265 dSP;
6136c704 3266 SV ** const before = SP;
4aabdb9b
DM
3267 OP_4tree * const oop = PL_op;
3268 COP * const ocurcop = PL_curcop;
3269 PAD *old_comppad;
4aabdb9b
DM
3270
3271 n = ARG(scan);
32fc9b6a 3272 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3273 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3274 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3275 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3276
8e5e9ebe
RGS
3277 CALLRUNOPS(aTHX); /* Scalar context. */
3278 SPAGAIN;
3279 if (SP == before)
075aa684 3280 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3281 else {
3282 ret = POPs;
3283 PUTBACK;
3284 }
4aabdb9b
DM
3285
3286 PL_op = oop;
3287 PAD_RESTORE_LOCAL(old_comppad);
3288 PL_curcop = ocurcop;
3289 if (!st->logical) {
3290 /* /(?{...})/ */
3291 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3292 break;
3293 }
8e5e9ebe 3294 }
4aabdb9b
DM
3295 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3296 regexp *re;
4aabdb9b 3297 {
4f639d21
DM
3298 /* extract RE object from returned value; compiling if
3299 * necessary */
3300
6136c704 3301 MAGIC *mg = NULL;
be8e71aa 3302 const SV *sv;
faf82a0b
AE
3303 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3304 mg = mg_find(sv, PERL_MAGIC_qr);
3305 else if (SvSMAGICAL(ret)) {
3306 if (SvGMAGICAL(ret))
3307 sv_unmagic(ret, PERL_MAGIC_qr);
3308 else
3309 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3310 }
faf82a0b 3311
0f5d15d6
IZ
3312 if (mg) {
3313 re = (regexp *)mg->mg_obj;
df0003d4 3314 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3315 }
3316 else {
3317 STRLEN len;
6136c704 3318 const char * const t = SvPV_const(ret, len);
0f5d15d6 3319 PMOP pm;
a3b680e6 3320 const I32 osize = PL_regsize;
0f5d15d6 3321
5fcd1c1b 3322 Zero(&pm, 1, PMOP);
4aabdb9b 3323 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3324 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3325 if (!(SvFLAGS(ret)
faf82a0b
AE
3326 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3327 | SVs_GMG)))
14befaf4
DM
3328 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3329 PERL_MAGIC_qr,0,0);
0f5d15d6 3330 PL_regsize = osize;
0f5d15d6 3331 }
4aabdb9b 3332 }
aa283a38
DM
3333
3334 /* run the pattern returned from (??{...}) */
4aabdb9b 3335 DEBUG_EXECUTE_r(
ab3bbdeb
YO
3336 debug_start_match(re, do_utf8, locinput, PL_regeol,
3337 "Matching embedded");
4aabdb9b 3338 );
2c2d71f5 3339
40a82448
DM
3340 ST.cp = regcppush(0); /* Save *all* the positions. */
3341 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3342 *PL_reglastparen = 0;
3343 *PL_reglastcloseparen = 0;
4aabdb9b 3344 PL_reginput = locinput;
4aabdb9b
DM
3345
3346 /* XXXX This is too dramatic a measure... */
3347 PL_reg_maxiter = 0;
3348
5d9a96ca 3349 st->logical = 0;
40a82448 3350 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3351 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3352 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3353 ST.prev_rex = rex;
aa283a38 3354 rex = re;
aa283a38 3355
40a82448 3356 ST.B = next;
aa283a38 3357 /* now continue from first node in postoned RE */
40a82448 3358 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3359 /* NOTREACHED */
a0ed51b3 3360 }
4aabdb9b
DM
3361 /* /(?(?{...})X|Y)/ */
3362 st->sw = SvTRUE(ret);
3363 st->logical = 0;
c277df42
IZ
3364 break;
3365 }
40a82448
DM
3366
3367 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3368 if (ST.toggleutf)
3369 PL_reg_flags ^= RF_utf8;
3370 ReREFCNT_dec(rex);
3371 rex = ST.prev_rex;
3372 /* XXXX This is too dramatic a measure... */
3373 PL_reg_maxiter = 0;
3374 /* Restore parens of the caller without popping the
3375 * savestack */
3376 {
3377 const I32 tmp = PL_savestack_ix;
3378 PL_savestack_ix = ST.lastcp;
3379 regcppop(rex);
3380 PL_savestack_ix = tmp;
3381 }
3382 PL_reginput = locinput;
3383 /* continue at the node following the (??{...}) */
3384 scan = ST.B;
3385 continue;
3386
3387 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3388 /* Restore state to the outer re then re-throw the failure */
3389 if (ST.toggleutf)
3390 PL_reg_flags ^= RF_utf8;
3391 ReREFCNT_dec(rex);
3392 rex = ST.prev_rex;
3393
3394 /* XXXX This is too dramatic a measure... */
3395 PL_reg_maxiter = 0;
3396
3397 PL_reginput = locinput;
3398 REGCP_UNWIND(ST.lastcp);
3399 regcppop(rex);
3400 sayNO_SILENT;
3401
3402#undef ST
3403
a0d0e21e 3404 case OPEN:
c277df42 3405 n = ARG(scan); /* which paren pair */
3280af22
NIS
3406 PL_reg_start_tmp[n] = locinput;
3407 if (n > PL_regsize)
3408 PL_regsize = n;
a0d0e21e
LW
3409 break;
3410 case CLOSE:
c277df42 3411 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3412 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3413 PL_regendp[n] = locinput - PL_bostr;
eb160463 3414 if (n > (I32)*PL_reglastparen)
3280af22 3415 *PL_reglastparen = n;
a01268b5 3416 *PL_reglastcloseparen = n;
a0d0e21e 3417 break;
c277df42
IZ
3418 case GROUPP:
3419 n = ARG(scan); /* which paren pair */
5d9a96ca 3420 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3421 break;
3422 case IFTHEN:
2c2d71f5 3423 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3424 if (st->sw)
c277df42
IZ
3425 next = NEXTOPER(NEXTOPER(scan));
3426 else {
3427 next = scan + ARG(scan);
3428 if (OP(next) == IFTHEN) /* Fake one. */
3429 next = NEXTOPER(NEXTOPER(next));
3430 }
3431 break;
3432 case LOGICAL:
5d9a96ca 3433 st->logical = scan->flags;
c277df42 3434 break;
2ab05381 3435/*******************************************************************
a0374537
DM
3436 cc points to the regmatch_state associated with the most recent CURLYX.
3437 This struct contains info about the innermost (...)* loop (an
3438 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3439
3440 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3441
95b24440 3442 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3443
a0374537 3444 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3445 with the starting point at WHILEM node;
2ab05381
IZ
3446
3447 3) Each hit of WHILEM node tries to match A and Z (in the order
3448 depending on the current iteration, min/max of {min,max} and
3449 greediness). The information about where are nodes for "A"
a0374537 3450 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3451 was already matched, and greediness.
3452
3453 4) After A matches, the same WHILEM node is hit again.
3454
95b24440 3455 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3456 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3457 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3458 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3459 of the external loop.
3460
a0374537 3461 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3462 and whatever it mentions via ->next, and additional attached trees
3463 corresponding to temporarily unset infoblocks as in "5" above.
3464
95b24440 3465 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3466 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3467 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3468 infoblocks are drawn below the "reset" infoblock.
3469
3470 In fact in the picture below we do not show failed matches for Z and T
3471 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3472 more obvious *why* one needs to *temporary* unset infoblocks.]
3473
3474 Matched REx position InfoBlocks Comment
3475 (Y(A)*?Z)*?T x
3476 Y(A)*?Z)*?T x <- O
3477 Y (A)*?Z)*?T x <- O
3478 Y A)*?Z)*?T x <- O <- I
3479 YA )*?Z)*?T x <- O <- I
3480 YA A)*?Z)*?T x <- O <- I
3481 YAA )*?Z)*?T x <- O <- I
3482 YAA Z)*?T x <- O # Temporary unset I
3483 I
3484
3485 YAAZ Y(A)*?Z)*?T x <- O
3486 I
3487
3488 YAAZY (A)*?Z)*?T x <- O
3489 I
3490
3491 YAAZY A)*?Z)*?T x <- O <- I
3492 I
3493
3494 YAAZYA )*?Z)*?T x <- O <- I
3495 I
3496
3497 YAAZYA Z)*?T x <- O # Temporary unset I
3498 I,I
3499
3500 YAAZYAZ )*?T x <- O
3501 I,I
3502
3503 YAAZYAZ T x # Temporary unset O
3504 O
3505 I,I
3506
3507 YAAZYAZT x
3508 O
3509 I,I
3510 *******************************************************************/
95b24440 3511
a0d0e21e 3512 case CURLYX: {
cb434fcc 3513 /* No need to save/restore up to this paren */
10edeb5d
JH
3514 parenfloor = scan->flags;
3515
c2b7afd3
NC
3516 /* Dave says:
3517
3518 CURLYX and WHILEM are always paired: they're the moral
3519 equivalent of pp_enteriter anbd pp_iter.
3520
3521 The only time next could be null is if the node tree is
3522 corrupt. This was mentioned on p5p a few days ago.
3523
3524 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3525 So we'll assert that this is true:
3526 */
3527 assert(next);
30b2893d 3528 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3529 next += ARG(next);
cb434fcc
IZ
3530 /* XXXX Probably it is better to teach regpush to support
3531 parenfloor > PL_regsize... */
eb160463 3532 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3533 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3534
d8319b27
DM
3535 st->u.curlyx.cp = PL_savestack_ix;
3536 st->u.curlyx.outercc = st->cc;
a0374537
DM
3537 st->cc = st;
3538 /* these fields contain the state of the current curly.
3539 * they are accessed by subsequent WHILEMs;
3540 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3541 st->u.curlyx.parenfloor = parenfloor;
3542 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3543 st->u.curlyx.min = ARG1(scan);
3544 st->u.curlyx.max = ARG2(scan);
3545 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3546 st->u.curlyx.lastloc = 0;
a0374537
DM
3547 /* st->next and st->minmod are also read by WHILEM */
3548
3280af22 3549 PL_reginput = locinput;
95b24440
DM
3550 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3551 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3552 regcpblow(st->u.curlyx.cp);
3553 st->cc = st->u.curlyx.outercc;
95b24440 3554 saySAME(result);
a0d0e21e 3555 }
5f66b61c 3556 /* NOTREACHED */
a0d0e21e
LW
3557 case WHILEM: {
3558 /*
3559 * This is really hard to understand, because after we match
3560 * what we're trying to match, we must make sure the rest of
2c2d71f5 3561 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3562 * to go back UP the parse tree by recursing ever deeper. And
3563 * if it fails, we have to reset our parent's current state
3564 * that we can try again after backing off.
3565 */
3566
c2b7afd3
NC
3567 /* Dave says:
3568
3569 st->cc gets initialised by CURLYX ready for use by WHILEM.
3570 So again, unless somethings been corrupted, st->cc cannot
3571 be null at that point in WHILEM.
3572
3573 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3574 So we'll assert that this is true:
3575 */
3576 assert(st->cc);
d8319b27
DM
3577 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3578 st->u.whilem.cache_offset = 0;
3579 st->u.whilem.cache_bit = 0;
c277df42 3580
d8319b27 3581 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3582 PL_reginput = locinput;
a0d0e21e 3583
a3621e74 3584 DEBUG_EXECUTE_r(
9041c2e3 3585 PerlIO_printf(Perl_debug_log,
91f3b821 3586 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3587 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3588 (long)n, (long)st->cc->u.curlyx.min,
3589 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3590 );
4633a7c4 3591
a0d0e21e
LW
3592 /* If degenerate scan matches "", assume scan done. */
3593
d8319b27
DM
3594 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3595 st->u.whilem.savecc = st->cc;
3596 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3597 if (st->cc)
d8319b27 3598 st->ln = st->cc->u.curlyx.cur;
a3621e74 3599 DEBUG_EXECUTE_r(
c3464db5
DD
3600 PerlIO_printf(Perl_debug_log,
3601 "%*s empty match detected, try continuation...\n",
3280af22 3602 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3603 );
d8319b27 3604 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3605 /*** all unsaved local vars undefined at this point */
d8319b27 3606 st->cc = st->u.whilem.savecc;
95b24440 3607 if (result)
4633a7c4 3608 sayYES;
d8319b27
DM
3609 if (st->cc->u.curlyx.outercc)
3610 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3611 sayNO;
a0d0e21e
LW
3612 }
3613
3614 /* First just match a string of min scans. */
3615
d8319b27
DM
3616 if (n < st->cc->u.curlyx.min) {
3617 st->cc->u.curlyx.cur = n;
3618 st->cc->u.curlyx.lastloc = locinput;
3619 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3620 /*** all unsaved local vars undefined at this point */
3621 if (result)
4633a7c4 3622 sayYES;
d8319b27
DM
3623 st->cc->u.curlyx.cur = n - 1;
3624 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3625 sayNO;
a0d0e21e
LW
3626 }
3627
2c2d71f5
JH
3628 if (scan->flags) {
3629 /* Check whether we already were at this position.
3630 Postpone detection until we know the match is not
3631 *that* much linear. */
3632 if (!PL_reg_maxiter) {
3633 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3634 /* possible overflow for long strings and many CURLYX's */
3635 if (PL_reg_maxiter < 0)
3636 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3637 PL_reg_leftiter = PL_reg_maxiter;
3638 }
3639 if (PL_reg_leftiter-- == 0) {
3298f257 3640 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3641 if (PL_reg_poscache) {
eb160463 3642 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3643 Renew(PL_reg_poscache, size, char);
3644 PL_reg_poscache_size = size;
3645 }
3646 Zero(PL_reg_poscache, size, char);
3647 }
3648 else {
3649 PL_reg_poscache_size = size;
a02a5408 3650 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3651 }
a3621e74 3652 DEBUG_EXECUTE_r(
2c2d71f5
JH
3653 PerlIO_printf(Perl_debug_log,
3654 "%sDetected a super-linear match, switching on caching%s...\n",
3655 PL_colors[4], PL_colors[5])
3656 );
3657 }
3658 if (PL_reg_leftiter < 0) {
d8319b27 3659 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3660
3298f257 3661 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
d8319b27
DM
3662 + st->u.whilem.cache_offset * (scan->flags>>4);
3663 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3664 st->u.whilem.cache_offset /= 8;
3665 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3666 DEBUG_EXECUTE_r(
2c2d71f5
JH
3667 PerlIO_printf(Perl_debug_log,
3668 "%*s already tried at this position...\n",
3669 REPORT_CODE_OFF+PL_regindent*2, "")
3670 );
3298f257 3671 sayNO; /* cache records failure */
2c2d71f5 3672 }
2c2d71f5
JH
3673 }
3674 }
3675
a0d0e21e
LW
3676 /* Prefer next over scan for minimal matching. */
3677
5d9a96ca 3678 if (st->cc->minmod) {
d8319b27
DM
3679 st->u.whilem.savecc = st->cc;
3680 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3681 if (st->cc)
d8319b27
DM
3682 st->ln = st->cc->u.curlyx.cur;
3683 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3684 REGCP_SET(st->u.whilem.lastcp);
3685 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3686 /*** all unsaved local vars undefined at this point */
d8319b27 3687 st->cc = st->u.whilem.savecc;
95b24440 3688 if (result) {
d8319b27 3689 regcpblow(st->u.whilem.cp);
3298f257 3690 sayYES; /* All done. */
5f05dabc 3691 }
d8319b27 3692 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3693 regcppop(rex);
d8319b27
DM
3694 if (st->cc->u.curlyx.outercc)
3695 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3696
d8319b27 3697 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3698 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3699 && !(PL_reg_flags & RF_warned)) {
3700 PL_reg_flags |= RF_warned;
9014280d 3701 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3702 "Complex regular subexpression recursion",
3703 REG_INFTY - 1);
c277df42 3704 }
3ab3c9b4 3705 CACHEsayNO;
c277df42 3706 }
a687059c 3707
a3621e74 3708 DEBUG_EXECUTE_r(
c3464db5
DD
3709 PerlIO_printf(Perl_debug_log,
3710 "%*s trying longer...\n",
3280af22 3711 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3712 );
a0d0e21e 3713 /* Try scanning more and see if it helps. */
3280af22 3714 PL_reginput = locinput;
d8319b27
DM
3715 st->cc->u.curlyx.cur = n;
3716 st->cc->u.curlyx.lastloc = locinput;
3717 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3718 REGCP_SET(st->u.whilem.lastcp);
3719 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3720 /*** all unsaved local vars undefined at this point */
3721 if (result) {
d8319b27 3722 regcpblow(st->u.whilem.cp);
3298f257 3723 sayYES;
5f05dabc 3724 }
d8319b27 3725 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3726 regcppop(rex);
d8319b27
DM
3727 st->cc->u.curlyx.cur = n - 1;
3728 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3729 CACHEsayNO;
a0d0e21e
LW
3730 }
3731
3732 /* Prefer scan over next for maximal matching. */
3733
d8319b27
DM
3734 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3735 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3736 st->cc->u.curlyx.cur = n;
3737 st->cc->u.curlyx.lastloc = locinput;
3738 REGCP_SET(st->u.whilem.lastcp);
3739 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3740 /*** all unsaved local vars undefined at this point */
3741 if (result) {
d8319b27 3742 regcpblow(st->u.whilem.cp);
3298f257 3743 sayYES;
5f05dabc 3744 }
d8319b27 3745 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3746 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3747 PL_reginput = locinput;
a3621e74 3748 DEBUG_EXECUTE_r(
c3464db5
DD
3749 PerlIO_printf(Perl_debug_log,
3750 "%*s failed, try continuation...\n",
3280af22 3751 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3752 );
3753 }
9041c2e3 3754 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3755 && !(PL_reg_flags & RF_warned)) {
3280af22 3756 PL_reg_flags |= RF_warned;
9014280d 3757 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3758 "Complex regular subexpression recursion",
3759 REG_INFTY - 1);
a0d0e21e
LW
3760 }
3761
3762 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
3763 st->u.whilem.savecc = st->cc;
3764 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3765 if (st->cc)
d8319b27
DM
3766 st->ln = st->cc->u.curlyx.cur;
3767 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3768 /*** all unsaved local vars undefined at this point */
d8319b27 3769 st->cc = st->u.whilem.savecc;
95b24440 3770 if (result)
3298f257 3771 sayYES;
d8319b27
DM
3772 if (st->cc->u.curlyx.outercc)
3773 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3774 st->cc->u.curlyx.cur = n - 1;
3775 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3776 CACHEsayNO;
a0d0e21e 3777 }
5f66b61c 3778 /* NOTREACHED */
40a82448
DM
3779
3780#undef ST
3781#define ST st->u.branch
3782
3783 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3784 next = scan + ARG(scan);
3785 if (next == scan)
3786 next = NULL;
40a82448
DM
3787 scan = NEXTOPER(scan);
3788 /* FALL THROUGH */
c277df42 3789
40a82448
DM
3790 case BRANCH: /* /(...|A|...)/ */
3791 scan = NEXTOPER(scan); /* scan now points to inner node */
3792 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3793 /* last branch; skip state push and jump direct to node */
3794 continue;
3795 ST.lastparen = *PL_reglastparen;
3796 ST.next_branch = next;
3797 REGCP_SET(ST.cp);
3798 PL_reginput = locinput;
02db2b7b 3799
40a82448
DM
3800 /* Now go into the branch */
3801 PUSH_STATE_GOTO(BRANCH_next, scan);
3802 /* NOTREACHED */
3803
3804 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3805 REGCP_UNWIND(ST.cp);
3806 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3807 PL_regendp[n] = -1;
3808 *PL_reglastparen = n;
3809 scan = ST.next_branch;
3810 /* no more branches? */
3811 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3812 sayNO;
3813 continue; /* execute next BRANCH[J] op */
3814 /* NOTREACHED */
3815
a0d0e21e 3816 case MINMOD:
5d9a96ca 3817 st->minmod = 1;
a0d0e21e 3818 break;
40a82448
DM
3819
3820#undef ST
3821#define ST st->u.curlym
3822
3823 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3824
3825 /* This is an optimisation of CURLYX that enables us to push
3826 * only a single backtracking state, no matter now many matches
3827 * there are in {m,n}. It relies on the pattern being constant
3828 * length, with no parens to influence future backrefs
3829 */
3830
3831 ST.me = scan;
dc45a647 3832 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
3833
3834 /* if paren positive, emulate an OPEN/CLOSE around A */
3835 if (ST.me->flags) {
3836 I32 paren = ST.me->flags;
3837 if (paren > PL_regsize)
3838 PL_regsize = paren;
3839 if (paren > (I32)*PL_reglastparen)
3840 *PL_reglastparen = paren;
c277df42 3841 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 3842 }
40a82448
DM
3843 ST.A = scan;
3844 ST.B = next;
3845 ST.alen = 0;
3846 ST.count = 0;
3847 ST.minmod = st->minmod;
3848 st->minmod = 0;
3849 ST.c1 = CHRTEST_UNINIT;
3850 REGCP_SET(ST.cp);
6407bf3b 3851
40a82448
DM
3852 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3853 goto curlym_do_B;
3854
3855 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 3856 PL_reginput = locinput;
40a82448
DM
3857 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3858 /* NOTREACHED */
5f80c4cf 3859
40a82448
DM
3860 case CURLYM_A: /* we've just matched an A */
3861 locinput = st->locinput;
3862 nextchr = UCHARAT(locinput);
3863
3864 ST.count++;
3865 /* after first match, determine A's length: u.curlym.alen */
3866 if (ST.count == 1) {
3867 if (PL_reg_match_utf8) {
3868 char *s = locinput;
3869 while (s < PL_reginput) {
3870 ST.alen++;
3871 s += UTF8SKIP(s);
3872 }
3873 }
3874 else {
3875 ST.alen = PL_reginput - locinput;
3876 }
3877 if (ST.alen == 0)
3878 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3879 }
0cadcf80
DM
3880 DEBUG_EXECUTE_r(
3881 PerlIO_printf(Perl_debug_log,
40a82448 3882 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
ab3bbdeb 3883 (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
40a82448 3884 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
3885 );
3886
40a82448
DM
3887 locinput = PL_reginput;
3888 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3889 goto curlym_do_A; /* try to match another A */
3890 goto curlym_do_B; /* try to match B */
5f80c4cf 3891
40a82448
DM
3892 case CURLYM_A_fail: /* just failed to match an A */
3893 REGCP_UNWIND(ST.cp);
3894 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3895 sayNO;
0cadcf80 3896
40a82448
DM
3897 curlym_do_B: /* execute the B in /A{m,n}B/ */
3898 PL_reginput = locinput;
3899 if (ST.c1 == CHRTEST_UNINIT) {
3900 /* calculate c1 and c2 for possible match of 1st char
3901 * following curly */
3902 ST.c1 = ST.c2 = CHRTEST_VOID;
3903 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3904 regnode *text_node = ST.B;
3905 if (! HAS_TEXT(text_node))
3906 FIND_NEXT_IMPT(text_node);
3907 if (HAS_TEXT(text_node)
3908 && PL_regkind[OP(text_node)] != REF)
3909 {
3910 ST.c1 = (U8)*STRING(text_node);
3911 ST.c2 =
3912 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3913 ? PL_fold[ST.c1]
3914 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3915 ? PL_fold_locale[ST.c1]
3916 : ST.c1;
c277df42 3917 }
c277df42 3918 }
40a82448
DM
3919 }
3920
3921 DEBUG_EXECUTE_r(
3922 PerlIO_printf(Perl_debug_log,
3923 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
ab3bbdeb 3924 (int)(REPORT_CODE_OFF+(PL_regindent*2)),
40a82448
DM
3925 "", (IV)ST.count)
3926 );
3927 if (ST.c1 != CHRTEST_VOID
3928 && UCHARAT(PL_reginput) != ST.c1
3929 && UCHARAT(PL_reginput) != ST.c2)
3930 {
3931 /* simulate B failing */
3932 state_num = CURLYM_B_fail;
3933 goto reenter_switch;
3934 }
3935
3936 if (ST.me->flags) {
3937 /* mark current A as captured */
3938 I32 paren = ST.me->flags;
3939 if (ST.count) {
3940 PL_regstartp[paren]
3941 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3942 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3943 }
40a82448
DM
3944 else
3945 PL_regendp[paren] = -1;
c277df42 3946 }
40a82448 3947 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 3948 /* NOTREACHED */
40a82448
DM
3949
3950 case CURLYM_B_fail: /* just failed to match a B */
3951 REGCP_UNWIND(ST.cp);
3952 if (ST.minmod) {
3953 if (ST.count == ARG2(ST.me) /* max */)
3954 sayNO;
3955 goto curlym_do_A; /* try to match a further A */
3956 }
3957 /* backtrack one A */
3958 if (ST.count == ARG1(ST.me) /* min */)
3959 sayNO;
3960 ST.count--;
3961 locinput = HOPc(locinput, -ST.alen);
3962 goto curlym_do_B; /* try to match B */
3963
c255a977
DM
3964#undef ST
3965#define ST st->u.curly
40a82448 3966
c255a977
DM
3967#define CURLY_SETPAREN(paren, success) \
3968 if (paren) { \
3969 if (success) { \
3970 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3971 PL_regendp[paren] = locinput - PL_bostr; \
3972 } \
3973 else \
3974 PL_regendp[paren] = -1; \
3975 }
3976
3977 case STAR: /* /A*B/ where A is width 1 */
3978 ST.paren = 0;
3979 ST.min = 0;
3980 ST.max = REG_INFTY;
a0d0e21e
LW
3981 scan = NEXTOPER(scan);
3982 goto repeat;
c255a977
DM
3983 case PLUS: /* /A+B/ where A is width 1 */
3984 ST.paren = 0;
3985 ST.min = 1;
3986 ST.max = REG_INFTY;
c277df42 3987 scan = NEXTOPER(scan);
c255a977
DM
3988 goto repeat;
3989 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3990 ST.paren = scan->flags; /* Which paren to set */
3991 if (ST.paren > PL_regsize)
3992 PL_regsize = ST.paren;
3993 if (ST.paren > (I32)*PL_reglastparen)
3994 *PL_reglastparen = ST.paren;
3995 ST.min = ARG1(scan); /* min to match */
3996 ST.max = ARG2(scan); /* max to match */
3997 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3998 goto repeat;
3999 case CURLY: /* /A{m,n}B/ where A is width 1 */
4000 ST.paren = 0;
4001 ST.min = ARG1(scan); /* min to match */
4002 ST.max = ARG2(scan); /* max to match */
4003 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4004 repeat:
a0d0e21e
LW
4005 /*
4006 * Lookahead to avoid useless match attempts
4007 * when we know what character comes next.
c255a977 4008 *
5f80c4cf
JP
4009 * Used to only do .*x and .*?x, but now it allows
4010 * for )'s, ('s and (?{ ... })'s to be in the way
4011 * of the quantifier and the EXACT-like node. -- japhy
4012 */
4013
c255a977
DM
4014 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4015 sayNO;
cca55fe3 4016 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4017 U8 *s;
4018 regnode *text_node = next;
4019
3dab1dad
YO
4020 if (! HAS_TEXT(text_node))
4021 FIND_NEXT_IMPT(text_node);
5f80c4cf 4022
9e137952 4023 if (! HAS_TEXT(text_node))
c255a977 4024 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4025 else {
3dab1dad 4026 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4027 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4028 goto assume_ok_easy;
cca55fe3 4029 }
be8e71aa
YO
4030 else
4031 s = (U8*)STRING(text_node);
5f80c4cf
JP
4032
4033 if (!UTF) {
c255a977 4034 ST.c2 = ST.c1 = *s;
f65d3ee7 4035 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4036 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4037 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4038 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4039 }
5f80c4cf 4040 else { /* UTF */
f65d3ee7 4041 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4042 STRLEN ulen1, ulen2;
89ebb4a3
JH
4043 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4044 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4045
4046 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4047 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4048#ifdef EBCDIC
4049 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4050 ckWARN(WARN_UTF8) ?
4051 0 : UTF8_ALLOW_ANY);
4052 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4053 ckWARN(WARN_UTF8) ?
4054 0 : UTF8_ALLOW_ANY);
4055#else
c255a977 4056 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4057 uniflags);
c255a977 4058 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4059 uniflags);
4060#endif
5f80c4cf
JP
4061 }
4062 else {
c255a977 4063 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4064 uniflags);
5f80c4cf 4065 }
1aa99e6b
IH
4066 }
4067 }
bbce6d69 4068 }
a0d0e21e 4069 else
c255a977 4070 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4071 assume_ok_easy:
c255a977
DM
4072
4073 ST.A = scan;
4074 ST.B = next;
3280af22 4075 PL_reginput = locinput;
5d9a96ca
DM
4076 if (st->minmod) {
4077 st->minmod = 0;
c255a977 4078 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4079 sayNO;
c255a977 4080 ST.count = ST.min;
a0ed51b3 4081 locinput = PL_reginput;
c255a977
DM
4082 REGCP_SET(ST.cp);
4083 if (ST.c1 == CHRTEST_VOID)
4084 goto curly_try_B_min;
4085
4086 ST.oldloc = locinput;
4087
4088 /* set ST.maxpos to the furthest point along the
4089 * string that could possibly match */
4090 if (ST.max == REG_INFTY) {
4091 ST.maxpos = PL_regeol - 1;
4092 if (do_utf8)
4093 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4094 ST.maxpos--;
4095 }
4096 else if (do_utf8) {
4097 int m = ST.max - ST.min;
4098 for (ST.maxpos = locinput;
4099 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4100 ST.maxpos += UTF8SKIP(ST.maxpos);
4101 }
4102 else {
4103 ST.maxpos = locinput + ST.max - ST.min;
4104 if (ST.maxpos >= PL_regeol)
4105 ST.maxpos = PL_regeol - 1;
4106 }
4107 goto curly_try_B_min_known;
4108
4109 }
4110 else {
4111 ST.count = regrepeat(rex, ST.A, ST.max);
4112 locinput = PL_reginput;
4113 if (ST.count < ST.min)
4114 sayNO;
4115 if ((ST.count > ST.min)
4116 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4117 {
4118 /* A{m,n} must come at the end of the string, there's
4119 * no point in backing off ... */
4120 ST.min = ST.count;
4121 /* ...except that $ and \Z can match before *and* after
4122 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4123 We may back off by one in this case. */
4124 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4125 ST.min--;
4126 }
4127 REGCP_SET(ST.cp);
4128 goto curly_try_B_max;
4129 }
4130 /* NOTREACHED */
4131
4132
4133 case CURLY_B_min_known_fail:
4134 /* failed to find B in a non-greedy match where c1,c2 valid */
4135 if (ST.paren && ST.count)
4136 PL_regendp[ST.paren] = -1;
4137
4138 PL_reginput = locinput; /* Could be reset... */
4139 REGCP_UNWIND(ST.cp);
4140 /* Couldn't or didn't -- move forward. */
4141 ST.oldloc = locinput;
4142 if (do_utf8)
4143 locinput += UTF8SKIP(locinput);
4144 else
4145 locinput++;
4146 ST.count++;
4147 curly_try_B_min_known:
4148 /* find the next place where 'B' could work, then call B */
4149 {
4150 int n;
4151 if (do_utf8) {
4152 n = (ST.oldloc == locinput) ? 0 : 1;
4153 if (ST.c1 == ST.c2) {
4154 STRLEN len;
4155 /* set n to utf8_distance(oldloc, locinput) */
4156 while (locinput <= ST.maxpos &&
4157 utf8n_to_uvchr((U8*)locinput,
4158 UTF8_MAXBYTES, &len,
4159 uniflags) != (UV)ST.c1) {
4160 locinput += len;
4161 n++;
4162 }
1aa99e6b
IH
4163 }
4164 else {
c255a977
DM
4165 /* set n to utf8_distance(oldloc, locinput) */
4166 while (locinput <= ST.maxpos) {
4167 STRLEN len;
4168 const UV c = utf8n_to_uvchr((U8*)locinput,
4169 UTF8_MAXBYTES, &len,
4170 uniflags);
4171 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4172 break;
4173 locinput += len;
4174 n++;
1aa99e6b 4175 }
0fe9bf95
IZ
4176 }
4177 }
c255a977
DM
4178 else {
4179 if (ST.c1 == ST.c2) {
4180 while (locinput <= ST.maxpos &&
4181 UCHARAT(locinput) != ST.c1)
4182 locinput++;
bbce6d69 4183 }
c255a977
DM
4184 else {
4185 while (locinput <= ST.maxpos
4186 && UCHARAT(locinput) != ST.c1
4187 && UCHARAT(locinput) != ST.c2)
4188 locinput++;
a0ed51b3 4189 }
c255a977
DM
4190 n = locinput - ST.oldloc;
4191 }
4192 if (locinput > ST.maxpos)
4193 sayNO;
4194 /* PL_reginput == oldloc now */
4195 if (n) {
4196 ST.count += n;
4197 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4198 sayNO;
a0d0e21e 4199 }
c255a977
DM
4200 PL_reginput = locinput;
4201 CURLY_SETPAREN(ST.paren, ST.count);
4202 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4203 }
c255a977
DM
4204 /* NOTREACHED */
4205
4206
4207 case CURLY_B_min_fail:
4208 /* failed to find B in a non-greedy match where c1,c2 invalid */
4209 if (ST.paren && ST.count)
4210 PL_regendp[ST.paren] = -1;
4211
4212 REGCP_UNWIND(ST.cp);
4213 /* failed -- move forward one */
4214 PL_reginput = locinput;
4215 if (regrepeat(rex, ST.A, 1)) {
4216 ST.count++;
a0ed51b3 4217 locinput = PL_reginput;
c255a977
DM
4218 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4219 ST.count > 0)) /* count overflow ? */
15272685 4220 {
c255a977
DM
4221 curly_try_B_min:
4222 CURLY_SETPAREN(ST.paren, ST.count);
4223 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4224 }
4225 }
4633a7c4 4226 sayNO;
c255a977
DM
4227 /* NOTREACHED */
4228
4229
4230 curly_try_B_max:
4231 /* a successful greedy match: now try to match B */
4232 {
4233 UV c = 0;
4234 if (ST.c1 != CHRTEST_VOID)
4235 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4236 UTF8_MAXBYTES, 0, uniflags)
466787eb 4237 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4238 /* If it could work, try it. */
4239 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4240 CURLY_SETPAREN(ST.paren, ST.count);
4241 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4242 /* NOTREACHED */
4243 }
4244 }
4245 /* FALL THROUGH */
4246 case CURLY_B_max_fail:
4247 /* failed to find B in a greedy match */
4248 if (ST.paren && ST.count)
4249 PL_regendp[ST.paren] = -1;
4250
4251 REGCP_UNWIND(ST.cp);
4252 /* back up. */
4253 if (--ST.count < ST.min)
4254 sayNO;
4255 PL_reginput = locinput = HOPc(locinput, -1);
4256 goto curly_try_B_max;
4257
4258#undef ST
4259
4260
a0d0e21e 4261 case END:
3b0527fe 4262 if (locinput < reginfo->till) {
a3621e74 4263 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4264 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4265 PL_colors[4],
4266 (long)(locinput - PL_reg_starttry),
3b0527fe 4267 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4268 PL_colors[5]));
4269 sayNO_FINAL; /* Cannot match: too short. */
4270 }
4271 PL_reginput = locinput; /* put where regtry can find it */
4272 sayYES_FINAL; /* Success! */
dad79028
DM
4273
4274 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4275 DEBUG_EXECUTE_r(
4276 PerlIO_printf(Perl_debug_log,
4277 "%*s %ssubpattern success...%s\n",
4278 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4279 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4280 sayYES_FINAL; /* Success! */
4281
40a82448
DM
4282#undef ST
4283#define ST st->u.ifmatch
4284
4285 case SUSPEND: /* (?>A) */
4286 ST.wanted = 1;
9fe1d20c 4287 PL_reginput = locinput;
9041c2e3 4288 goto do_ifmatch;
dad79028 4289
40a82448
DM
4290 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4291 ST.wanted = 0;
dad79028
DM
4292 goto ifmatch_trivial_fail_test;
4293
40a82448
DM
4294 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4295 ST.wanted = 1;
dad79028 4296 ifmatch_trivial_fail_test:
a0ed51b3 4297 if (scan->flags) {
52657f30 4298 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4299 if (!s) {
4300 /* trivial fail */
4301 if (st->logical) {
4302 st->logical = 0;
40a82448 4303 st->sw = 1 - (bool)ST.wanted;
dad79028 4304 }
40a82448 4305 else if (ST.wanted)
dad79028
DM
4306 sayNO;
4307 next = scan + ARG(scan);
4308 if (next == scan)
4309 next = NULL;
4310 break;
4311 }
efb30f32 4312 PL_reginput = s;
a0ed51b3
LW
4313 }
4314 else
4315 PL_reginput = locinput;
4316
c277df42 4317 do_ifmatch:
40a82448
DM
4318 ST.me = scan;
4319 /* execute body of (?...A) */
4320 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4321 /* NOTREACHED */
4322
4323 case IFMATCH_A_fail: /* body of (?...A) failed */
4324 ST.wanted = !ST.wanted;
4325 /* FALL THROUGH */
4326
4327 case IFMATCH_A: /* body of (?...A) succeeded */
4328 if (st->logical) {
4329 st->logical = 0;
4330 st->sw = (bool)ST.wanted;
4331 }
4332 else if (!ST.wanted)
4333 sayNO;
4334
4335 if (OP(ST.me) == SUSPEND)
4336 locinput = PL_reginput;
4337 else {
4338 locinput = PL_reginput = st->locinput;
4339 nextchr = UCHARAT(locinput);
4340 }
4341 scan = ST.me + ARG(ST.me);
4342 if (scan == ST.me)
4343 scan = NULL;
4344 continue; /* execute B */
4345
4346#undef ST
dad79028 4347
c277df42 4348 case LONGJMP:
c277df42
IZ
4349 next = scan + ARG(scan);
4350 if (next == scan)
4351 next = NULL;
a0d0e21e
LW
4352 break;
4353 default:
b900a521 4354 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4355 PTR2UV(scan), OP(scan));
cea2e8a9 4356 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4357 }
95b24440 4358
a0d0e21e 4359 scan = next;
95b24440
DM
4360 continue;
4361 /* NOTREACHED */
4362
40a82448
DM
4363 push_yes_state:
4364 /* push a state that backtracks on success */
4365 st->u.yes.prev_yes_state = yes_state;
4366 yes_state = st;
4367 /* FALL THROUGH */
4368 push_state:
4369 /* push a new regex state, then continue at scan */
4370 {
4371 regmatch_state *newst;
4372
4373 depth++;
ab3bbdeb 4374 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log,
40a82448
DM
4375 "PUSH STATE(%d)\n", depth));
4376 st->locinput = locinput;
4377 newst = st+1;
4378 if (newst > SLAB_LAST(PL_regmatch_slab))
4379 newst = S_push_slab(aTHX);
4380 PL_regmatch_state = newst;
4381 newst->cc = st->cc;
4382 /* XXX probably don't need to initialise these */
4383 newst->minmod = 0;
4384 newst->sw = 0;
4385 newst->logical = 0;
4386
786e8c11 4387
40a82448
DM
4388 locinput = PL_reginput;
4389 nextchr = UCHARAT(locinput);
4390 st = newst;
4391 continue;
4392 /* NOTREACHED */
4393 }
4394
95b24440
DM
4395 /* simulate recursively calling regmatch(), but without actually
4396 * recursing - ie save the current state on the heap rather than on
4397 * the stack, then re-enter the loop. This avoids complex regexes
4398 * blowing the processor stack */
4399
4400 start_recurse:
4401 {
5d9a96ca
DM
4402 /* push new state */
4403 regmatch_state *oldst = st;
4404
4405 depth++;
40a82448 4406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4407
4408 /* grab the next free state slot */
4409 st++;
86545054 4410 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4411 st = S_push_slab(aTHX);
4412 PL_regmatch_state = st;
4413
4414 oldst->next = next;
4415 oldst->n = n;
4416 oldst->locinput = locinput;
5d9a96ca
DM
4417
4418 st->cc = oldst->cc;
95b24440
DM
4419 locinput = PL_reginput;
4420 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4421 st->minmod = 0;
4422 st->sw = 0;
4423 st->logical = 0;
786e8c11 4424
95b24440
DM
4425#ifdef DEBUGGING
4426 PL_regindent++;
4427#endif
4428 }
a0d0e21e 4429 }
a687059c 4430
aa283a38
DM
4431
4432
a0d0e21e
LW
4433 /*
4434 * We get here only if there's trouble -- normally "case END" is
4435 * the terminating point.
4436 */
cea2e8a9 4437 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4438 /*NOTREACHED*/
4633a7c4
LW
4439 sayNO;
4440
7821416a 4441yes_final:
77cb431f
DM
4442
4443 if (yes_state) {
4444 /* we have successfully completed a subexpression, but we must now
4445 * pop to the state marked by yes_state and continue from there */
4446
4447 assert(st != yes_state);
4448 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4449 || yes_state > SLAB_LAST(PL_regmatch_slab))
4450 {
4451 /* not in this slab, pop slab */
4452 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4453 PL_regmatch_slab = PL_regmatch_slab->prev;
4454 st = SLAB_LAST(PL_regmatch_slab);
4455 }
4456 depth -= (st - yes_state);
e4f74956
YO
4457 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%"UVuf"..%"UVuf")\n",
4458 (UV)(depth+1), (UV)(depth+(st - yes_state))));
77cb431f
DM
4459 st = yes_state;
4460 yes_state = st->u.yes.prev_yes_state;
4461 PL_regmatch_state = st;
4462
4463 switch (st->resume_state) {
40a82448
DM
4464 case IFMATCH_A:
4465 case CURLYM_A:
40a82448
DM
4466 case EVAL_A:
4467 state_num = st->resume_state;
4468 goto reenter_switch;
4469
4470 case CURLYM_B:
166ba7cd
DM
4471 case BRANCH_next:
4472 case TRIE_next:
c255a977 4473 case CURLY_B_max:
77cb431f 4474 default:
40a82448 4475 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4476 }
4477 }
4478
a3621e74 4479 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4480 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4481yes:
4482#ifdef DEBUGGING
3280af22 4483 PL_regindent--;
4633a7c4 4484#endif
02db2b7b 4485
95b24440 4486 result = 1;
aa283a38 4487 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4488 * will disappear when REGFMATCH goes */
aa283a38
DM
4489 if (depth) {
4490 /* restore previous state and re-enter */
ab3bbdeb 4491 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
40a82448
DM
4492 depth--;
4493 st--;
4494 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4495 PL_regmatch_slab = PL_regmatch_slab->prev;
4496 st = SLAB_LAST(PL_regmatch_slab);
4497 }
4498 PL_regmatch_state = st;
4499 scan = st->scan;
4500 next = st->next;
4501 n = st->n;
4502 locinput= st->locinput;
4503 nextchr = UCHARAT(locinput);
aa283a38
DM
4504
4505 switch (st->resume_state) {
aa283a38
DM
4506 case resume_CURLYX:
4507 goto resume_point_CURLYX;
4508 case resume_WHILEM1:
4509 goto resume_point_WHILEM1;
4510 case resume_WHILEM2:
4511 goto resume_point_WHILEM2;
4512 case resume_WHILEM3:
4513 goto resume_point_WHILEM3;
4514 case resume_WHILEM4:
4515 goto resume_point_WHILEM4;
4516 case resume_WHILEM5:
4517 goto resume_point_WHILEM5;
4518 case resume_WHILEM6:
4519 goto resume_point_WHILEM6;
77cb431f 4520
166ba7cd 4521 case TRIE_next:
40a82448
DM
4522 case CURLYM_A:
4523 case CURLYM_B:
4524 case EVAL_A:
4525 case IFMATCH_A:
4526 case BRANCH_next:
c255a977
DM
4527 case CURLY_B_max:
4528 case CURLY_B_min:
4529 case CURLY_B_min_known:
40a82448
DM
4530 break;
4531
aa283a38
DM
4532 default:
4533 Perl_croak(aTHX_ "regexp resume memory corruption");
4534 }
4535 }
4536 goto final_exit;
4633a7c4
LW
4537
4538no:
a3621e74 4539 DEBUG_EXECUTE_r(
7821416a 4540 PerlIO_printf(Perl_debug_log,
786e8c11
YO
4541 "%*s %sfailed...%s\n",
4542 REPORT_CODE_OFF+PL_regindent*2, "",
4543 PL_colors[4], PL_colors[5])
7821416a 4544 );
7821416a
IZ
4545no_final:
4546do_no:
aa283a38 4547
4633a7c4 4548#ifdef DEBUGGING
3280af22 4549 PL_regindent--;
4633a7c4 4550#endif
95b24440 4551 result = 0;
5d9a96ca 4552
aa283a38
DM
4553 if (depth) {
4554 /* there's a previous state to backtrack to */
ab3bbdeb 4555 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
40a82448
DM
4556 depth--;
4557 st--;
4558 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4559 PL_regmatch_slab = PL_regmatch_slab->prev;
4560 st = SLAB_LAST(PL_regmatch_slab);
4561 }
4562 PL_regmatch_state = st;
4563 scan = st->scan;
4564 next = st->next;
4565 n = st->n;
4566 locinput= st->locinput;
4567 nextchr = UCHARAT(locinput);
4568
5d9a96ca 4569 switch (st->resume_state) {
95b24440
DM
4570 case resume_CURLYX:
4571 goto resume_point_CURLYX;
4572 case resume_WHILEM1:
4573 goto resume_point_WHILEM1;
4574 case resume_WHILEM2:
4575 goto resume_point_WHILEM2;
4576 case resume_WHILEM3:
4577 goto resume_point_WHILEM3;
4578 case resume_WHILEM4:
4579 goto resume_point_WHILEM4;
4580 case resume_WHILEM5:
4581 goto resume_point_WHILEM5;
4582 case resume_WHILEM6:
4583 goto resume_point_WHILEM6;
dad79028 4584
166ba7cd 4585 case TRIE_next:
40a82448
DM
4586 case EVAL_A:
4587 case BRANCH_next:
4588 case CURLYM_A:
4589 case CURLYM_B:
4590 case IFMATCH_A:
c255a977
DM
4591 case CURLY_B_max:
4592 case CURLY_B_min:
4593 case CURLY_B_min_known:
40a82448
DM
4594 if (yes_state == st)
4595 yes_state = st->u.yes.prev_yes_state;
4596 state_num = st->resume_state + 1; /* failure = success + 1 */
4597 goto reenter_switch;
dad79028 4598
95b24440
DM
4599 default:
4600 Perl_croak(aTHX_ "regexp resume memory corruption");
4601 }
95b24440 4602 }
aa283a38
DM
4603
4604final_exit:
4605
5d9a96ca
DM
4606 /* restore original high-water mark */
4607 PL_regmatch_slab = orig_slab;
4608 PL_regmatch_state = orig_state;
4609
4610 /* free all slabs above current one */
4611 if (orig_slab->next) {
c4fd8992 4612 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4613 orig_slab->next = NULL;
4614 while (sl) {
c4fd8992 4615 regmatch_slab * const osl = sl;
5d9a96ca 4616 sl = sl->next;
ad65c075 4617 Safefree(osl);
5d9a96ca
DM
4618 }
4619 }
4620
95b24440
DM
4621 return result;
4622
a687059c
LW
4623}
4624
4625/*
4626 - regrepeat - repeatedly match something simple, report how many
4627 */
4628/*
4629 * [This routine now assumes that it will only match on things of length 1.
4630 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4631 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4632 */
76e3520e 4633STATIC I32
32fc9b6a 4634S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4635{
27da23d5 4636 dVAR;
a0d0e21e 4637 register char *scan;
a0d0e21e 4638 register I32 c;
3280af22 4639 register char *loceol = PL_regeol;
a0ed51b3 4640 register I32 hardcount = 0;
53c4c00c 4641 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4642
3280af22 4643 scan = PL_reginput;
faf11cac
HS
4644 if (max == REG_INFTY)
4645 max = I32_MAX;
4646 else if (max < loceol - scan)
7f596f4c 4647 loceol = scan + max;
a0d0e21e 4648 switch (OP(p)) {
22c35a8c 4649 case REG_ANY:
1aa99e6b 4650 if (do_utf8) {
ffc61ed2 4651 loceol = PL_regeol;
1aa99e6b 4652 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4653 scan += UTF8SKIP(scan);
4654 hardcount++;
4655 }
4656 } else {
4657 while (scan < loceol && *scan != '\n')
4658 scan++;
a0ed51b3
LW
4659 }
4660 break;
ffc61ed2 4661 case SANY:
def8e4ea
JH
4662 if (do_utf8) {
4663 loceol = PL_regeol;
a0804c9e 4664 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4665 scan += UTF8SKIP(scan);
4666 hardcount++;
4667 }
4668 }
4669 else
4670 scan = loceol;
a0ed51b3 4671 break;
f33976b4
DB
4672 case CANY:
4673 scan = loceol;
4674 break;
090f7165
JH
4675 case EXACT: /* length of string is 1 */
4676 c = (U8)*STRING(p);
4677 while (scan < loceol && UCHARAT(scan) == c)
4678 scan++;
bbce6d69 4679 break;
4680 case EXACTF: /* length of string is 1 */
cd439c50 4681 c = (U8)*STRING(p);
bbce6d69 4682 while (scan < loceol &&
22c35a8c 4683 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4684 scan++;
4685 break;
4686 case EXACTFL: /* length of string is 1 */
3280af22 4687 PL_reg_flags |= RF_tainted;
cd439c50 4688 c = (U8)*STRING(p);
bbce6d69 4689 while (scan < loceol &&
22c35a8c 4690 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4691 scan++;
4692 break;
4693 case ANYOF:
ffc61ed2
JH
4694 if (do_utf8) {
4695 loceol = PL_regeol;
cfc92286 4696 while (hardcount < max && scan < loceol &&
32fc9b6a 4697 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4698 scan += UTF8SKIP(scan);
4699 hardcount++;
4700 }
4701 } else {
32fc9b6a 4702 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4703 scan++;
4704 }
a0d0e21e
LW
4705 break;
4706 case ALNUM:
1aa99e6b 4707 if (do_utf8) {
ffc61ed2 4708 loceol = PL_regeol;
1a4fad37 4709 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4710 while (hardcount < max && scan < loceol &&
3568d838 4711 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4712 scan += UTF8SKIP(scan);
4713 hardcount++;
4714 }
4715 } else {
4716 while (scan < loceol && isALNUM(*scan))
4717 scan++;
a0ed51b3
LW
4718 }
4719 break;
bbce6d69 4720 case ALNUML:
3280af22 4721 PL_reg_flags |= RF_tainted;
1aa99e6b 4722 if (do_utf8) {
ffc61ed2 4723 loceol = PL_regeol;
1aa99e6b
IH
4724 while (hardcount < max && scan < loceol &&
4725 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4726 scan += UTF8SKIP(scan);
4727 hardcount++;
4728 }
4729 } else {
4730 while (scan < loceol && isALNUM_LC(*scan))
4731 scan++;
a0ed51b3
LW
4732 }
4733 break;
a0d0e21e 4734 case NALNUM:
1aa99e6b 4735 if (do_utf8) {
ffc61ed2 4736 loceol = PL_regeol;
1a4fad37 4737 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4738 while (hardcount < max && scan < loceol &&
3568d838 4739 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4740 scan += UTF8SKIP(scan);
4741 hardcount++;
4742 }
4743 } else {
4744 while (scan < loceol && !isALNUM(*scan))
4745 scan++;
a0ed51b3
LW
4746 }
4747 break;
bbce6d69 4748 case NALNUML:
3280af22 4749 PL_reg_flags |= RF_tainted;
1aa99e6b 4750 if (do_utf8) {
ffc61ed2 4751 loceol = PL_regeol;
1aa99e6b
IH
4752 while (hardcount < max && scan < loceol &&
4753 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4754 scan += UTF8SKIP(scan);
4755 hardcount++;
4756 }
4757 } else {
4758 while (scan < loceol && !isALNUM_LC(*scan))
4759 scan++;
a0ed51b3
LW
4760 }
4761 break;
a0d0e21e 4762 case SPACE:
1aa99e6b 4763 if (do_utf8) {
ffc61ed2 4764 loceol = PL_regeol;
1a4fad37 4765 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4766 while (hardcount < max && scan < loceol &&
3568d838
JH
4767 (*scan == ' ' ||
4768 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4769 scan += UTF8SKIP(scan);
4770 hardcount++;
4771 }
4772 } else {
4773 while (scan < loceol && isSPACE(*scan))
4774 scan++;
a0ed51b3
LW
4775 }
4776 break;
bbce6d69 4777 case SPACEL:
3280af22 4778 PL_reg_flags |= RF_tainted;
1aa99e6b 4779 if (do_utf8) {
ffc61ed2 4780 loceol = PL_regeol;
1aa99e6b 4781 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4782 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4783 scan += UTF8SKIP(scan);
4784 hardcount++;
4785 }
4786 } else {
4787 while (scan < loceol && isSPACE_LC(*scan))
4788 scan++;
a0ed51b3
LW
4789 }
4790 break;
a0d0e21e 4791 case NSPACE:
1aa99e6b 4792 if (do_utf8) {
ffc61ed2 4793 loceol = PL_regeol;
1a4fad37 4794 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4795 while (hardcount < max && scan < loceol &&
3568d838
JH
4796 !(*scan == ' ' ||
4797 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4798 scan += UTF8SKIP(scan);
4799 hardcount++;
4800 }
4801 } else {
4802 while (scan < loceol && !isSPACE(*scan))
4803 scan++;
4804 break;
a0ed51b3 4805 }
bbce6d69 4806 case NSPACEL:
3280af22 4807 PL_reg_flags |= RF_tainted;
1aa99e6b 4808 if (do_utf8) {
ffc61ed2 4809 loceol = PL_regeol;
1aa99e6b 4810 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4811 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4812 scan += UTF8SKIP(scan);
4813 hardcount++;
4814 }
4815 } else {
4816 while (scan < loceol && !isSPACE_LC(*scan))
4817 scan++;
a0ed51b3
LW
4818 }
4819 break;
a0d0e21e 4820 case DIGIT:
1aa99e6b 4821 if (do_utf8) {
ffc61ed2 4822 loceol = PL_regeol;
1a4fad37 4823 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4824 while (hardcount < max && scan < loceol &&
3568d838 4825 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4826 scan += UTF8SKIP(scan);
4827 hardcount++;
4828 }
4829 } else {
4830 while (scan < loceol && isDIGIT(*scan))
4831 scan++;
a0ed51b3
LW
4832 }
4833 break;
a0d0e21e 4834 case NDIGIT:
1aa99e6b 4835 if (do_utf8) {
ffc61ed2 4836 loceol = PL_regeol;
1a4fad37 4837 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4838 while (hardcount < max && scan < loceol &&
3568d838 4839 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4840 scan += UTF8SKIP(scan);
4841 hardcount++;
4842 }
4843 } else {
4844 while (scan < loceol && !isDIGIT(*scan))
4845 scan++;
a0ed51b3
LW
4846 }
4847 break;
a0d0e21e
LW
4848 default: /* Called on something of 0 width. */
4849 break; /* So match right here or not at all. */
4850 }
a687059c 4851
a0ed51b3
LW
4852 if (hardcount)
4853 c = hardcount;
4854 else
4855 c = scan - PL_reginput;
3280af22 4856 PL_reginput = scan;
a687059c 4857
a3621e74 4858 DEBUG_r({
e68ec53f 4859 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4860 DEBUG_EXECUTE_r({
e68ec53f
YO
4861 SV * const prop = sv_newmortal();
4862 regprop(prog, prop, p);
4863 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4864 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4865 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4866 });
be8e71aa 4867 });
9041c2e3 4868
a0d0e21e 4869 return(c);
a687059c
LW
4870}
4871
c277df42 4872
be8e71aa 4873#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4874/*
ffc61ed2
JH
4875- regclass_swash - prepare the utf8 swash
4876*/
4877
4878SV *
32fc9b6a 4879Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4880{
97aff369 4881 dVAR;
9e55ce06
JH
4882 SV *sw = NULL;
4883 SV *si = NULL;
4884 SV *alt = NULL;
3dab1dad 4885 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 4886
4f639d21 4887 if (data && data->count) {
a3b680e6 4888 const U32 n = ARG(node);
ffc61ed2 4889
4f639d21
DM
4890 if (data->what[n] == 's') {
4891 SV * const rv = (SV*)data->data[n];
890ce7af 4892 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4893 SV **const ary = AvARRAY(av);
9e55ce06 4894 SV **a, **b;
9041c2e3 4895
711a919c 4896 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4897 * documentation of these array elements. */
4898
b11f357e 4899 si = *ary;
8f7f7219 4900 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4901 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4902
ffc61ed2
JH
4903 if (a)
4904 sw = *a;
4905 else if (si && doinit) {
4906 sw = swash_init("utf8", "", si, 1, 0);
4907 (void)av_store(av, 1, sw);
4908 }
9e55ce06
JH
4909 if (b)
4910 alt = *b;
ffc61ed2
JH
4911 }
4912 }
4913
9e55ce06
JH
4914 if (listsvp)
4915 *listsvp = si;
4916 if (altsvp)
4917 *altsvp = alt;
ffc61ed2
JH
4918
4919 return sw;
4920}
76234dfb 4921#endif
ffc61ed2
JH
4922
4923/*
ba7b4546 4924 - reginclass - determine if a character falls into a character class
832705d4
JH
4925
4926 The n is the ANYOF regnode, the p is the target string, lenp
4927 is pointer to the maximum length of how far to go in the p
4928 (if the lenp is zero, UTF8SKIP(p) is used),
4929 do_utf8 tells whether the target string is in UTF-8.
4930
bbce6d69 4931 */
4932
76e3520e 4933STATIC bool
32fc9b6a 4934S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4935{
27da23d5 4936 dVAR;
a3b680e6 4937 const char flags = ANYOF_FLAGS(n);
bbce6d69 4938 bool match = FALSE;
cc07378b 4939 UV c = *p;
ae9ddab8 4940 STRLEN len = 0;
9e55ce06 4941 STRLEN plen;
1aa99e6b 4942
19f67299
TS
4943 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4944 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
4945 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4946 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
98fbe989
JH
4947 if (len == (STRLEN)-1) {
4948 int i;
4949 PerlIO_printf(Perl_debug_log, "utf8: ");
4950 for (i = 0; i < 4; i++)
4951 PerlIO_printf(Perl_debug_log, "\\x%02x", p[i]);
4952 PerlIO_printf(Perl_debug_log, "\n");
4953 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "Malformed UTF-8 character");
4954 /* Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); */
4955 }
19f67299 4956 }
bbce6d69 4957
0f0076b4 4958 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4959 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4960 if (lenp)
4961 *lenp = 0;
ffc61ed2 4962 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4963 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4964 match = TRUE;
bbce6d69 4965 }
3568d838 4966 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4967 match = TRUE;
ffc61ed2 4968 if (!match) {
9e55ce06 4969 AV *av;
32fc9b6a 4970 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4971
4972 if (sw) {
3568d838 4973 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4974 match = TRUE;
4975 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4976 if (!match && lenp && av) {
4977 I32 i;
9e55ce06 4978 for (i = 0; i <= av_len(av); i++) {
890ce7af 4979 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4980 STRLEN len;
890ce7af 4981 const char * const s = SvPV_const(sv, len);
9e55ce06 4982
061b10df 4983 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4984 *lenp = len;
4985 match = TRUE;
4986 break;
4987 }
4988 }
4989 }
4990 if (!match) {
89ebb4a3 4991 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4992 STRLEN tmplen;
4993
9e55ce06
JH
4994 to_utf8_fold(p, tmpbuf, &tmplen);
4995 if (swash_fetch(sw, tmpbuf, do_utf8))
4996 match = TRUE;
4997 }
ffc61ed2
JH
4998 }
4999 }
bbce6d69 5000 }
9e55ce06 5001 if (match && lenp && *lenp == 0)
0f0076b4 5002 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5003 }
1aa99e6b 5004 if (!match && c < 256) {
ffc61ed2
JH
5005 if (ANYOF_BITMAP_TEST(n, c))
5006 match = TRUE;
5007 else if (flags & ANYOF_FOLD) {
eb160463 5008 U8 f;
a0ed51b3 5009
ffc61ed2
JH
5010 if (flags & ANYOF_LOCALE) {
5011 PL_reg_flags |= RF_tainted;
5012 f = PL_fold_locale[c];
5013 }
5014 else
5015 f = PL_fold[c];
5016 if (f != c && ANYOF_BITMAP_TEST(n, f))
5017 match = TRUE;
5018 }
5019
5020 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5021 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5022 if (
5023 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5024 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5025 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5026 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5027 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5028 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5029 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5030 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5031 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5032 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5033 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5034 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5035 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5036 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5037 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5038 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5039 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5040 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5041 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5042 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5043 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5044 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5045 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5046 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5047 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5048 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5049 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5050 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5051 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5052 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5053 ) /* How's that for a conditional? */
5054 {
5055 match = TRUE;
5056 }
a0ed51b3 5057 }
a0ed51b3
LW
5058 }
5059
a0ed51b3
LW
5060 return (flags & ANYOF_INVERT) ? !match : match;
5061}
161b471a 5062
dfe13c55 5063STATIC U8 *
0ce71af7 5064S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5065{
97aff369 5066 dVAR;
a0ed51b3 5067 if (off >= 0) {
1aa99e6b 5068 while (off-- && s < lim) {
ffc61ed2 5069 /* XXX could check well-formedness here */
a0ed51b3 5070 s += UTF8SKIP(s);
ffc61ed2 5071 }
a0ed51b3
LW
5072 }
5073 else {
5074 while (off++) {
1aa99e6b 5075 if (s > lim) {
a0ed51b3 5076 s--;
ffc61ed2 5077 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5078 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5079 s--;
ffc61ed2
JH
5080 }
5081 /* XXX could check well-formedness here */
a0ed51b3
LW
5082 }
5083 }
5084 }
5085 return s;
5086}
161b471a 5087
dfe13c55 5088STATIC U8 *
0ce71af7 5089S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5090{
97aff369 5091 dVAR;
a0ed51b3 5092 if (off >= 0) {
1aa99e6b 5093 while (off-- && s < lim) {
ffc61ed2 5094 /* XXX could check well-formedness here */
a0ed51b3 5095 s += UTF8SKIP(s);
ffc61ed2 5096 }
a0ed51b3 5097 if (off >= 0)
3dab1dad 5098 return NULL;
a0ed51b3
LW
5099 }
5100 else {
5101 while (off++) {
1aa99e6b 5102 if (s > lim) {
a0ed51b3 5103 s--;
ffc61ed2 5104 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5105 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5106 s--;
ffc61ed2
JH
5107 }
5108 /* XXX could check well-formedness here */
a0ed51b3
LW
5109 }
5110 else
5111 break;
5112 }
5113 if (off <= 0)
3dab1dad 5114 return NULL;
a0ed51b3
LW
5115 }
5116 return s;
5117}
51371543 5118
51371543 5119static void
acfe0abc 5120restore_pos(pTHX_ void *arg)
51371543 5121{
97aff369 5122 dVAR;
097eb12c 5123 regexp * const rex = (regexp *)arg;
51371543
GS
5124 if (PL_reg_eval_set) {
5125 if (PL_reg_oldsaved) {
4f639d21
DM
5126 rex->subbeg = PL_reg_oldsaved;
5127 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5128#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5129 rex->saved_copy = PL_nrs;
ed252734 5130#endif
4f639d21 5131 RX_MATCH_COPIED_on(rex);
51371543
GS
5132 }
5133 PL_reg_magic->mg_len = PL_reg_oldpos;
5134 PL_reg_eval_set = 0;
5135 PL_curpm = PL_reg_oldcurpm;
5136 }
5137}
33b8afdf
JH
5138
5139STATIC void
5140S_to_utf8_substr(pTHX_ register regexp *prog)
5141{
33b8afdf 5142 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5143 SV* const sv = newSVsv(prog->float_substr);
5144 prog->float_utf8 = sv;
33b8afdf
JH
5145 sv_utf8_upgrade(sv);
5146 if (SvTAIL(prog->float_substr))
5147 SvTAIL_on(sv);
5148 if (prog->float_substr == prog->check_substr)
5149 prog->check_utf8 = sv;
5150 }
5151 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5152 SV* const sv = newSVsv(prog->anchored_substr);
5153 prog->anchored_utf8 = sv;
33b8afdf
JH
5154 sv_utf8_upgrade(sv);
5155 if (SvTAIL(prog->anchored_substr))
5156 SvTAIL_on(sv);
5157 if (prog->anchored_substr == prog->check_substr)
5158 prog->check_utf8 = sv;
5159 }
5160}
5161
5162STATIC void
5163S_to_byte_substr(pTHX_ register regexp *prog)
5164{
97aff369 5165 dVAR;
33b8afdf 5166 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5167 SV* sv = newSVsv(prog->float_utf8);
5168 prog->float_substr = sv;
33b8afdf
JH
5169 if (sv_utf8_downgrade(sv, TRUE)) {
5170 if (SvTAIL(prog->float_utf8))
5171 SvTAIL_on(sv);
5172 } else {
5173 SvREFCNT_dec(sv);
5174 prog->float_substr = sv = &PL_sv_undef;
5175 }
5176 if (prog->float_utf8 == prog->check_utf8)
5177 prog->check_substr = sv;
5178 }
5179 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5180 SV* sv = newSVsv(prog->anchored_utf8);
5181 prog->anchored_substr = sv;
33b8afdf
JH
5182 if (sv_utf8_downgrade(sv, TRUE)) {
5183 if (SvTAIL(prog->anchored_utf8))
5184 SvTAIL_on(sv);
5185 } else {
5186 SvREFCNT_dec(sv);
5187 prog->anchored_substr = sv = &PL_sv_undef;
5188 }
5189 if (prog->anchored_utf8 == prog->check_utf8)
5190 prog->check_substr = sv;
5191 }
5192}
66610fdd
RGS
5193
5194/*
5195 * Local variables:
5196 * c-indentation-style: bsd
5197 * c-basic-offset: 4
5198 * indent-tabs-mode: t
5199 * End:
5200 *
37442d52
RGS
5201 * ex: set ts=8 sts=4 sw=4 noet:
5202 */