This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #38892] PATCH: remove spurious warning from Benchmark.pm
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
32fc9b6a 110#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
52657f30
AL
119#define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
120 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
121 : (U8*)(pos + off)))
122#define HOPBACKc(pos, off) ((char*) \
123 ((PL_reg_match_utf8) \
124 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
efb30f32
HS
125 : (pos - off >= PL_bostr) \
126 ? (U8*)(pos - off) \
52657f30 127 : (U8*)NULL) \
efb30f32 128)
efb30f32 129
1aa99e6b 130#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c 131#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 132#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 133
1a4fad37
AL
134#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
135 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
136#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
137#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
138#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
139#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 140
5f80c4cf 141/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
142#define JUMPABLE(rn) ( \
143 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
144 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
145 OP(rn) == PLUS || OP(rn) == MINMOD || \
146 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
147)
148
cca55fe3
JP
149#define HAS_TEXT(rn) ( \
150 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
151)
e2d8ce26 152
a84d97b6
HS
153/*
154 Search for mandatory following text node; for lookahead, the text must
155 follow but for lookbehind (rn->flags != 0) we skip to the next step.
156*/
cca55fe3 157#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 158 while (JUMPABLE(rn)) \
a84d97b6 159 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 160 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
161 else if (OP(rn) == PLUS) \
162 rn = NEXTOPER(rn); \
a84d97b6
HS
163 else if (OP(rn) == IFMATCH) \
164 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 165 else rn += NEXT_OFF(rn); \
5f80c4cf 166} STMT_END
74750237 167
acfe0abc 168static void restore_pos(pTHX_ void *arg);
51371543 169
76e3520e 170STATIC CHECKPOINT
cea2e8a9 171S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 172{
97aff369 173 dVAR;
a3b680e6 174 const int retval = PL_savestack_ix;
b1ce53c5 175#define REGCP_PAREN_ELEMS 4
a3b680e6 176 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
177 int p;
178
e49a9654
IH
179 if (paren_elems_to_push < 0)
180 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
181
a01268b5 182#define REGCP_OTHER_ELEMS 6
4b3c1a47 183 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 184 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 185/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
186 SSPUSHINT(PL_regendp[p]);
187 SSPUSHINT(PL_regstartp[p]);
3280af22 188 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
189 SSPUSHINT(p);
190 }
b1ce53c5 191/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
192 SSPUSHINT(PL_regsize);
193 SSPUSHINT(*PL_reglastparen);
a01268b5 194 SSPUSHINT(*PL_reglastcloseparen);
3280af22 195 SSPUSHPTR(PL_reginput);
41123dfd
JH
196#define REGCP_FRAME_ELEMS 2
197/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
198 * are needed for the regexp context stack bookkeeping. */
199 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 200 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 201
a0d0e21e
LW
202 return retval;
203}
204
c277df42 205/* These are needed since we do not localize EVAL nodes: */
a3621e74 206# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 207 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 208 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 209
a3621e74 210# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 211 PerlIO_printf(Perl_debug_log, \
faccc32b 212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 213 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 214
76e3520e 215STATIC char *
097eb12c 216S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 217{
97aff369 218 dVAR;
b1ce53c5 219 I32 i;
a0d0e21e 220 char *input;
b1ce53c5 221
a3621e74
YO
222 GET_RE_DEBUG_FLAGS_DECL;
223
b1ce53c5 224 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 225 i = SSPOPINT;
b1ce53c5
JH
226 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
227 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 228 input = (char *) SSPOPPTR;
a01268b5 229 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
230 *PL_reglastparen = SSPOPINT;
231 PL_regsize = SSPOPINT;
b1ce53c5
JH
232
233 /* Now restore the parentheses context. */
41123dfd
JH
234 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
235 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 236 I32 tmps;
097eb12c 237 U32 paren = (U32)SSPOPINT;
3280af22 238 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
239 PL_regstartp[paren] = SSPOPINT;
240 tmps = SSPOPINT;
3280af22
NIS
241 if (paren <= *PL_reglastparen)
242 PL_regendp[paren] = tmps;
a3621e74 243 DEBUG_EXECUTE_r(
c3464db5 244 PerlIO_printf(Perl_debug_log,
b900a521 245 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 246 (UV)paren, (IV)PL_regstartp[paren],
b900a521 247 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 248 (IV)PL_regendp[paren],
3280af22 249 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 250 );
a0d0e21e 251 }
a3621e74 252 DEBUG_EXECUTE_r(
4f639d21 253 if ((I32)(*PL_reglastparen + 1) <= rex->nparens) {
c3464db5 254 PerlIO_printf(Perl_debug_log,
faccc32b 255 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 256 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
257 }
258 );
daf18116 259#if 1
dafc8851
JH
260 /* It would seem that the similar code in regtry()
261 * already takes care of this, and in fact it is in
262 * a better location to since this code can #if 0-ed out
263 * but the code in regtry() is needed or otherwise tests
264 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
265 * (as of patchlevel 7877) will fail. Then again,
266 * this code seems to be necessary or otherwise
267 * building DynaLoader will fail:
268 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
269 * --jhi */
097eb12c
AL
270 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
271 if (i > PL_regsize)
272 PL_regstartp[i] = -1;
273 PL_regendp[i] = -1;
a0d0e21e 274 }
dafc8851 275#endif
a0d0e21e
LW
276 return input;
277}
278
02db2b7b 279#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 280
95b24440 281#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
282 if (paren) { \
283 if (n) { \
284 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
285 PL_regendp[paren] = input - PL_bostr; \
286 } \
287 else \
288 PL_regendp[paren] = -1; \
289 } \
95b24440
DM
290 REGMATCH(next, where); \
291 if (result) \
29d1e993
HS
292 sayYES; \
293 if (paren && n) \
294 PL_regendp[paren] = -1; \
295}
296
297
a687059c 298/*
e50aee73 299 * pregexec and friends
a687059c
LW
300 */
301
302/*
c277df42 303 - pregexec - match a regexp against a string
a687059c 304 */
c277df42 305I32
864dbfa3 306Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 307 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
308/* strend: pointer to null at end of string */
309/* strbeg: real beginning of string */
310/* minend: end of match must be >=minend after stringarg. */
311/* nosave: For optimizations. */
312{
313 return
9041c2e3 314 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
315 nosave ? 0 : REXEC_COPY_STR);
316}
0f5d15d6 317
22e551b9 318
9041c2e3 319/*
cad2e5aa
JH
320 * Need to implement the following flags for reg_anch:
321 *
322 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
323 * USE_INTUIT_ML
324 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
325 * INTUIT_AUTORITATIVE_ML
326 * INTUIT_ONCE_NOML - Intuit can match in one location only.
327 * INTUIT_ONCE_ML
328 *
329 * Another flag for this function: SECOND_TIME (so that float substrs
330 * with giant delta may be not rechecked).
331 */
332
333/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
334
3f7c398e 335/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
336 Otherwise, only SvCUR(sv) is used to get strbeg. */
337
338/* XXXX We assume that strpos is strbeg unless sv. */
339
6eb5f6b9
JH
340/* XXXX Some places assume that there is a fixed substring.
341 An update may be needed if optimizer marks as "INTUITable"
342 RExen without fixed substrings. Similarly, it is assumed that
343 lengths of all the strings are no more than minlen, thus they
344 cannot come from lookahead.
345 (Or minlen should take into account lookahead.) */
346
2c2d71f5
JH
347/* A failure to find a constant substring means that there is no need to make
348 an expensive call to REx engine, thus we celebrate a failure. Similarly,
349 finding a substring too deep into the string means that less calls to
30944b6d
IZ
350 regtry() should be needed.
351
352 REx compiler's optimizer found 4 possible hints:
353 a) Anchored substring;
354 b) Fixed substring;
355 c) Whether we are anchored (beginning-of-line or \G);
356 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 357 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
358 string which does not contradict any of them.
359 */
2c2d71f5 360
6eb5f6b9
JH
361/* Most of decisions we do here should have been done at compile time.
362 The nodes of the REx which we used for the search should have been
363 deleted from the finite automaton. */
364
cad2e5aa
JH
365char *
366Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
367 char *strend, U32 flags, re_scream_pos_data *data)
368{
97aff369 369 dVAR;
b7953727 370 register I32 start_shift = 0;
cad2e5aa 371 /* Should be nonnegative! */
b7953727 372 register I32 end_shift = 0;
2c2d71f5
JH
373 register char *s;
374 register SV *check;
a1933d95 375 char *strbeg;
cad2e5aa 376 char *t;
a3b680e6 377 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 378 I32 ml_anch;
bd61b366
SS
379 register char *other_last = NULL; /* other substr checked before this */
380 char *check_at = NULL; /* check substr found at this pos */
1df70142 381 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 382#ifdef DEBUGGING
890ce7af
AL
383 const char * const i_strpos = strpos;
384 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 385#endif
a3621e74
YO
386
387 GET_RE_DEBUG_FLAGS_DECL;
388
a30b2f1f 389 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 390
b8d68ded 391 if (prog->reganch & ROPT_UTF8) {
a3621e74 392 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
393 "UTF-8 regex...\n"));
394 PL_reg_flags |= RF_utf8;
395 }
396
a3621e74 397 DEBUG_EXECUTE_r({
1df70142 398 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
399 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
400 strpos;
1df70142 401 const int len = PL_reg_match_utf8 ?
b8d68ded 402 strlen(s) : strend - strpos;
2a782b5b
JH
403 if (!PL_colorset)
404 reginitcolors();
b8d68ded 405 if (PL_reg_match_utf8)
a3621e74 406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 407 "UTF-8 target...\n"));
2a782b5b 408 PerlIO_printf(Perl_debug_log,
a0288114 409 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 410 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
411 prog->precomp,
412 PL_colors[1],
413 (strlen(prog->precomp) > 60 ? "..." : ""),
414 PL_colors[0],
415 (int)(len > 60 ? 60 : len),
416 s, PL_colors[1],
417 (len > 60 ? "..." : "")
418 );
419 });
cad2e5aa 420
c344f387
JH
421 /* CHR_DIST() would be more correct here but it makes things slow. */
422 if (prog->minlen > strend - strpos) {
a3621e74 423 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 424 "String too short... [re_intuit_start]\n"));
cad2e5aa 425 goto fail;
2c2d71f5 426 }
a1933d95 427 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 428 PL_regeol = strend;
33b8afdf
JH
429 if (do_utf8) {
430 if (!prog->check_utf8 && prog->check_substr)
431 to_utf8_substr(prog);
432 check = prog->check_utf8;
433 } else {
434 if (!prog->check_substr && prog->check_utf8)
435 to_byte_substr(prog);
436 check = prog->check_substr;
437 }
438 if (check == &PL_sv_undef) {
a3621e74 439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
440 "Non-utf string cannot match utf check string\n"));
441 goto fail;
442 }
2c2d71f5 443 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
444 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
445 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 446 && !multiline ) ); /* Check after \n? */
cad2e5aa 447
7e25d62c
JH
448 if (!ml_anch) {
449 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
450 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 451 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
452 && sv && !SvROK(sv)
453 && (strpos != strbeg)) {
a3621e74 454 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
455 goto fail;
456 }
457 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 458 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 459 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
460 I32 slen;
461
1aa99e6b 462 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
463 if (SvTAIL(check)) {
464 slen = SvCUR(check); /* >= 1 */
cad2e5aa 465
9041c2e3 466 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 467 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 468 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 469 goto fail_finish;
cad2e5aa
JH
470 }
471 /* Now should match s[0..slen-2] */
472 slen--;
3f7c398e 473 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 474 || (slen > 1
3f7c398e 475 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 476 report_neq:
a3621e74 477 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
478 goto fail_finish;
479 }
cad2e5aa 480 }
3f7c398e 481 else if (*SvPVX_const(check) != *s
653099ff 482 || ((slen = SvCUR(check)) > 1
3f7c398e 483 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 484 goto report_neq;
c315bfe8 485 check_at = s;
2c2d71f5 486 goto success_at_start;
7e25d62c 487 }
cad2e5aa 488 }
2c2d71f5 489 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 490 s = strpos;
2c2d71f5 491 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 492 end_shift = prog->minlen - start_shift -
653099ff 493 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 494 if (!ml_anch) {
a3b680e6 495 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 496 - (SvTAIL(check) != 0);
a3b680e6 497 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
498
499 if (end_shift < eshift)
500 end_shift = eshift;
501 }
cad2e5aa 502 }
2c2d71f5 503 else { /* Can match at random position */
cad2e5aa
JH
504 ml_anch = 0;
505 s = strpos;
2c2d71f5
JH
506 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
507 /* Should be nonnegative! */
508 end_shift = prog->minlen - start_shift -
653099ff 509 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
510 }
511
2c2d71f5 512#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 513 if (end_shift < 0)
6bbae5e6 514 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
515#endif
516
2c2d71f5
JH
517 restart:
518 /* Find a possible match in the region s..strend by looking for
519 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 520 if (flags & REXEC_SCREAM) {
cad2e5aa 521 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 522 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 523
2c2d71f5
JH
524 if (PL_screamfirst[BmRARE(check)] >= 0
525 || ( BmRARE(check) == '\n'
526 && (BmPREVIOUS(check) == SvCUR(check) - 1)
527 && SvTAIL(check) ))
9041c2e3 528 s = screaminstr(sv, check,
2c2d71f5 529 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 530 else
2c2d71f5 531 goto fail_finish;
4addbd3b
HS
532 /* we may be pointing at the wrong string */
533 if (s && RX_MATCH_COPIED(prog))
3f7c398e 534 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
535 if (data)
536 *data->scream_olds = s;
537 }
f33976b4 538 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
539 s = fbm_instr((U8*)(s + start_shift),
540 (U8*)(strend - end_shift),
7fba1cd6 541 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 542 else
1aa99e6b
IH
543 s = fbm_instr(HOP3(s, start_shift, strend),
544 HOP3(strend, -end_shift, strbeg),
7fba1cd6 545 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
546
547 /* Update the count-of-usability, remove useless subpatterns,
548 unshift s. */
2c2d71f5 549
a0288114 550 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 551 (s ? "Found" : "Did not find"),
33b8afdf 552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 553 PL_colors[0],
7b0972df 554 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 555 SvPVX_const(check),
2c2d71f5
JH
556 PL_colors[1], (SvTAIL(check) ? "$" : ""),
557 (s ? " at offset " : "...\n") ) );
558
559 if (!s)
560 goto fail_finish;
561
6eb5f6b9
JH
562 check_at = s;
563
2c2d71f5 564 /* Finish the diagnostic message */
a3621e74 565 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
566
567 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
568 Start with the other substr.
569 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 570 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
571 *always* match. Probably should be marked during compile...
572 Probably it is right to do no SCREAM here...
573 */
574
33b8afdf 575 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 576 /* Take into account the "other" substring. */
2c2d71f5
JH
577 /* XXXX May be hopelessly wrong for UTF... */
578 if (!other_last)
6eb5f6b9 579 other_last = strpos;
33b8afdf 580 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
581 do_other_anchored:
582 {
890ce7af
AL
583 char * const last = HOP3c(s, -start_shift, strbeg);
584 char *last1, *last2;
2c2d71f5 585 char *s1 = s;
33b8afdf 586 SV* must;
2c2d71f5 587
2c2d71f5
JH
588 t = s - prog->check_offset_max;
589 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 590 && (!do_utf8
1aa99e6b 591 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 592 && t > strpos)))
30944b6d 593 /* EMPTY */;
2c2d71f5
JH
594 else
595 t = strpos;
1aa99e6b 596 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
597 if (t < other_last) /* These positions already checked */
598 t = other_last;
1aa99e6b 599 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
600 if (last < last1)
601 last1 = last;
602 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
603 /* On end-of-str: see comment below. */
33b8afdf
JH
604 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
605 if (must == &PL_sv_undef) {
606 s = (char*)NULL;
a3621e74 607 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
608 }
609 else
610 s = fbm_instr(
611 (unsigned char*)t,
612 HOP3(HOP3(last1, prog->anchored_offset, strend)
613 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
614 must,
7fba1cd6 615 multiline ? FBMrf_MULTILINE : 0
33b8afdf 616 );
a3621e74 617 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 618 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
619 (s ? "Found" : "Contradicts"),
620 PL_colors[0],
33b8afdf
JH
621 (int)(SvCUR(must)
622 - (SvTAIL(must)!=0)),
3f7c398e 623 SvPVX_const(must),
33b8afdf 624 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
625 if (!s) {
626 if (last1 >= last2) {
a3621e74 627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
628 ", giving up...\n"));
629 goto fail_finish;
630 }
a3621e74 631 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 632 ", trying floating at offset %ld...\n",
1aa99e6b
IH
633 (long)(HOP3c(s1, 1, strend) - i_strpos)));
634 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
635 s = HOP3c(last, 1, strend);
2c2d71f5
JH
636 goto restart;
637 }
638 else {
a3621e74 639 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 640 (long)(s - i_strpos)));
1aa99e6b
IH
641 t = HOP3c(s, -prog->anchored_offset, strbeg);
642 other_last = HOP3c(s, 1, strend);
30944b6d 643 s = s1;
2c2d71f5
JH
644 if (t == strpos)
645 goto try_at_start;
2c2d71f5
JH
646 goto try_at_offset;
647 }
30944b6d 648 }
2c2d71f5
JH
649 }
650 else { /* Take into account the floating substring. */
33b8afdf
JH
651 char *last, *last1;
652 char *s1 = s;
653 SV* must;
654
655 t = HOP3c(s, -start_shift, strbeg);
656 last1 = last =
657 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
658 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
659 last = HOP3c(t, prog->float_max_offset, strend);
660 s = HOP3c(t, prog->float_min_offset, strend);
661 if (s < other_last)
662 s = other_last;
2c2d71f5 663 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
664 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
665 /* fbm_instr() takes into account exact value of end-of-str
666 if the check is SvTAIL(ed). Since false positives are OK,
667 and end-of-str is not later than strend we are OK. */
668 if (must == &PL_sv_undef) {
669 s = (char*)NULL;
a3621e74 670 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
671 }
672 else
2c2d71f5 673 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
674 (unsigned char*)last + SvCUR(must)
675 - (SvTAIL(must)!=0),
7fba1cd6 676 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 677 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
678 (s ? "Found" : "Contradicts"),
679 PL_colors[0],
680 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 681 SvPVX_const(must),
33b8afdf
JH
682 PL_colors[1], (SvTAIL(must) ? "$" : "")));
683 if (!s) {
684 if (last1 == last) {
a3621e74 685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
686 ", giving up...\n"));
687 goto fail_finish;
2c2d71f5 688 }
a3621e74 689 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
690 ", trying anchored starting at offset %ld...\n",
691 (long)(s1 + 1 - i_strpos)));
692 other_last = last;
693 s = HOP3c(t, 1, strend);
694 goto restart;
695 }
696 else {
a3621e74 697 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
698 (long)(s - i_strpos)));
699 other_last = s; /* Fix this later. --Hugo */
700 s = s1;
701 if (t == strpos)
702 goto try_at_start;
703 goto try_at_offset;
704 }
2c2d71f5 705 }
cad2e5aa 706 }
2c2d71f5
JH
707
708 t = s - prog->check_offset_max;
2c2d71f5 709 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 710 && (!do_utf8
1aa99e6b
IH
711 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
712 && t > strpos))) {
2c2d71f5
JH
713 /* Fixed substring is found far enough so that the match
714 cannot start at strpos. */
715 try_at_offset:
cad2e5aa 716 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
717 /* Eventually fbm_*() should handle this, but often
718 anchored_offset is not 0, so this check will not be wasted. */
719 /* XXXX In the code below we prefer to look for "^" even in
720 presence of anchored substrings. And we search even
721 beyond the found float position. These pessimizations
722 are historical artefacts only. */
723 find_anchor:
2c2d71f5 724 while (t < strend - prog->minlen) {
cad2e5aa 725 if (*t == '\n') {
4ee3650e 726 if (t < check_at - prog->check_offset_min) {
33b8afdf 727 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
728 /* Since we moved from the found position,
729 we definitely contradict the found anchored
30944b6d
IZ
730 substr. Due to the above check we do not
731 contradict "check" substr.
732 Thus we can arrive here only if check substr
733 is float. Redo checking for "other"=="fixed".
734 */
9041c2e3 735 strpos = t + 1;
a3621e74 736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 737 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
738 goto do_other_anchored;
739 }
4ee3650e
GS
740 /* We don't contradict the found floating substring. */
741 /* XXXX Why not check for STCLASS? */
cad2e5aa 742 s = t + 1;
a3621e74 743 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 744 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
745 goto set_useful;
746 }
4ee3650e
GS
747 /* Position contradicts check-string */
748 /* XXXX probably better to look for check-string
749 than for "\n", so one should lower the limit for t? */
a3621e74 750 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 751 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 752 other_last = strpos = s = t + 1;
cad2e5aa
JH
753 goto restart;
754 }
755 t++;
756 }
a3621e74 757 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 758 PL_colors[0], PL_colors[1]));
2c2d71f5 759 goto fail_finish;
cad2e5aa 760 }
f5952150 761 else {
a3621e74 762 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 763 PL_colors[0], PL_colors[1]));
f5952150 764 }
cad2e5aa
JH
765 s = t;
766 set_useful:
33b8afdf 767 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
768 }
769 else {
f5952150 770 /* The found string does not prohibit matching at strpos,
2c2d71f5 771 - no optimization of calling REx engine can be performed,
f5952150
GS
772 unless it was an MBOL and we are not after MBOL,
773 or a future STCLASS check will fail this. */
2c2d71f5
JH
774 try_at_start:
775 /* Even in this situation we may use MBOL flag if strpos is offset
776 wrt the start of the string. */
05b4157f 777 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 778 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
779 /* May be due to an implicit anchor of m{.*foo} */
780 && !(prog->reganch & ROPT_IMPLICIT))
781 {
cad2e5aa
JH
782 t = strpos;
783 goto find_anchor;
784 }
a3621e74 785 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 786 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 787 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 788 );
2c2d71f5 789 success_at_start:
30944b6d 790 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
791 && (do_utf8 ? (
792 prog->check_utf8 /* Could be deleted already */
793 && --BmUSEFUL(prog->check_utf8) < 0
794 && (prog->check_utf8 == prog->float_utf8)
795 ) : (
796 prog->check_substr /* Could be deleted already */
797 && --BmUSEFUL(prog->check_substr) < 0
798 && (prog->check_substr == prog->float_substr)
799 )))
66e933ab 800 {
cad2e5aa 801 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 802 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
803 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
804 if (do_utf8 ? prog->check_substr : prog->check_utf8)
805 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
806 prog->check_substr = prog->check_utf8 = NULL; /* disable */
807 prog->float_substr = prog->float_utf8 = NULL; /* clear */
808 check = NULL; /* abort */
cad2e5aa 809 s = strpos;
3cf5c195
IZ
810 /* XXXX This is a remnant of the old implementation. It
811 looks wasteful, since now INTUIT can use many
6eb5f6b9 812 other heuristics. */
cad2e5aa
JH
813 prog->reganch &= ~RE_USE_INTUIT;
814 }
815 else
816 s = strpos;
817 }
818
6eb5f6b9
JH
819 /* Last resort... */
820 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
821 if (prog->regstclass) {
822 /* minlen == 0 is possible if regstclass is \b or \B,
823 and the fixed substr is ''$.
824 Since minlen is already taken into account, s+1 is before strend;
825 accidentally, minlen >= 1 guaranties no false positives at s + 1
826 even for \b or \B. But (minlen? 1 : 0) below assumes that
827 regstclass does not come from lookahead... */
828 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
829 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 830 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 831 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 832 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 833 : 1);
a3b680e6 834 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 835 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 836 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
837 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
838 cl_l, strend)
839 : strend);
6eb5f6b9
JH
840
841 t = s;
3b0527fe 842 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
843 if (!s) {
844#ifdef DEBUGGING
cbbf8932 845 const char *what = NULL;
6eb5f6b9
JH
846#endif
847 if (endpos == strend) {
a3621e74 848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
849 "Could not match STCLASS...\n") );
850 goto fail;
851 }
a3621e74 852 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 853 "This position contradicts STCLASS...\n") );
653099ff
GS
854 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
855 goto fail;
6eb5f6b9 856 /* Contradict one of substrings */
33b8afdf
JH
857 if (prog->anchored_substr || prog->anchored_utf8) {
858 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 859 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 860 hop_and_restart:
1aa99e6b 861 s = HOP3c(t, 1, strend);
66e933ab
GS
862 if (s + start_shift + end_shift > strend) {
863 /* XXXX Should be taken into account earlier? */
a3621e74 864 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
865 "Could not match STCLASS...\n") );
866 goto fail;
867 }
5e39e1e5
HS
868 if (!check)
869 goto giveup;
a3621e74 870 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 871 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
872 what, (long)(s + start_shift - i_strpos)) );
873 goto restart;
874 }
66e933ab 875 /* Have both, check_string is floating */
6eb5f6b9
JH
876 if (t + start_shift >= check_at) /* Contradicts floating=check */
877 goto retry_floating_check;
878 /* Recheck anchored substring, but not floating... */
9041c2e3 879 s = check_at;
5e39e1e5
HS
880 if (!check)
881 goto giveup;
a3621e74 882 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 883 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
884 (long)(other_last - i_strpos)) );
885 goto do_other_anchored;
886 }
60e71179
GS
887 /* Another way we could have checked stclass at the
888 current position only: */
889 if (ml_anch) {
890 s = t = t + 1;
5e39e1e5
HS
891 if (!check)
892 goto giveup;
a3621e74 893 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 894 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 895 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 896 goto try_at_offset;
66e933ab 897 }
33b8afdf 898 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 899 goto fail;
6eb5f6b9
JH
900 /* Check is floating subtring. */
901 retry_floating_check:
902 t = check_at - start_shift;
a3621e74 903 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
904 goto hop_and_restart;
905 }
b7953727 906 if (t != s) {
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 908 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
909 (long)(t - i_strpos), (long)(s - i_strpos))
910 );
911 }
912 else {
a3621e74 913 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
914 "Does not contradict STCLASS...\n");
915 );
916 }
6eb5f6b9 917 }
5e39e1e5 918 giveup:
a3621e74 919 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
920 PL_colors[4], (check ? "Guessed" : "Giving up"),
921 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 922 return s;
2c2d71f5
JH
923
924 fail_finish: /* Substring not found */
33b8afdf
JH
925 if (prog->check_substr || prog->check_utf8) /* could be removed already */
926 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 927 fail:
a3621e74 928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 929 PL_colors[4], PL_colors[5]));
bd61b366 930 return NULL;
cad2e5aa 931}
9661b544 932
6eb5f6b9 933/* We know what class REx starts with. Try to find this position... */
3b0527fe
DM
934/* if reginfo is NULL, its a dryrun */
935
3c3eec57 936STATIC char *
3b0527fe
DM
937S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char
938*strend, const regmatch_info *reginfo)
a687059c 939{
27da23d5 940 dVAR;
1df70142 941 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 942 char *m;
d8093b23 943 STRLEN ln;
5dab1207 944 STRLEN lnc;
078c425b 945 register STRLEN uskip;
d8093b23
G
946 unsigned int c1;
947 unsigned int c2;
6eb5f6b9
JH
948 char *e;
949 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 950 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 951
6eb5f6b9
JH
952 /* We know what class it must start with. */
953 switch (OP(c)) {
6eb5f6b9 954 case ANYOF:
388cc4de 955 if (do_utf8) {
078c425b 956 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
957 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
958 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
959 reginclass(prog, c, (U8*)s, 0, do_utf8) :
960 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 961 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
962 goto got_it;
963 else
964 tmp = doevery;
965 }
966 else
967 tmp = 1;
078c425b 968 s += uskip;
388cc4de
HS
969 }
970 }
971 else {
972 while (s < strend) {
973 STRLEN skip = 1;
974
32fc9b6a 975 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
976 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
977 /* The assignment of 2 is intentional:
978 * for the folded sharp s, the skip is 2. */
979 (skip = SHARP_S_SKIP))) {
3b0527fe 980 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
981 goto got_it;
982 else
983 tmp = doevery;
984 }
985 else
986 tmp = 1;
987 s += skip;
988 }
a0d0e21e 989 }
6eb5f6b9 990 break;
f33976b4
DB
991 case CANY:
992 while (s < strend) {
3b0527fe 993 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
994 goto got_it;
995 else
996 tmp = doevery;
997 s++;
998 }
999 break;
6eb5f6b9 1000 case EXACTF:
5dab1207
NIS
1001 m = STRING(c);
1002 ln = STR_LEN(c); /* length to match in octets/bytes */
1003 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1004 if (UTF) {
a2a2844f 1005 STRLEN ulen1, ulen2;
5dab1207 1006 U8 *sm = (U8 *) m;
89ebb4a3
JH
1007 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1008 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1009 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1010
1011 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1012 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1013
89ebb4a3 1014 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1015 0, uniflags);
89ebb4a3 1016 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1017 0, uniflags);
5dab1207
NIS
1018 lnc = 0;
1019 while (sm < ((U8 *) m + ln)) {
1020 lnc++;
1021 sm += UTF8SKIP(sm);
1022 }
1aa99e6b
IH
1023 }
1024 else {
1025 c1 = *(U8*)m;
1026 c2 = PL_fold[c1];
1027 }
6eb5f6b9
JH
1028 goto do_exactf;
1029 case EXACTFL:
5dab1207
NIS
1030 m = STRING(c);
1031 ln = STR_LEN(c);
1032 lnc = (I32) ln;
d8093b23 1033 c1 = *(U8*)m;
6eb5f6b9
JH
1034 c2 = PL_fold_locale[c1];
1035 do_exactf:
db12adc6 1036 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1037
3b0527fe 1038 if (!reginfo && e < s)
6eb5f6b9 1039 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1040
60a8b682
JH
1041 /* The idea in the EXACTF* cases is to first find the
1042 * first character of the EXACTF* node and then, if
1043 * necessary, case-insensitively compare the full
1044 * text of the node. The c1 and c2 are the first
1045 * characters (though in Unicode it gets a bit
1046 * more complicated because there are more cases
7f16dd3d
JH
1047 * than just upper and lower: one needs to use
1048 * the so-called folding case for case-insensitive
1049 * matching (called "loose matching" in Unicode).
1050 * ibcmp_utf8() will do just that. */
60a8b682 1051
1aa99e6b 1052 if (do_utf8) {
575cac57 1053 UV c, f;
89ebb4a3 1054 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1055 STRLEN len, foldlen;
4ad0818d 1056 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1057 if (c1 == c2) {
5dab1207
NIS
1058 /* Upper and lower of 1st char are equal -
1059 * probably not a "letter". */
1aa99e6b 1060 while (s <= e) {
89ebb4a3 1061 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1062 uniflags);
80aecb99
JH
1063 if ( c == c1
1064 && (ln == len ||
66423254 1065 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1066 m, (char **)0, ln, (bool)UTF))
3b0527fe 1067 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1068 goto got_it;
80aecb99 1069 else {
1df70142 1070 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1071 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1072 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1073 if ( f != c
1074 && (f == c1 || f == c2)
1075 && (ln == foldlen ||
66423254
JH
1076 !ibcmp_utf8((char *) foldbuf,
1077 (char **)0, foldlen, do_utf8,
d07ddd77 1078 m,
eb160463 1079 (char **)0, ln, (bool)UTF))
3b0527fe 1080 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1081 goto got_it;
1082 }
1aa99e6b
IH
1083 s += len;
1084 }
09091399
JH
1085 }
1086 else {
1aa99e6b 1087 while (s <= e) {
89ebb4a3 1088 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1089 uniflags);
80aecb99 1090
60a8b682 1091 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1092 * Note that not all the possible combinations
1093 * are handled here: some of them are handled
1094 * by the standard folding rules, and some of
1095 * them (the character class or ANYOF cases)
1096 * are handled during compiletime in
1097 * regexec.c:S_regclass(). */
880bd946
JH
1098 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1099 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1100 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1101
1102 if ( (c == c1 || c == c2)
1103 && (ln == len ||
66423254 1104 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1105 m, (char **)0, ln, (bool)UTF))
3b0527fe 1106 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1107 goto got_it;
80aecb99 1108 else {
1df70142 1109 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1110 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1111 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1112 if ( f != c
1113 && (f == c1 || f == c2)
1114 && (ln == foldlen ||
a6872d42 1115 !ibcmp_utf8((char *) foldbuf,
66423254 1116 (char **)0, foldlen, do_utf8,
d07ddd77 1117 m,
eb160463 1118 (char **)0, ln, (bool)UTF))
3b0527fe 1119 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1120 goto got_it;
1121 }
1aa99e6b
IH
1122 s += len;
1123 }
09091399 1124 }
1aa99e6b
IH
1125 }
1126 else {
1127 if (c1 == c2)
1128 while (s <= e) {
1129 if ( *(U8*)s == c1
1130 && (ln == 1 || !(OP(c) == EXACTF
1131 ? ibcmp(s, m, ln)
1132 : ibcmp_locale(s, m, ln)))
3b0527fe 1133 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1134 goto got_it;
1135 s++;
1136 }
1137 else
1138 while (s <= e) {
1139 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1140 && (ln == 1 || !(OP(c) == EXACTF
1141 ? ibcmp(s, m, ln)
1142 : ibcmp_locale(s, m, ln)))
3b0527fe 1143 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1144 goto got_it;
1145 s++;
1146 }
b3c9acc1
IZ
1147 }
1148 break;
bbce6d69 1149 case BOUNDL:
3280af22 1150 PL_reg_flags |= RF_tainted;
bbce6d69 1151 /* FALL THROUGH */
a0d0e21e 1152 case BOUND:
ffc61ed2 1153 if (do_utf8) {
12d33761 1154 if (s == PL_bostr)
ffc61ed2
JH
1155 tmp = '\n';
1156 else {
6136c704 1157 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1158 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1159 }
1160 tmp = ((OP(c) == BOUND ?
9041c2e3 1161 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1162 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1163 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1164 if (tmp == !(OP(c) == BOUND ?
3568d838 1165 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1166 isALNUM_LC_utf8((U8*)s)))
1167 {
1168 tmp = !tmp;
3b0527fe 1169 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1170 goto got_it;
1171 }
078c425b 1172 s += uskip;
a687059c 1173 }
a0d0e21e 1174 }
667bb95a 1175 else {
12d33761 1176 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1177 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1178 while (s < strend) {
1179 if (tmp ==
1180 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1181 tmp = !tmp;
3b0527fe 1182 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1183 goto got_it;
1184 }
1185 s++;
a0ed51b3 1186 }
a0ed51b3 1187 }
3b0527fe 1188 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1189 goto got_it;
1190 break;
bbce6d69 1191 case NBOUNDL:
3280af22 1192 PL_reg_flags |= RF_tainted;
bbce6d69 1193 /* FALL THROUGH */
a0d0e21e 1194 case NBOUND:
ffc61ed2 1195 if (do_utf8) {
12d33761 1196 if (s == PL_bostr)
ffc61ed2
JH
1197 tmp = '\n';
1198 else {
6136c704 1199 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1200 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1201 }
1202 tmp = ((OP(c) == NBOUND ?
9041c2e3 1203 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1204 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1205 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1206 if (tmp == !(OP(c) == NBOUND ?
3568d838 1207 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1208 isALNUM_LC_utf8((U8*)s)))
1209 tmp = !tmp;
3b0527fe 1210 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1211 goto got_it;
078c425b 1212 s += uskip;
ffc61ed2 1213 }
a0d0e21e 1214 }
667bb95a 1215 else {
12d33761 1216 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1217 tmp = ((OP(c) == NBOUND ?
1218 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1219 while (s < strend) {
1220 if (tmp ==
1221 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1222 tmp = !tmp;
3b0527fe 1223 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1224 goto got_it;
1225 s++;
1226 }
a0ed51b3 1227 }
3b0527fe 1228 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1229 goto got_it;
1230 break;
a0d0e21e 1231 case ALNUM:
ffc61ed2 1232 if (do_utf8) {
1a4fad37 1233 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1234 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1235 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1236 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1237 goto got_it;
1238 else
1239 tmp = doevery;
1240 }
bbce6d69 1241 else
ffc61ed2 1242 tmp = 1;
078c425b 1243 s += uskip;
bbce6d69 1244 }
bbce6d69 1245 }
ffc61ed2
JH
1246 else {
1247 while (s < strend) {
1248 if (isALNUM(*s)) {
3b0527fe 1249 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
a0ed51b3 1254 else
ffc61ed2
JH
1255 tmp = 1;
1256 s++;
a0ed51b3 1257 }
a0ed51b3
LW
1258 }
1259 break;
bbce6d69 1260 case ALNUML:
3280af22 1261 PL_reg_flags |= RF_tainted;
ffc61ed2 1262 if (do_utf8) {
078c425b 1263 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1264 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1265 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1266 goto got_it;
1267 else
1268 tmp = doevery;
1269 }
a687059c 1270 else
ffc61ed2 1271 tmp = 1;
078c425b 1272 s += uskip;
a0d0e21e 1273 }
a0d0e21e 1274 }
ffc61ed2
JH
1275 else {
1276 while (s < strend) {
1277 if (isALNUM_LC(*s)) {
3b0527fe 1278 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
a0ed51b3 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s++;
a0ed51b3 1286 }
a0ed51b3
LW
1287 }
1288 break;
a0d0e21e 1289 case NALNUM:
ffc61ed2 1290 if (do_utf8) {
1a4fad37 1291 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1292 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1293 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1294 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1295 goto got_it;
1296 else
1297 tmp = doevery;
1298 }
bbce6d69 1299 else
ffc61ed2 1300 tmp = 1;
078c425b 1301 s += uskip;
bbce6d69 1302 }
bbce6d69 1303 }
ffc61ed2
JH
1304 else {
1305 while (s < strend) {
1306 if (!isALNUM(*s)) {
3b0527fe 1307 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a0ed51b3 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s++;
a0ed51b3 1315 }
a0ed51b3
LW
1316 }
1317 break;
bbce6d69 1318 case NALNUML:
3280af22 1319 PL_reg_flags |= RF_tainted;
ffc61ed2 1320 if (do_utf8) {
078c425b 1321 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1322 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1323 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1324 goto got_it;
1325 else
1326 tmp = doevery;
1327 }
a687059c 1328 else
ffc61ed2 1329 tmp = 1;
078c425b 1330 s += uskip;
a687059c 1331 }
a0d0e21e 1332 }
ffc61ed2
JH
1333 else {
1334 while (s < strend) {
1335 if (!isALNUM_LC(*s)) {
3b0527fe 1336 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0ed51b3 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s++;
a0ed51b3 1344 }
a0ed51b3
LW
1345 }
1346 break;
a0d0e21e 1347 case SPACE:
ffc61ed2 1348 if (do_utf8) {
1a4fad37 1349 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1350 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1351 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1352 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1353 goto got_it;
1354 else
1355 tmp = doevery;
1356 }
a0d0e21e 1357 else
ffc61ed2 1358 tmp = 1;
078c425b 1359 s += uskip;
2304df62 1360 }
a0d0e21e 1361 }
ffc61ed2
JH
1362 else {
1363 while (s < strend) {
1364 if (isSPACE(*s)) {
3b0527fe 1365 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
a0ed51b3 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s++;
a0ed51b3 1373 }
a0ed51b3
LW
1374 }
1375 break;
bbce6d69 1376 case SPACEL:
3280af22 1377 PL_reg_flags |= RF_tainted;
ffc61ed2 1378 if (do_utf8) {
078c425b 1379 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1380 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1381 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1382 goto got_it;
1383 else
1384 tmp = doevery;
1385 }
bbce6d69 1386 else
ffc61ed2 1387 tmp = 1;
078c425b 1388 s += uskip;
bbce6d69 1389 }
bbce6d69 1390 }
ffc61ed2
JH
1391 else {
1392 while (s < strend) {
1393 if (isSPACE_LC(*s)) {
3b0527fe 1394 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0ed51b3 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s++;
a0ed51b3 1402 }
a0ed51b3
LW
1403 }
1404 break;
a0d0e21e 1405 case NSPACE:
ffc61ed2 1406 if (do_utf8) {
1a4fad37 1407 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1408 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1409 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1410 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1411 goto got_it;
1412 else
1413 tmp = doevery;
1414 }
a0d0e21e 1415 else
ffc61ed2 1416 tmp = 1;
078c425b 1417 s += uskip;
a687059c 1418 }
a0d0e21e 1419 }
ffc61ed2
JH
1420 else {
1421 while (s < strend) {
1422 if (!isSPACE(*s)) {
3b0527fe 1423 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
a0ed51b3 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s++;
a0ed51b3 1431 }
a0ed51b3
LW
1432 }
1433 break;
bbce6d69 1434 case NSPACEL:
3280af22 1435 PL_reg_flags |= RF_tainted;
ffc61ed2 1436 if (do_utf8) {
078c425b 1437 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1438 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1439 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1440 goto got_it;
1441 else
1442 tmp = doevery;
1443 }
bbce6d69 1444 else
ffc61ed2 1445 tmp = 1;
078c425b 1446 s += uskip;
bbce6d69 1447 }
bbce6d69 1448 }
ffc61ed2
JH
1449 else {
1450 while (s < strend) {
1451 if (!isSPACE_LC(*s)) {
3b0527fe 1452 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0ed51b3 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s++;
a0ed51b3 1460 }
a0ed51b3
LW
1461 }
1462 break;
a0d0e21e 1463 case DIGIT:
ffc61ed2 1464 if (do_utf8) {
1a4fad37 1465 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1466 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1467 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1468 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1469 goto got_it;
1470 else
1471 tmp = doevery;
1472 }
a0d0e21e 1473 else
ffc61ed2 1474 tmp = 1;
078c425b 1475 s += uskip;
2b69d0c2 1476 }
a0d0e21e 1477 }
ffc61ed2
JH
1478 else {
1479 while (s < strend) {
1480 if (isDIGIT(*s)) {
3b0527fe 1481 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
a0ed51b3 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s++;
a0ed51b3 1489 }
a0ed51b3
LW
1490 }
1491 break;
b8c5462f
JH
1492 case DIGITL:
1493 PL_reg_flags |= RF_tainted;
ffc61ed2 1494 if (do_utf8) {
078c425b 1495 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1496 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1497 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1498 goto got_it;
1499 else
1500 tmp = doevery;
1501 }
b8c5462f 1502 else
ffc61ed2 1503 tmp = 1;
078c425b 1504 s += uskip;
b8c5462f 1505 }
b8c5462f 1506 }
ffc61ed2
JH
1507 else {
1508 while (s < strend) {
1509 if (isDIGIT_LC(*s)) {
3b0527fe 1510 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
b8c5462f 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s++;
b8c5462f 1518 }
b8c5462f
JH
1519 }
1520 break;
a0d0e21e 1521 case NDIGIT:
ffc61ed2 1522 if (do_utf8) {
1a4fad37 1523 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1524 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1525 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1526 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1527 goto got_it;
1528 else
1529 tmp = doevery;
1530 }
a0d0e21e 1531 else
ffc61ed2 1532 tmp = 1;
078c425b 1533 s += uskip;
a687059c 1534 }
a0d0e21e 1535 }
ffc61ed2
JH
1536 else {
1537 while (s < strend) {
1538 if (!isDIGIT(*s)) {
3b0527fe 1539 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
a0ed51b3 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s++;
a0ed51b3 1547 }
a0ed51b3
LW
1548 }
1549 break;
b8c5462f
JH
1550 case NDIGITL:
1551 PL_reg_flags |= RF_tainted;
ffc61ed2 1552 if (do_utf8) {
078c425b 1553 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1554 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1555 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1556 goto got_it;
1557 else
1558 tmp = doevery;
1559 }
b8c5462f 1560 else
ffc61ed2 1561 tmp = 1;
078c425b 1562 s += uskip;
b8c5462f 1563 }
a0ed51b3 1564 }
ffc61ed2
JH
1565 else {
1566 while (s < strend) {
1567 if (!isDIGIT_LC(*s)) {
3b0527fe 1568 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1569 goto got_it;
1570 else
1571 tmp = doevery;
1572 }
cf93c79d 1573 else
ffc61ed2
JH
1574 tmp = 1;
1575 s++;
b8c5462f 1576 }
b8c5462f
JH
1577 }
1578 break;
b3c9acc1 1579 default:
3c3eec57
GS
1580 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1581 break;
d6a28714 1582 }
6eb5f6b9
JH
1583 return 0;
1584 got_it:
1585 return s;
1586}
1587
1588/*
1589 - regexec_flags - match a regexp against a string
1590 */
1591I32
1592Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1593 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1594/* strend: pointer to null at end of string */
1595/* strbeg: real beginning of string */
1596/* minend: end of match must be >=minend after stringarg. */
1597/* data: May be used for some additional optimizations. */
1598/* nosave: For optimizations. */
1599{
97aff369 1600 dVAR;
6eb5f6b9
JH
1601 register char *s;
1602 register regnode *c;
1603 register char *startpos = stringarg;
6eb5f6b9
JH
1604 I32 minlen; /* must match at least this many chars */
1605 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1606 I32 end_shift = 0; /* Same for the end. */ /* CC */
1607 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1608 char *scream_olds = NULL;
6eb5f6b9 1609 SV* oreplsv = GvSV(PL_replgv);
1df70142 1610 const bool do_utf8 = DO_UTF8(sv);
2757e526 1611 I32 multiline;
2a782b5b 1612#ifdef DEBUGGING
2757e526
JH
1613 SV* dsv0;
1614 SV* dsv1;
2a782b5b 1615#endif
3b0527fe 1616 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1617
1618 GET_RE_DEBUG_FLAGS_DECL;
1619
9d4ba2ae 1620 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1621
1622 /* Be paranoid... */
1623 if (prog == NULL || startpos == NULL) {
1624 Perl_croak(aTHX_ "NULL regexp parameter");
1625 return 0;
1626 }
1627
2757e526 1628 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1629 reginfo.prog = prog;
2757e526
JH
1630
1631#ifdef DEBUGGING
1632 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1633 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1634#endif
1635
bac06658
JH
1636 RX_MATCH_UTF8_set(prog, do_utf8);
1637
6eb5f6b9 1638 minlen = prog->minlen;
61a36c01 1639 if (strend - startpos < minlen) {
a3621e74 1640 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1641 "String too short [regexec_flags]...\n"));
1642 goto phooey;
1aa99e6b 1643 }
6eb5f6b9 1644
6eb5f6b9
JH
1645 /* Check validity of program. */
1646 if (UCHARAT(prog->program) != REG_MAGIC) {
1647 Perl_croak(aTHX_ "corrupted regexp program");
1648 }
1649
1650 PL_reg_flags = 0;
1651 PL_reg_eval_set = 0;
1652 PL_reg_maxiter = 0;
1653
1654 if (prog->reganch & ROPT_UTF8)
1655 PL_reg_flags |= RF_utf8;
1656
1657 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1658 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1659 PL_bostr = strbeg;
3b0527fe 1660 reginfo.sv = sv;
6eb5f6b9
JH
1661
1662 /* Mark end of line for $ (and such) */
1663 PL_regeol = strend;
1664
1665 /* see how far we have to get to not match where we matched before */
3b0527fe 1666 reginfo.till = startpos+minend;
6eb5f6b9 1667
6eb5f6b9
JH
1668 /* If there is a "must appear" string, look for it. */
1669 s = startpos;
1670
3b0527fe 1671 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1672 MAGIC *mg;
1673
1674 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1675 reginfo.ganch = startpos;
6eb5f6b9
JH
1676 else if (sv && SvTYPE(sv) >= SVt_PVMG
1677 && SvMAGIC(sv)
14befaf4
DM
1678 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1679 && mg->mg_len >= 0) {
3b0527fe 1680 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1681 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1682 if (s > reginfo.ganch)
6eb5f6b9 1683 goto phooey;
3b0527fe 1684 s = reginfo.ganch;
6eb5f6b9
JH
1685 }
1686 }
1687 else /* pos() not defined */
3b0527fe 1688 reginfo.ganch = strbeg;
6eb5f6b9
JH
1689 }
1690
a0714e2c 1691 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1692 re_scream_pos_data d;
1693
1694 d.scream_olds = &scream_olds;
1695 d.scream_pos = &scream_pos;
1696 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1697 if (!s) {
a3621e74 1698 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1699 goto phooey; /* not present */
3fa9c3d7 1700 }
6eb5f6b9
JH
1701 }
1702
a3621e74 1703 DEBUG_EXECUTE_r({
1df70142
AL
1704 const char * const s0 = UTF
1705 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1706 UNI_DISPLAY_REGEX)
1707 : prog->precomp;
1708 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1709 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1710 UNI_DISPLAY_REGEX) : startpos;
1df70142 1711 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1712 if (!PL_colorset)
1713 reginitcolors();
1714 PerlIO_printf(Perl_debug_log,
a0288114 1715 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1716 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1717 len0, len0, s0,
2a782b5b 1718 PL_colors[1],
9e55ce06 1719 len0 > 60 ? "..." : "",
2a782b5b 1720 PL_colors[0],
9e55ce06
JH
1721 (int)(len1 > 60 ? 60 : len1),
1722 s1, PL_colors[1],
1723 (len1 > 60 ? "..." : "")
2a782b5b
JH
1724 );
1725 });
6eb5f6b9
JH
1726
1727 /* Simplest case: anchored match need be tried only once. */
1728 /* [unless only anchor is BOL and multiline is set] */
1729 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1730 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1731 goto got_it;
7fba1cd6 1732 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1733 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1734 {
1735 char *end;
1736
1737 if (minlen)
1738 dontbother = minlen - 1;
1aa99e6b 1739 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1740 /* for multiline we only have to try after newlines */
33b8afdf 1741 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1742 if (s == startpos)
1743 goto after_try;
1744 while (1) {
3b0527fe 1745 if (regtry(&reginfo, s))
6eb5f6b9
JH
1746 goto got_it;
1747 after_try:
1748 if (s >= end)
1749 goto phooey;
1750 if (prog->reganch & RE_USE_INTUIT) {
1751 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1752 if (!s)
1753 goto phooey;
1754 }
1755 else
1756 s++;
1757 }
1758 } else {
1759 if (s > startpos)
1760 s--;
1761 while (s < end) {
1762 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1763 if (regtry(&reginfo, s))
6eb5f6b9
JH
1764 goto got_it;
1765 }
1766 }
1767 }
1768 }
1769 goto phooey;
1770 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1771 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1772 goto got_it;
1773 goto phooey;
1774 }
1775
1776 /* Messy cases: unanchored match. */
33b8afdf 1777 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1778 /* we have /x+whatever/ */
1779 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1780 char ch;
bf93d4cc
GS
1781#ifdef DEBUGGING
1782 int did_match = 0;
1783#endif
33b8afdf
JH
1784 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1785 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1786 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1787
1aa99e6b 1788 if (do_utf8) {
6eb5f6b9
JH
1789 while (s < strend) {
1790 if (*s == ch) {
a3621e74 1791 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1792 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1793 s += UTF8SKIP(s);
1794 while (s < strend && *s == ch)
1795 s += UTF8SKIP(s);
1796 }
1797 s += UTF8SKIP(s);
1798 }
1799 }
1800 else {
1801 while (s < strend) {
1802 if (*s == ch) {
a3621e74 1803 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1804 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1805 s++;
1806 while (s < strend && *s == ch)
1807 s++;
1808 }
1809 s++;
1810 }
1811 }
a3621e74 1812 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1813 PerlIO_printf(Perl_debug_log,
b7953727
JH
1814 "Did not find anchored character...\n")
1815 );
6eb5f6b9 1816 }
a0714e2c
SS
1817 else if (prog->anchored_substr != NULL
1818 || prog->anchored_utf8 != NULL
1819 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1820 && prog->float_max_offset < strend - s)) {
1821 SV *must;
1822 I32 back_max;
1823 I32 back_min;
1824 char *last;
6eb5f6b9 1825 char *last1; /* Last position checked before */
bf93d4cc
GS
1826#ifdef DEBUGGING
1827 int did_match = 0;
1828#endif
33b8afdf
JH
1829 if (prog->anchored_substr || prog->anchored_utf8) {
1830 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1831 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1832 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1833 back_max = back_min = prog->anchored_offset;
1834 } else {
1835 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1836 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1837 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1838 back_max = prog->float_max_offset;
1839 back_min = prog->float_min_offset;
1840 }
1841 if (must == &PL_sv_undef)
1842 /* could not downgrade utf8 check substring, so must fail */
1843 goto phooey;
1844
1845 last = HOP3c(strend, /* Cannot start after this */
1846 -(I32)(CHR_SVLEN(must)
1847 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1848
1849 if (s > PL_bostr)
1850 last1 = HOPc(s, -1);
1851 else
1852 last1 = s - 1; /* bogus */
1853
a0288114 1854 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1855 check_substr==must. */
1856 scream_pos = -1;
1857 dontbother = end_shift;
1858 strend = HOPc(strend, -dontbother);
1859 while ( (s <= last) &&
9041c2e3 1860 ((flags & REXEC_SCREAM)
1aa99e6b 1861 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1862 end_shift, &scream_pos, 0))
1aa99e6b 1863 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1864 (unsigned char*)strend, must,
7fba1cd6 1865 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1866 /* we may be pointing at the wrong string */
1867 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1868 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1869 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1870 if (HOPc(s, -back_max) > last1) {
1871 last1 = HOPc(s, -back_min);
1872 s = HOPc(s, -back_max);
1873 }
1874 else {
52657f30 1875 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1876
1877 last1 = HOPc(s, -back_min);
52657f30 1878 s = t;
6eb5f6b9 1879 }
1aa99e6b 1880 if (do_utf8) {
6eb5f6b9 1881 while (s <= last1) {
3b0527fe 1882 if (regtry(&reginfo, s))
6eb5f6b9
JH
1883 goto got_it;
1884 s += UTF8SKIP(s);
1885 }
1886 }
1887 else {
1888 while (s <= last1) {
3b0527fe 1889 if (regtry(&reginfo, s))
6eb5f6b9
JH
1890 goto got_it;
1891 s++;
1892 }
1893 }
1894 }
a3621e74 1895 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1896 PerlIO_printf(Perl_debug_log,
a0288114 1897 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1898 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1899 ? "anchored" : "floating"),
1900 PL_colors[0],
1901 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1902 SvPVX_const(must),
b7953727
JH
1903 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1904 );
6eb5f6b9
JH
1905 goto phooey;
1906 }
155aba94 1907 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1908 if (minlen) {
1909 I32 op = (U8)OP(prog->regstclass);
66e933ab 1910 /* don't bother with what can't match */
f14c76ed
RGS
1911 if (PL_regkind[op] != EXACT && op != CANY)
1912 strend = HOPc(strend, -(minlen - 1));
1913 }
a3621e74 1914 DEBUG_EXECUTE_r({
ffc61ed2 1915 SV *prop = sv_newmortal();
cfd0369c
NC
1916 const char *s0;
1917 const char *s1;
9e55ce06
JH
1918 int len0;
1919 int len1;
1920
32fc9b6a 1921 regprop(prog, prop, c);
9e55ce06 1922 s0 = UTF ?
3f7c398e 1923 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1924 UNI_DISPLAY_REGEX) :
cfd0369c 1925 SvPVX_const(prop);
9e55ce06
JH
1926 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1927 s1 = UTF ?
c728cb41 1928 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1929 len1 = UTF ? SvCUR(dsv1) : strend - s;
1930 PerlIO_printf(Perl_debug_log,
a0288114 1931 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1932 len0, len0, s0,
1933 len1, len1, s1);
ffc61ed2 1934 });
3b0527fe 1935 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1936 goto got_it;
a3621e74 1937 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1938 }
1939 else {
1940 dontbother = 0;
a0714e2c 1941 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1942 /* Trim the end. */
d6a28714 1943 char *last;
33b8afdf
JH
1944 SV* float_real;
1945
1946 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1947 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1948 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1949
1950 if (flags & REXEC_SCREAM) {
33b8afdf 1951 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1952 end_shift, &scream_pos, 1); /* last one */
1953 if (!last)
ffc61ed2 1954 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1955 /* we may be pointing at the wrong string */
1956 else if (RX_MATCH_COPIED(prog))
3f7c398e 1957 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1958 }
d6a28714
JH
1959 else {
1960 STRLEN len;
cfd0369c 1961 const char * const little = SvPV_const(float_real, len);
d6a28714 1962
33b8afdf 1963 if (SvTAIL(float_real)) {
d6a28714
JH
1964 if (memEQ(strend - len + 1, little, len - 1))
1965 last = strend - len + 1;
7fba1cd6 1966 else if (!multiline)
9041c2e3 1967 last = memEQ(strend - len, little, len)
bd61b366 1968 ? strend - len : NULL;
b8c5462f 1969 else
d6a28714
JH
1970 goto find_last;
1971 } else {
1972 find_last:
9041c2e3 1973 if (len)
d6a28714 1974 last = rninstr(s, strend, little, little + len);
b8c5462f 1975 else
a0288114 1976 last = strend; /* matching "$" */
b8c5462f 1977 }
b8c5462f 1978 }
bf93d4cc 1979 if (last == NULL) {
a3621e74 1980 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1981 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1982 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1983 goto phooey; /* Should not happen! */
1984 }
d6a28714
JH
1985 dontbother = strend - last + prog->float_min_offset;
1986 }
1987 if (minlen && (dontbother < minlen))
1988 dontbother = minlen - 1;
1989 strend -= dontbother; /* this one's always in bytes! */
1990 /* We don't know much -- general case. */
1aa99e6b 1991 if (do_utf8) {
d6a28714 1992 for (;;) {
3b0527fe 1993 if (regtry(&reginfo, s))
d6a28714
JH
1994 goto got_it;
1995 if (s >= strend)
1996 break;
b8c5462f 1997 s += UTF8SKIP(s);
d6a28714
JH
1998 };
1999 }
2000 else {
2001 do {
3b0527fe 2002 if (regtry(&reginfo, s))
d6a28714
JH
2003 goto got_it;
2004 } while (s++ < strend);
2005 }
2006 }
2007
2008 /* Failure. */
2009 goto phooey;
2010
2011got_it:
2012 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2013
2014 if (PL_reg_eval_set) {
2015 /* Preserve the current value of $^R */
2016 if (oreplsv != GvSV(PL_replgv))
2017 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2018 restored, the value remains
2019 the same. */
4f639d21 2020 restore_pos(aTHX_ prog);
d6a28714
JH
2021 }
2022
2023 /* make sure $`, $&, $', and $digit will work later */
2024 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2025 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2026 if (flags & REXEC_COPY_STR) {
2027 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2028#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2029 if ((SvIsCOW(sv)
2030 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2031 if (DEBUG_C_TEST) {
2032 PerlIO_printf(Perl_debug_log,
2033 "Copy on write: regexp capture, type %d\n",
2034 (int) SvTYPE(sv));
2035 }
2036 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2037 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2038 assert (SvPOKp(prog->saved_copy));
2039 } else
2040#endif
2041 {
2042 RX_MATCH_COPIED_on(prog);
2043 s = savepvn(strbeg, i);
2044 prog->subbeg = s;
2045 }
d6a28714 2046 prog->sublen = i;
d6a28714
JH
2047 }
2048 else {
2049 prog->subbeg = strbeg;
2050 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2051 }
2052 }
9041c2e3 2053
d6a28714
JH
2054 return 1;
2055
2056phooey:
a3621e74 2057 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2058 PL_colors[4], PL_colors[5]));
d6a28714 2059 if (PL_reg_eval_set)
4f639d21 2060 restore_pos(aTHX_ prog);
d6a28714
JH
2061 return 0;
2062}
2063
2064/*
2065 - regtry - try match at specific point
2066 */
2067STATIC I32 /* 0 failure, 1 success */
3b0527fe 2068S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2069{
97aff369 2070 dVAR;
d6a28714
JH
2071 register I32 *sp;
2072 register I32 *ep;
2073 CHECKPOINT lastcp;
3b0527fe 2074 regexp *prog = reginfo->prog;
a3621e74 2075 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2076
02db2b7b
IZ
2077#ifdef DEBUGGING
2078 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2079#endif
d6a28714
JH
2080 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2081 MAGIC *mg;
2082
2083 PL_reg_eval_set = RS_init;
a3621e74 2084 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2085 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2086 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2087 ));
e8347627 2088 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2089 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2090 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2091 SAVETMPS;
2092 /* Apparently this is not needed, judging by wantarray. */
e8347627 2093 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2094 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2095
3b0527fe 2096 if (reginfo->sv) {
d6a28714 2097 /* Make $_ available to executed code. */
3b0527fe 2098 if (reginfo->sv != DEFSV) {
59f00321 2099 SAVE_DEFSV;
3b0527fe 2100 DEFSV = reginfo->sv;
b8c5462f 2101 }
d6a28714 2102
3b0527fe
DM
2103 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2104 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2105 /* prepare for quick setting of pos */
d300d9fa
NC
2106#ifdef PERL_OLD_COPY_ON_WRITE
2107 if (SvIsCOW(sv))
2108 sv_force_normal_flags(sv, 0);
2109#endif
3b0527fe 2110 mg = sv_magicext(reginfo->sv, (SV*)0, PERL_MAGIC_regex_global,
d300d9fa 2111 &PL_vtbl_mglob, NULL, 0);
d6a28714 2112 mg->mg_len = -1;
b8c5462f 2113 }
d6a28714
JH
2114 PL_reg_magic = mg;
2115 PL_reg_oldpos = mg->mg_len;
4f639d21 2116 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2117 }
09687e5a 2118 if (!PL_reg_curpm) {
a02a5408 2119 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2120#ifdef USE_ITHREADS
2121 {
2122 SV* repointer = newSViv(0);
577e12cc 2123 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2124 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2125 av_push(PL_regex_padav,repointer);
2126 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2127 PL_regex_pad = AvARRAY(PL_regex_padav);
2128 }
2129#endif
2130 }
aaa362c4 2131 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2132 PL_reg_oldcurpm = PL_curpm;
2133 PL_curpm = PL_reg_curpm;
2134 if (RX_MATCH_COPIED(prog)) {
2135 /* Here is a serious problem: we cannot rewrite subbeg,
2136 since it may be needed if this match fails. Thus
2137 $` inside (?{}) could fail... */
2138 PL_reg_oldsaved = prog->subbeg;
2139 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2140#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2141 PL_nrs = prog->saved_copy;
2142#endif
d6a28714
JH
2143 RX_MATCH_COPIED_off(prog);
2144 }
2145 else
bd61b366 2146 PL_reg_oldsaved = NULL;
d6a28714
JH
2147 prog->subbeg = PL_bostr;
2148 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2149 }
973dddac 2150 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2151 PL_reginput = startpos;
2152 PL_regstartp = prog->startp;
2153 PL_regendp = prog->endp;
2154 PL_reglastparen = &prog->lastparen;
a01268b5 2155 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2156 prog->lastparen = 0;
03994de8 2157 prog->lastcloseparen = 0;
d6a28714 2158 PL_regsize = 0;
a3621e74 2159 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2160 if (PL_reg_start_tmpl <= prog->nparens) {
2161 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2162 if(PL_reg_start_tmp)
2163 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2164 else
a02a5408 2165 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2166 }
2167
2168 /* XXXX What this code is doing here?!!! There should be no need
2169 to do this again and again, PL_reglastparen should take care of
3dd2943c 2170 this! --ilya*/
dafc8851
JH
2171
2172 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2173 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2174 * PL_reglastparen), is not needed at all by the test suite
2175 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2176 * enough, for building DynaLoader, or otherwise this
2177 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2178 * will happen. Meanwhile, this code *is* needed for the
2179 * above-mentioned test suite tests to succeed. The common theme
2180 * on those tests seems to be returning null fields from matches.
2181 * --jhi */
dafc8851 2182#if 1
d6a28714
JH
2183 sp = prog->startp;
2184 ep = prog->endp;
2185 if (prog->nparens) {
097eb12c 2186 register I32 i;
eb160463 2187 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2188 *++sp = -1;
2189 *++ep = -1;
2190 }
2191 }
dafc8851 2192#endif
02db2b7b 2193 REGCP_SET(lastcp);
3b0527fe 2194 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2195 prog->endp[0] = PL_reginput - PL_bostr;
2196 return 1;
2197 }
02db2b7b 2198 REGCP_UNWIND(lastcp);
d6a28714
JH
2199 return 0;
2200}
2201
02db2b7b
IZ
2202#define RE_UNWIND_BRANCH 1
2203#define RE_UNWIND_BRANCHJ 2
2204
2205union re_unwind_t;
2206
2207typedef struct { /* XX: makes sense to enlarge it... */
2208 I32 type;
2209 I32 prev;
2210 CHECKPOINT lastcp;
2211} re_unwind_generic_t;
2212
2213typedef struct {
2214 I32 type;
2215 I32 prev;
2216 CHECKPOINT lastcp;
2217 I32 lastparen;
2218 regnode *next;
2219 char *locinput;
2220 I32 nextchr;
3a2830be 2221 int minmod;
02db2b7b
IZ
2222#ifdef DEBUGGING
2223 int regindent;
2224#endif
2225} re_unwind_branch_t;
2226
2227typedef union re_unwind_t {
2228 I32 type;
2229 re_unwind_generic_t generic;
2230 re_unwind_branch_t branch;
2231} re_unwind_t;
2232
8ba1375e
MJD
2233#define sayYES goto yes
2234#define sayNO goto no
e0f9d4a8 2235#define sayNO_ANYOF goto no_anyof
8ba1375e 2236#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2237#define sayNO_FINAL goto no_final
2238#define sayNO_SILENT goto do_no
2239#define saySAME(x) if (x) goto yes; else goto no
2240
3ab3c9b4
HS
2241#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2242#define POSCACHE_SEEN 1 /* we know what we're caching */
2243#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2244
3ab3c9b4 2245#define CACHEsayYES STMT_START { \
d8319b27 2246 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2247 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2248 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2249 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2250 } \
2251 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2252 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2253 } \
2254 else { \
3ab3c9b4
HS
2255 /* cache records failure, but this is success */ \
2256 DEBUG_r( \
2257 PerlIO_printf(Perl_debug_log, \
2258 "%*s (remove success from failure cache)\n", \
2259 REPORT_CODE_OFF+PL_regindent*2, "") \
2260 ); \
d8319b27 2261 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2262 } \
2263 } \
2264 sayYES; \
2265} STMT_END
7409bbd3 2266
3ab3c9b4 2267#define CACHEsayNO STMT_START { \
d8319b27 2268 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2269 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2270 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2271 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2272 } \
2273 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2274 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2275 } \
2276 else { \
3ab3c9b4
HS
2277 /* cache records success, but this is failure */ \
2278 DEBUG_r( \
2279 PerlIO_printf(Perl_debug_log, \
2280 "%*s (remove failure from success cache)\n", \
2281 REPORT_CODE_OFF+PL_regindent*2, "") \
2282 ); \
d8319b27 2283 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2284 } \
2285 } \
2286 sayNO; \
2287} STMT_END
2288
a3621e74
YO
2289/* this is used to determine how far from the left messages like
2290 'failed...' are printed. Currently 29 makes these messages line
2291 up with the opcode they refer to. Earlier perls used 25 which
2292 left these messages outdented making reviewing a debug output
2293 quite difficult.
2294*/
2295#define REPORT_CODE_OFF 29
2296
2297
2298/* Make sure there is a test for this +1 options in re_tests */
2299#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2300
86545054
DM
2301#define SLAB_FIRST(s) (&(s)->states[0])
2302#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2303
5d9a96ca
DM
2304/* grab a new slab and return the first slot in it */
2305
2306STATIC regmatch_state *
2307S_push_slab(pTHX)
2308{
2309 regmatch_slab *s = PL_regmatch_slab->next;
2310 if (!s) {
2311 Newx(s, 1, regmatch_slab);
2312 s->prev = PL_regmatch_slab;
2313 s->next = NULL;
2314 PL_regmatch_slab->next = s;
2315 }
2316 PL_regmatch_slab = s;
86545054 2317 return SLAB_FIRST(s);
5d9a96ca 2318}
5b47454d 2319
95b24440
DM
2320/* simulate a recursive call to regmatch */
2321
2322#define REGMATCH(ns, where) \
5d9a96ca
DM
2323 st->scan = scan; \
2324 scan = (ns); \
2325 st->resume_state = resume_##where; \
95b24440
DM
2326 goto start_recurse; \
2327 resume_point_##where:
2328
aa283a38
DM
2329
2330/* push a new regex state. Set newst to point to it */
2331
2332#define PUSH_STATE(newst, resume) \
2333 depth++; \
2334 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2335 st->scan = scan; \
2336 st->next = next; \
2337 st->n = n; \
2338 st->locinput = locinput; \
2339 st->resume_state = resume; \
2340 newst = st+1; \
86545054 2341 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2342 newst = S_push_slab(aTHX); \
2343 PL_regmatch_state = newst; \
2344 newst->cc = 0; \
2345 newst->minmod = 0; \
2346 newst->sw = 0; \
2347 newst->logical = 0; \
2348 newst->unwind = 0; \
2349 locinput = PL_reginput; \
2350 nextchr = UCHARAT(locinput);
2351
2352#define POP_STATE \
2353 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2354 depth--; \
2355 st--; \
86545054 2356 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2357 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2358 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2359 } \
2360 PL_regmatch_state = st; \
2361 scan = st->scan; \
2362 next = st->next; \
2363 n = st->n; \
2364 locinput = st->locinput; \
2365 nextchr = UCHARAT(locinput);
2366
d6a28714
JH
2367/*
2368 - regmatch - main matching routine
2369 *
2370 * Conceptually the strategy is simple: check to see whether the current
2371 * node matches, call self recursively to see whether the rest matches,
2372 * and then act accordingly. In practice we make some effort to avoid
2373 * recursion, in particular by going through "ordinary" nodes (that don't
2374 * need to know whether the rest of the match failed) by a loop instead of
2375 * by recursion.
2376 */
2377/* [lwall] I've hoisted the register declarations to the outer block in order to
2378 * maybe save a little bit of pushing and popping on the stack. It also takes
2379 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2380 *
2381 * This function used to be heavily recursive, but since this had the
2382 * effect of blowing the CPU stack on complex regexes, it has been
2383 * restructured to be iterative, and to save state onto the heap rather
2384 * than the stack. Essentially whereever regmatch() used to be called, it
2385 * pushes the current state, notes where to return, then jumps back into
2386 * the main loop.
2387 *
2388 * Originally the structure of this function used to look something like
2389
2390 S_regmatch() {
2391 int a = 1, b = 2;
2392 ...
2393 while (scan != NULL) {
5d9a96ca 2394 a++; // do stuff with a and b
95b24440
DM
2395 ...
2396 switch (OP(scan)) {
2397 case FOO: {
2398 int local = 3;
2399 ...
2400 if (regmatch(...)) // recurse
2401 goto yes;
2402 }
2403 ...
2404 }
2405 }
2406 yes:
2407 return 1;
2408 }
2409
2410 * Now it looks something like this:
2411
5d9a96ca 2412 typedef struct {
95b24440
DM
2413 int a, b, local;
2414 int resume_state;
5d9a96ca 2415 } regmatch_state;
95b24440
DM
2416
2417 S_regmatch() {
5d9a96ca
DM
2418 regmatch_state *st = new();
2419 int depth=0;
2420 st->a++; // do stuff with a and b
95b24440
DM
2421 ...
2422 while (scan != NULL) {
2423 ...
2424 switch (OP(scan)) {
2425 case FOO: {
5d9a96ca 2426 st->local = 3;
95b24440 2427 ...
5d9a96ca
DM
2428 st->scan = scan;
2429 scan = ...;
2430 st->resume_state = resume_FOO;
2431 goto start_recurse; // recurse
95b24440 2432
5d9a96ca
DM
2433 resume_point_FOO:
2434 if (result)
95b24440
DM
2435 goto yes;
2436 }
2437 ...
2438 }
5d9a96ca
DM
2439 start_recurse:
2440 st = new(); push a new state
2441 st->a = 1; st->b = 2;
2442 depth++;
95b24440 2443 }
5d9a96ca 2444 yes:
95b24440 2445 result = 1;
5d9a96ca
DM
2446 if (depth--) {
2447 st = pop();
95b24440
DM
2448 switch (resume_state) {
2449 case resume_FOO:
2450 goto resume_point_FOO;
2451 ...
2452 }
2453 }
2454 return result
2455 }
2456
2457 * WARNING: this means that any line in this function that contains a
2458 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2459 * regmatch() using gotos instead. Thus the values of any local variables
2460 * not saved in the regmatch_state structure will have been lost when
2461 * execution resumes on the next line .
5d9a96ca
DM
2462 *
2463 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2464 * PL_regmatch_state always points to the currently active state, and
2465 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2466 * The first time regmatch is called, the first slab is allocated, and is
2467 * never freed until interpreter desctruction. When the slab is full,
2468 * a new one is allocated chained to the end. At exit from regmatch, slabs
2469 * allocated since entry are freed.
d6a28714 2470 */
95b24440
DM
2471
2472
d6a28714 2473STATIC I32 /* 0 failure, 1 success */
3b0527fe 2474S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2475{
27da23d5 2476 dVAR;
95b24440 2477 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2478 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2479
3b0527fe
DM
2480 regexp *rex = reginfo->prog;
2481
5d9a96ca
DM
2482 regmatch_slab *orig_slab;
2483 regmatch_state *orig_state;
a3621e74 2484
5d9a96ca
DM
2485 /* the current state. This is a cached copy of PL_regmatch_state */
2486 register regmatch_state *st;
95b24440 2487
5d9a96ca
DM
2488 /* cache heavy used fields of st in registers */
2489 register regnode *scan;
2490 register regnode *next;
2491 register I32 n = 0; /* initialize to shut up compiler warning */
2492 register char *locinput = PL_reginput;
95b24440 2493
5d9a96ca
DM
2494 /* these variables are NOT saved during a recusive RFEGMATCH: */
2495 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2496 bool result; /* return value of S_regmatch */
2497 regnode *inner; /* Next node in internal branch. */
2498 int depth = 0; /* depth of recursion */
aa283a38 2499 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2500 regmatch_state *yes_state = NULL; /* state to pop to on success of
2501 subpattern */
95b24440
DM
2502
2503#ifdef DEBUGGING
ab74612d 2504 SV *re_debug_flags = NULL;
a3621e74 2505 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2506 PL_regindent++;
2507#endif
2508
5d9a96ca
DM
2509 /* on first ever call to regmatch, allocate first slab */
2510 if (!PL_regmatch_slab) {
2511 Newx(PL_regmatch_slab, 1, regmatch_slab);
2512 PL_regmatch_slab->prev = NULL;
2513 PL_regmatch_slab->next = NULL;
86545054 2514 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2515 }
2516
2517 /* remember current high-water mark for exit */
2518 /* XXX this should be done with SAVE* instead */
2519 orig_slab = PL_regmatch_slab;
2520 orig_state = PL_regmatch_state;
2521
2522 /* grab next free state slot */
2523 st = ++PL_regmatch_state;
86545054 2524 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2525 st = PL_regmatch_state = S_push_slab(aTHX);
2526
2527 st->minmod = 0;
2528 st->sw = 0;
2529 st->logical = 0;
2530 st->unwind = 0;
2531 st->cc = NULL;
d6a28714
JH
2532 /* Note that nextchr is a byte even in UTF */
2533 nextchr = UCHARAT(locinput);
2534 scan = prog;
2535 while (scan != NULL) {
8ba1375e 2536
a3621e74 2537 DEBUG_EXECUTE_r( {
6136c704 2538 SV * const prop = sv_newmortal();
1df70142
AL
2539 const int docolor = *PL_colors[0];
2540 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2541 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2542 /* The part of the string before starttry has one color
2543 (pref0_len chars), between starttry and current
2544 position another one (pref_len - pref0_len chars),
2545 after the current position the third one.
2546 We assume that pref0_len <= pref_len, otherwise we
2547 decrease pref0_len. */
9041c2e3 2548 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2549 ? (5 + taill) - l : locinput - PL_bostr;
2550 int pref0_len;
d6a28714 2551
df1ffd02 2552 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2553 pref_len++;
2554 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2555 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2556 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2557 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2558 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2559 l--;
d6a28714
JH
2560 if (pref0_len < 0)
2561 pref0_len = 0;
2562 if (pref0_len > pref_len)
2563 pref0_len = pref_len;
32fc9b6a 2564 regprop(rex, prop, scan);
2a782b5b 2565 {
1df70142 2566 const char * const s0 =
f14c76ed 2567 do_utf8 && OP(scan) != CANY ?
95b24440 2568 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2569 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2570 locinput - pref_len;
1df70142
AL
2571 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2572 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2573 pv_uni_display(PERL_DEBUG_PAD(1),
2574 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2575 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2576 locinput - pref_len + pref0_len;
1df70142
AL
2577 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2578 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2579 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2580 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2581 locinput;
1df70142 2582 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2583 PerlIO_printf(Perl_debug_log,
2584 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2585 (IV)(locinput - PL_bostr),
2586 PL_colors[4],
2587 len0, s0,
2588 PL_colors[5],
2589 PL_colors[2],
2590 len1, s1,
2591 PL_colors[3],
2592 (docolor ? "" : "> <"),
2593 PL_colors[0],
2594 len2, s2,
2595 PL_colors[1],
2596 15 - l - pref_len + 1,
2597 "",
4f639d21 2598 (IV)(scan - rex->program), PL_regindent*2, "",
3f7c398e 2599 SvPVX_const(prop));
2a782b5b
JH
2600 }
2601 });
d6a28714
JH
2602
2603 next = scan + NEXT_OFF(scan);
2604 if (next == scan)
2605 next = NULL;
2606
2607 switch (OP(scan)) {
2608 case BOL:
7fba1cd6 2609 if (locinput == PL_bostr)
d6a28714 2610 {
3b0527fe 2611 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2612 break;
2613 }
d6a28714
JH
2614 sayNO;
2615 case MBOL:
12d33761
HS
2616 if (locinput == PL_bostr ||
2617 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2618 {
b8c5462f
JH
2619 break;
2620 }
d6a28714
JH
2621 sayNO;
2622 case SBOL:
c2a73568 2623 if (locinput == PL_bostr)
b8c5462f 2624 break;
d6a28714
JH
2625 sayNO;
2626 case GPOS:
3b0527fe 2627 if (locinput == reginfo->ganch)
d6a28714
JH
2628 break;
2629 sayNO;
2630 case EOL:
d6a28714
JH
2631 goto seol;
2632 case MEOL:
d6a28714 2633 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2634 sayNO;
b8c5462f 2635 break;
d6a28714
JH
2636 case SEOL:
2637 seol:
2638 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2639 sayNO;
d6a28714 2640 if (PL_regeol - locinput > 1)
b8c5462f 2641 sayNO;
b8c5462f 2642 break;
d6a28714
JH
2643 case EOS:
2644 if (PL_regeol != locinput)
b8c5462f 2645 sayNO;
d6a28714 2646 break;
ffc61ed2 2647 case SANY:
d6a28714 2648 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2649 sayNO;
f33976b4
DB
2650 if (do_utf8) {
2651 locinput += PL_utf8skip[nextchr];
2652 if (locinput > PL_regeol)
2653 sayNO;
2654 nextchr = UCHARAT(locinput);
2655 }
2656 else
2657 nextchr = UCHARAT(++locinput);
2658 break;
2659 case CANY:
2660 if (!nextchr && locinput >= PL_regeol)
2661 sayNO;
b8c5462f 2662 nextchr = UCHARAT(++locinput);
a0d0e21e 2663 break;
ffc61ed2 2664 case REG_ANY:
1aa99e6b
IH
2665 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2666 sayNO;
2667 if (do_utf8) {
b8c5462f 2668 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2669 if (locinput > PL_regeol)
2670 sayNO;
a0ed51b3 2671 nextchr = UCHARAT(locinput);
a0ed51b3 2672 }
1aa99e6b
IH
2673 else
2674 nextchr = UCHARAT(++locinput);
a0ed51b3 2675 break;
a3621e74
YO
2676
2677
2678
2679 /*
2680 traverse the TRIE keeping track of all accepting states
2681 we transition through until we get to a failing node.
2682
a3621e74
YO
2683
2684 */
5b47454d 2685 case TRIE:
a3621e74
YO
2686 case TRIEF:
2687 case TRIEFL:
2688 {
a3621e74
YO
2689 U8 *uc = ( U8* )locinput;
2690 U32 state = 1;
2691 U16 charid = 0;
2692 U32 base = 0;
2693 UV uvc = 0;
2694 STRLEN len = 0;
2695 STRLEN foldlen = 0;
a3621e74
YO
2696 U8 *uscan = (U8*)NULL;
2697 STRLEN bufflen=0;
95b24440 2698 SV *sv_accept_buff = NULL;
5b47454d
DM
2699 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2700 trie_type = do_utf8 ?
2701 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2702 : trie_plain;
2703
7087a21c
NC
2704 /* what trie are we using right now */
2705 reg_trie_data *trie
32fc9b6a 2706 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
d8319b27 2707 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2708 result = 0;
a3621e74
YO
2709
2710 while ( state && uc <= (U8*)PL_regeol ) {
2711
5b47454d 2712 if (trie->states[ state ].wordnum) {
d8319b27 2713 if (!st->u.trie.accepted ) {
5b47454d
DM
2714 ENTER;
2715 SAVETMPS;
2716 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2717 sv_accept_buff=newSV(bufflen *
2718 sizeof(reg_trie_accepted) - 1);
2719 SvCUR_set(sv_accept_buff,
2720 sizeof(reg_trie_accepted));
2721 SvPOK_on(sv_accept_buff);
2722 sv_2mortal(sv_accept_buff);
d8319b27 2723 st->u.trie.accept_buff =
5b47454d
DM
2724 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2725 }
2726 else {
d8319b27 2727 if (st->u.trie.accepted >= bufflen) {
5b47454d 2728 bufflen *= 2;
d8319b27 2729 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2730 SvGROW(sv_accept_buff,
2731 bufflen * sizeof(reg_trie_accepted));
2732 }
2733 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2734 + sizeof(reg_trie_accepted));
2735 }
d8319b27
DM
2736 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2737 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2738 ++st->u.trie.accepted;
5b47454d 2739 }
a3621e74
YO
2740
2741 base = trie->states[ state ].trans.base;
2742
2743 DEBUG_TRIE_EXECUTE_r(
2744 PerlIO_printf( Perl_debug_log,
e4584336 2745 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2746 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27 2747 (UV)state, (UV)base, (UV)st->u.trie.accepted );
a3621e74
YO
2748 );
2749
2750 if ( base ) {
5b47454d
DM
2751 switch (trie_type) {
2752 case trie_uft8_fold:
a3621e74
YO
2753 if ( foldlen>0 ) {
2754 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2755 foldlen -= len;
2756 uscan += len;
2757 len=0;
2758 } else {
1df70142 2759 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2760 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2761 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2762 foldlen -= UNISKIP( uvc );
2763 uscan = foldbuf + UNISKIP( uvc );
2764 }
5b47454d
DM
2765 break;
2766 case trie_utf8:
2767 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2768 &len, uniflags );
2769 break;
2770 case trie_plain:
e4584336 2771 uvc = (UV)*uc;
a3621e74
YO
2772 len = 1;
2773 }
2774
5b47454d
DM
2775 if (uvc < 256) {
2776 charid = trie->charmap[ uvc ];
2777 }
2778 else {
2779 charid = 0;
2780 if (trie->widecharmap) {
2781 SV** svpp = (SV**)NULL;
2782 svpp = hv_fetch(trie->widecharmap,
2783 (char*)&uvc, sizeof(UV), 0);
2784 if (svpp)
2785 charid = (U16)SvIV(*svpp);
2786 }
2787 }
a3621e74 2788
5b47454d
DM
2789 if (charid &&
2790 (base + charid > trie->uniquecharcount )
2791 && (base + charid - 1 - trie->uniquecharcount
2792 < trie->lasttrans)
2793 && trie->trans[base + charid - 1 -
2794 trie->uniquecharcount].check == state)
2795 {
2796 state = trie->trans[base + charid - 1 -
2797 trie->uniquecharcount ].next;
2798 }
2799 else {
2800 state = 0;
2801 }
2802 uc += len;
2803
2804 }
2805 else {
a3621e74
YO
2806 state = 0;
2807 }
2808 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2809 PerlIO_printf( Perl_debug_log,
2810 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2811 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2812 );
2813 }
d8319b27 2814 if (!st->u.trie.accepted )
a3621e74 2815 sayNO;
a3621e74
YO
2816
2817 /*
2818 There was at least one accepting state that we
2819 transitioned through. Presumably the number of accepting
2820 states is going to be low, typically one or two. So we
2821 simply scan through to find the one with lowest wordnum.
2822 Once we find it, we swap the last state into its place
2823 and decrement the size. We then try to match the rest of
2824 the pattern at the point where the word ends, if we
2825 succeed then we end the loop, otherwise the loop
2826 eventually terminates once all of the accepting states
2827 have been tried.
2828 */
a3621e74 2829
d8319b27 2830 if ( st->u.trie.accepted == 1 ) {
a3621e74 2831 DEBUG_EXECUTE_r({
097eb12c 2832 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
a3621e74
YO
2833 PerlIO_printf( Perl_debug_log,
2834 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2835 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2836 st->u.trie.accept_buff[ 0 ].wordnum,
cfd0369c 2837 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2838 PL_colors[5] );
2839 });
d8319b27 2840 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
2841 /* in this case we free tmps/leave before we call regmatch
2842 as we wont be using accept_buff again. */
2843 FREETMPS;
2844 LEAVE;
95b24440
DM
2845 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2846 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2847 } else {
2848 DEBUG_EXECUTE_r(
e4584336 2849 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 2850 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
2851 PL_colors[5] );
2852 );
d8319b27 2853 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
2854 U32 best = 0;
2855 U32 cur;
d8319b27 2856 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
2857 DEBUG_TRIE_EXECUTE_r(
2858 PerlIO_printf( Perl_debug_log,
2859 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2860 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
2861 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2862 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 2863 );
a3621e74 2864
d8319b27
DM
2865 if (st->u.trie.accept_buff[cur].wordnum <
2866 st->u.trie.accept_buff[best].wordnum)
e822a8b4 2867 best = cur;
a3621e74
YO
2868 }
2869 DEBUG_EXECUTE_r({
87830502 2870 reg_trie_data * const trie = (reg_trie_data*)
32fc9b6a 2871 rex->data->data[ARG(scan)];
d8319b27 2872 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2873 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2874 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2875 st->u.trie.accept_buff[best].wordnum,
cfd0369c 2876 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2877 PL_colors[5] );
2878 });
d8319b27
DM
2879 if ( best<st->u.trie.accepted ) {
2880 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2881 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2882 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2883 best = st->u.trie.accepted;
a3621e74 2884 }
d8319b27 2885 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
2886
2887 /*
2888 as far as I can tell we only need the SAVETMPS/FREETMPS
2889 for re's with EVAL in them but I'm leaving them in for
2890 all until I can be sure.
2891 */
2892 SAVETMPS;
95b24440
DM
2893 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2894 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2895 FREETMPS;
2896 }
2897 FREETMPS;
2898 LEAVE;
2899 }
2900
95b24440 2901 if (result) {
a3621e74
YO
2902 sayYES;
2903 } else {
2904 sayNO;
2905 }
2906 }
2907 /* unreached codepoint */
95b24440
DM
2908 case EXACT: {
2909 char *s = STRING(scan);
5d9a96ca 2910 st->ln = STR_LEN(scan);
eb160463 2911 if (do_utf8 != UTF) {
bc517b45 2912 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2913 char *l = locinput;
5d9a96ca 2914 const char *e = s + st->ln;
a72c7584 2915
5ff6fc6d
JH
2916 if (do_utf8) {
2917 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2918 while (s < e) {
a3b680e6 2919 STRLEN ulen;
1aa99e6b 2920 if (l >= PL_regeol)
5ff6fc6d
JH
2921 sayNO;
2922 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2923 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2924 uniflags))
5ff6fc6d 2925 sayNO;
bc517b45 2926 l += ulen;
5ff6fc6d 2927 s ++;
1aa99e6b 2928 }
5ff6fc6d
JH
2929 }
2930 else {
2931 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2932 while (s < e) {
a3b680e6 2933 STRLEN ulen;
1aa99e6b
IH
2934 if (l >= PL_regeol)
2935 sayNO;
5ff6fc6d 2936 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2937 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2938 uniflags))
1aa99e6b 2939 sayNO;
bc517b45 2940 s += ulen;
a72c7584 2941 l ++;
1aa99e6b 2942 }
5ff6fc6d 2943 }
1aa99e6b
IH
2944 locinput = l;
2945 nextchr = UCHARAT(locinput);
2946 break;
2947 }
bc517b45 2948 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2949 /* Inline the first character, for speed. */
2950 if (UCHARAT(s) != nextchr)
2951 sayNO;
5d9a96ca 2952 if (PL_regeol - locinput < st->ln)
d6a28714 2953 sayNO;
5d9a96ca 2954 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2955 sayNO;
5d9a96ca 2956 locinput += st->ln;
d6a28714
JH
2957 nextchr = UCHARAT(locinput);
2958 break;
95b24440 2959 }
d6a28714 2960 case EXACTFL:
b8c5462f
JH
2961 PL_reg_flags |= RF_tainted;
2962 /* FALL THROUGH */
95b24440
DM
2963 case EXACTF: {
2964 char *s = STRING(scan);
5d9a96ca 2965 st->ln = STR_LEN(scan);
d6a28714 2966
d07ddd77
JH
2967 if (do_utf8 || UTF) {
2968 /* Either target or the pattern are utf8. */
d6a28714 2969 char *l = locinput;
d07ddd77 2970 char *e = PL_regeol;
bc517b45 2971
5d9a96ca 2972 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2973 l, &e, 0, do_utf8)) {
5486206c
JH
2974 /* One more case for the sharp s:
2975 * pack("U0U*", 0xDF) =~ /ss/i,
2976 * the 0xC3 0x9F are the UTF-8
2977 * byte sequence for the U+00DF. */
2978 if (!(do_utf8 &&
2979 toLOWER(s[0]) == 's' &&
5d9a96ca 2980 st->ln >= 2 &&
5486206c
JH
2981 toLOWER(s[1]) == 's' &&
2982 (U8)l[0] == 0xC3 &&
2983 e - l >= 2 &&
2984 (U8)l[1] == 0x9F))
2985 sayNO;
2986 }
d07ddd77
JH
2987 locinput = e;
2988 nextchr = UCHARAT(locinput);
2989 break;
a0ed51b3 2990 }
d6a28714 2991
bc517b45
JH
2992 /* Neither the target and the pattern are utf8. */
2993
d6a28714
JH
2994 /* Inline the first character, for speed. */
2995 if (UCHARAT(s) != nextchr &&
2996 UCHARAT(s) != ((OP(scan) == EXACTF)
2997 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2998 sayNO;
5d9a96ca 2999 if (PL_regeol - locinput < st->ln)
b8c5462f 3000 sayNO;
5d9a96ca
DM
3001 if (st->ln > 1 && (OP(scan) == EXACTF
3002 ? ibcmp(s, locinput, st->ln)
3003 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3004 sayNO;
5d9a96ca 3005 locinput += st->ln;
d6a28714 3006 nextchr = UCHARAT(locinput);
a0d0e21e 3007 break;
95b24440 3008 }
d6a28714 3009 case ANYOF:
ffc61ed2 3010 if (do_utf8) {
9e55ce06
JH
3011 STRLEN inclasslen = PL_regeol - locinput;
3012
32fc9b6a 3013 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3014 sayNO_ANYOF;
ffc61ed2
JH
3015 if (locinput >= PL_regeol)
3016 sayNO;
0f0076b4 3017 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3018 nextchr = UCHARAT(locinput);
e0f9d4a8 3019 break;
ffc61ed2
JH
3020 }
3021 else {
3022 if (nextchr < 0)
3023 nextchr = UCHARAT(locinput);
32fc9b6a 3024 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3025 sayNO_ANYOF;
ffc61ed2
JH
3026 if (!nextchr && locinput >= PL_regeol)
3027 sayNO;
3028 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3029 break;
3030 }
3031 no_anyof:
3032 /* If we might have the case of the German sharp s
3033 * in a casefolding Unicode character class. */
3034
ebc501f0
JH
3035 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3036 locinput += SHARP_S_SKIP;
e0f9d4a8 3037 nextchr = UCHARAT(locinput);
ffc61ed2 3038 }
e0f9d4a8
JH
3039 else
3040 sayNO;
b8c5462f 3041 break;
d6a28714 3042 case ALNUML:
b8c5462f
JH
3043 PL_reg_flags |= RF_tainted;
3044 /* FALL THROUGH */
d6a28714 3045 case ALNUM:
b8c5462f 3046 if (!nextchr)
4633a7c4 3047 sayNO;
ffc61ed2 3048 if (do_utf8) {
1a4fad37 3049 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3050 if (!(OP(scan) == ALNUM
3568d838 3051 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3052 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3053 {
3054 sayNO;
a0ed51b3 3055 }
b8c5462f 3056 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3057 nextchr = UCHARAT(locinput);
3058 break;
3059 }
ffc61ed2 3060 if (!(OP(scan) == ALNUM
d6a28714 3061 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3062 sayNO;
b8c5462f 3063 nextchr = UCHARAT(++locinput);
a0d0e21e 3064 break;
d6a28714 3065 case NALNUML:
b8c5462f
JH
3066 PL_reg_flags |= RF_tainted;
3067 /* FALL THROUGH */
d6a28714
JH
3068 case NALNUM:
3069 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3070 sayNO;
ffc61ed2 3071 if (do_utf8) {
1a4fad37 3072 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3073 if (OP(scan) == NALNUM
3568d838 3074 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3075 : isALNUM_LC_utf8((U8*)locinput))
3076 {
b8c5462f 3077 sayNO;
d6a28714 3078 }
b8c5462f
JH
3079 locinput += PL_utf8skip[nextchr];
3080 nextchr = UCHARAT(locinput);
3081 break;
3082 }
ffc61ed2 3083 if (OP(scan) == NALNUM
d6a28714 3084 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3085 sayNO;
76e3520e 3086 nextchr = UCHARAT(++locinput);
a0d0e21e 3087 break;
d6a28714
JH
3088 case BOUNDL:
3089 case NBOUNDL:
3280af22 3090 PL_reg_flags |= RF_tainted;
bbce6d69 3091 /* FALL THROUGH */
d6a28714
JH
3092 case BOUND:
3093 case NBOUND:
3094 /* was last char in word? */
ffc61ed2 3095 if (do_utf8) {
12d33761 3096 if (locinput == PL_bostr)
5d9a96ca 3097 st->ln = '\n';
ffc61ed2 3098 else {
a3b680e6 3099 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3100
4ad0818d 3101 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3102 }
3103 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3104 st->ln = isALNUM_uni(st->ln);
1a4fad37 3105 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3106 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3107 }
3108 else {
5d9a96ca 3109 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3110 n = isALNUM_LC_utf8((U8*)locinput);
3111 }
a0ed51b3 3112 }
d6a28714 3113 else {
5d9a96ca 3114 st->ln = (locinput != PL_bostr) ?
12d33761 3115 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3116 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3117 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3118 n = isALNUM(nextchr);
3119 }
3120 else {
5d9a96ca 3121 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3122 n = isALNUM_LC(nextchr);
3123 }
d6a28714 3124 }
5d9a96ca 3125 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3126 OP(scan) == BOUNDL))
3127 sayNO;
a0ed51b3 3128 break;
d6a28714 3129 case SPACEL:
3280af22 3130 PL_reg_flags |= RF_tainted;
bbce6d69 3131 /* FALL THROUGH */
d6a28714 3132 case SPACE:
9442cb0e 3133 if (!nextchr)
4633a7c4 3134 sayNO;
1aa99e6b 3135 if (do_utf8) {
fd400ab9 3136 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3137 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3138 if (!(OP(scan) == SPACE
3568d838 3139 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3140 : isSPACE_LC_utf8((U8*)locinput)))
3141 {
3142 sayNO;
3143 }
3144 locinput += PL_utf8skip[nextchr];
3145 nextchr = UCHARAT(locinput);
3146 break;
d6a28714 3147 }
ffc61ed2
JH
3148 if (!(OP(scan) == SPACE
3149 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3150 sayNO;
3151 nextchr = UCHARAT(++locinput);
3152 }
3153 else {
3154 if (!(OP(scan) == SPACE
3155 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3156 sayNO;
3157 nextchr = UCHARAT(++locinput);
a0ed51b3 3158 }
a0ed51b3 3159 break;
d6a28714 3160 case NSPACEL:
3280af22 3161 PL_reg_flags |= RF_tainted;
bbce6d69 3162 /* FALL THROUGH */
d6a28714 3163 case NSPACE:
9442cb0e 3164 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3165 sayNO;
1aa99e6b 3166 if (do_utf8) {
1a4fad37 3167 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3168 if (OP(scan) == NSPACE
3568d838 3169 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3170 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3171 {
3172 sayNO;
3173 }
3174 locinput += PL_utf8skip[nextchr];
3175 nextchr = UCHARAT(locinput);
3176 break;
a0ed51b3 3177 }
ffc61ed2 3178 if (OP(scan) == NSPACE
d6a28714 3179 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3180 sayNO;
76e3520e 3181 nextchr = UCHARAT(++locinput);
a0d0e21e 3182 break;
d6a28714 3183 case DIGITL:
a0ed51b3
LW
3184 PL_reg_flags |= RF_tainted;
3185 /* FALL THROUGH */
d6a28714 3186 case DIGIT:
9442cb0e 3187 if (!nextchr)
a0ed51b3 3188 sayNO;
1aa99e6b 3189 if (do_utf8) {
1a4fad37 3190 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3191 if (!(OP(scan) == DIGIT
3568d838 3192 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3193 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3194 {
a0ed51b3 3195 sayNO;
dfe13c55 3196 }
6f06b55f 3197 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3198 nextchr = UCHARAT(locinput);
3199 break;
3200 }
ffc61ed2 3201 if (!(OP(scan) == DIGIT
9442cb0e 3202 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3203 sayNO;
3204 nextchr = UCHARAT(++locinput);
3205 break;
d6a28714 3206 case NDIGITL:
b8c5462f
JH
3207 PL_reg_flags |= RF_tainted;
3208 /* FALL THROUGH */
d6a28714 3209 case NDIGIT:
9442cb0e 3210 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3211 sayNO;
1aa99e6b 3212 if (do_utf8) {
1a4fad37 3213 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3214 if (OP(scan) == NDIGIT
3568d838 3215 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3216 : isDIGIT_LC_utf8((U8*)locinput))
3217 {
a0ed51b3 3218 sayNO;
9442cb0e 3219 }
6f06b55f 3220 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3221 nextchr = UCHARAT(locinput);
3222 break;
3223 }
ffc61ed2 3224 if (OP(scan) == NDIGIT
9442cb0e 3225 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3226 sayNO;
3227 nextchr = UCHARAT(++locinput);
3228 break;
3229 case CLUMP:
b7c83a7e 3230 if (locinput >= PL_regeol)
a0ed51b3 3231 sayNO;
b7c83a7e 3232 if (do_utf8) {
1a4fad37 3233 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3234 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3235 sayNO;
3236 locinput += PL_utf8skip[nextchr];
3237 while (locinput < PL_regeol &&
3238 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3239 locinput += UTF8SKIP(locinput);
3240 if (locinput > PL_regeol)
3241 sayNO;
eb08e2da
JH
3242 }
3243 else
3244 locinput++;
a0ed51b3
LW
3245 nextchr = UCHARAT(locinput);
3246 break;
c8756f30 3247 case REFFL:
3280af22 3248 PL_reg_flags |= RF_tainted;
c8756f30 3249 /* FALL THROUGH */
c277df42 3250 case REF:
95b24440
DM
3251 case REFF: {
3252 char *s;
c277df42 3253 n = ARG(scan); /* which paren pair */
5d9a96ca 3254 st->ln = PL_regstartp[n];
2c2d71f5 3255 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3256 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3257 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3258 if (st->ln == PL_regendp[n])
a0d0e21e 3259 break;
a0ed51b3 3260
5d9a96ca 3261 s = PL_bostr + st->ln;
1aa99e6b 3262 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3263 char *l = locinput;
a3b680e6 3264 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3265 /*
3266 * Note that we can't do the "other character" lookup trick as
3267 * in the 8-bit case (no pun intended) because in Unicode we
3268 * have to map both upper and title case to lower case.
3269 */
3270 if (OP(scan) == REFF) {
3271 while (s < e) {
a3b680e6
AL
3272 STRLEN ulen1, ulen2;
3273 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3274 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3275
a0ed51b3
LW
3276 if (l >= PL_regeol)
3277 sayNO;
a2a2844f
JH
3278 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3279 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3280 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3281 sayNO;
a2a2844f
JH
3282 s += ulen1;
3283 l += ulen2;
a0ed51b3
LW
3284 }
3285 }
3286 locinput = l;
3287 nextchr = UCHARAT(locinput);
3288 break;
3289 }
3290
a0d0e21e 3291 /* Inline the first character, for speed. */
76e3520e 3292 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3293 (OP(scan) == REF ||
3294 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3295 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3296 sayNO;
5d9a96ca
DM
3297 st->ln = PL_regendp[n] - st->ln;
3298 if (locinput + st->ln > PL_regeol)
4633a7c4 3299 sayNO;
5d9a96ca
DM
3300 if (st->ln > 1 && (OP(scan) == REF
3301 ? memNE(s, locinput, st->ln)
c8756f30 3302 : (OP(scan) == REFF
5d9a96ca
DM
3303 ? ibcmp(s, locinput, st->ln)
3304 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3305 sayNO;
5d9a96ca 3306 locinput += st->ln;
76e3520e 3307 nextchr = UCHARAT(locinput);
a0d0e21e 3308 break;
95b24440 3309 }
a0d0e21e
LW
3310
3311 case NOTHING:
c277df42 3312 case TAIL:
a0d0e21e
LW
3313 break;
3314 case BACK:
3315 break;
c277df42
IZ
3316 case EVAL:
3317 {
c277df42 3318 SV *ret;
8e5e9ebe 3319 {
4aabdb9b
DM
3320 /* execute the code in the {...} */
3321 dSP;
6136c704 3322 SV ** const before = SP;
4aabdb9b
DM
3323 OP_4tree * const oop = PL_op;
3324 COP * const ocurcop = PL_curcop;
3325 PAD *old_comppad;
4aabdb9b
DM
3326
3327 n = ARG(scan);
32fc9b6a 3328 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3329 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3330 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3331 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3332
8e5e9ebe
RGS
3333 CALLRUNOPS(aTHX); /* Scalar context. */
3334 SPAGAIN;
3335 if (SP == before)
075aa684 3336 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3337 else {
3338 ret = POPs;
3339 PUTBACK;
3340 }
4aabdb9b
DM
3341
3342 PL_op = oop;
3343 PAD_RESTORE_LOCAL(old_comppad);
3344 PL_curcop = ocurcop;
3345 if (!st->logical) {
3346 /* /(?{...})/ */
3347 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3348 break;
3349 }
8e5e9ebe 3350 }
4aabdb9b
DM
3351 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3352 regexp *re;
4aabdb9b 3353 {
4f639d21
DM
3354 /* extract RE object from returned value; compiling if
3355 * necessary */
3356
6136c704 3357 MAGIC *mg = NULL;
4aabdb9b 3358 SV *sv;
faf82a0b
AE
3359 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3360 mg = mg_find(sv, PERL_MAGIC_qr);
3361 else if (SvSMAGICAL(ret)) {
3362 if (SvGMAGICAL(ret))
3363 sv_unmagic(ret, PERL_MAGIC_qr);
3364 else
3365 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3366 }
faf82a0b 3367
0f5d15d6
IZ
3368 if (mg) {
3369 re = (regexp *)mg->mg_obj;
df0003d4 3370 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3371 }
3372 else {
3373 STRLEN len;
6136c704 3374 const char * const t = SvPV_const(ret, len);
0f5d15d6 3375 PMOP pm;
a3b680e6 3376 const I32 osize = PL_regsize;
0f5d15d6 3377
5fcd1c1b 3378 Zero(&pm, 1, PMOP);
4aabdb9b 3379 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3380 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3381 if (!(SvFLAGS(ret)
faf82a0b
AE
3382 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3383 | SVs_GMG)))
14befaf4
DM
3384 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3385 PERL_MAGIC_qr,0,0);
0f5d15d6 3386 PL_regsize = osize;
0f5d15d6 3387 }
4aabdb9b 3388 }
aa283a38
DM
3389
3390 /* run the pattern returned from (??{...}) */
3391
4aabdb9b
DM
3392 DEBUG_EXECUTE_r(
3393 PerlIO_printf(Perl_debug_log,
3394 "Entering embedded \"%s%.60s%s%s\"\n",
3395 PL_colors[0],
3396 re->precomp,
3397 PL_colors[1],
3398 (strlen(re->precomp) > 60 ? "..." : ""))
3399 );
2c2d71f5 3400
4aabdb9b
DM
3401 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3402 REGCP_SET(st->u.eval.lastcp);
4aabdb9b
DM
3403 *PL_reglastparen = 0;
3404 *PL_reglastcloseparen = 0;
4aabdb9b 3405 PL_reginput = locinput;
4aabdb9b
DM
3406
3407 /* XXXX This is too dramatic a measure... */
3408 PL_reg_maxiter = 0;
3409
5d9a96ca 3410 st->logical = 0;
aa283a38
DM
3411 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3412 ((re->reganch & ROPT_UTF8) != 0);
3413 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3414 st->u.eval.prev_rex = rex;
aa283a38 3415 rex = re;
aa283a38 3416
77cb431f
DM
3417 /* resume to current state on success */
3418 st->u.yes.prev_yes_state = yes_state;
3419 yes_state = st;
aa283a38
DM
3420 PUSH_STATE(newst, resume_EVAL);
3421 st = newst;
3422
3423 /* now continue from first node in postoned RE */
3424 next = re->program + 1;
3425 break;
4aabdb9b 3426 /* NOTREACHED */
a0ed51b3 3427 }
4aabdb9b
DM
3428 /* /(?(?{...})X|Y)/ */
3429 st->sw = SvTRUE(ret);
3430 st->logical = 0;
c277df42
IZ
3431 break;
3432 }
a0d0e21e 3433 case OPEN:
c277df42 3434 n = ARG(scan); /* which paren pair */
3280af22
NIS
3435 PL_reg_start_tmp[n] = locinput;
3436 if (n > PL_regsize)
3437 PL_regsize = n;
a0d0e21e
LW
3438 break;
3439 case CLOSE:
c277df42 3440 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3441 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3442 PL_regendp[n] = locinput - PL_bostr;
eb160463 3443 if (n > (I32)*PL_reglastparen)
3280af22 3444 *PL_reglastparen = n;
a01268b5 3445 *PL_reglastcloseparen = n;
a0d0e21e 3446 break;
c277df42
IZ
3447 case GROUPP:
3448 n = ARG(scan); /* which paren pair */
5d9a96ca 3449 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3450 break;
3451 case IFTHEN:
2c2d71f5 3452 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3453 if (st->sw)
c277df42
IZ
3454 next = NEXTOPER(NEXTOPER(scan));
3455 else {
3456 next = scan + ARG(scan);
3457 if (OP(next) == IFTHEN) /* Fake one. */
3458 next = NEXTOPER(NEXTOPER(next));
3459 }
3460 break;
3461 case LOGICAL:
5d9a96ca 3462 st->logical = scan->flags;
c277df42 3463 break;
2ab05381 3464/*******************************************************************
a0374537
DM
3465 cc points to the regmatch_state associated with the most recent CURLYX.
3466 This struct contains info about the innermost (...)* loop (an
3467 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3468
3469 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3470
95b24440 3471 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3472
a0374537 3473 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3474 with the starting point at WHILEM node;
2ab05381
IZ
3475
3476 3) Each hit of WHILEM node tries to match A and Z (in the order
3477 depending on the current iteration, min/max of {min,max} and
3478 greediness). The information about where are nodes for "A"
a0374537 3479 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3480 was already matched, and greediness.
3481
3482 4) After A matches, the same WHILEM node is hit again.
3483
95b24440 3484 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3485 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3486 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3487 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3488 of the external loop.
3489
a0374537 3490 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3491 and whatever it mentions via ->next, and additional attached trees
3492 corresponding to temporarily unset infoblocks as in "5" above.
3493
95b24440 3494 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3495 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3496 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3497 infoblocks are drawn below the "reset" infoblock.
3498
3499 In fact in the picture below we do not show failed matches for Z and T
3500 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3501 more obvious *why* one needs to *temporary* unset infoblocks.]
3502
3503 Matched REx position InfoBlocks Comment
3504 (Y(A)*?Z)*?T x
3505 Y(A)*?Z)*?T x <- O
3506 Y (A)*?Z)*?T x <- O
3507 Y A)*?Z)*?T x <- O <- I
3508 YA )*?Z)*?T x <- O <- I
3509 YA A)*?Z)*?T x <- O <- I
3510 YAA )*?Z)*?T x <- O <- I
3511 YAA Z)*?T x <- O # Temporary unset I
3512 I
3513
3514 YAAZ Y(A)*?Z)*?T x <- O
3515 I
3516
3517 YAAZY (A)*?Z)*?T x <- O
3518 I
3519
3520 YAAZY A)*?Z)*?T x <- O <- I
3521 I
3522
3523 YAAZYA )*?Z)*?T x <- O <- I
3524 I
3525
3526 YAAZYA Z)*?T x <- O # Temporary unset I
3527 I,I
3528
3529 YAAZYAZ )*?T x <- O
3530 I,I
3531
3532 YAAZYAZ T x # Temporary unset O
3533 O
3534 I,I
3535
3536 YAAZYAZT x
3537 O
3538 I,I
3539 *******************************************************************/
95b24440 3540
a0d0e21e 3541 case CURLYX: {
cb434fcc
IZ
3542 /* No need to save/restore up to this paren */
3543 I32 parenfloor = scan->flags;
c277df42 3544
c2b7afd3
NC
3545 /* Dave says:
3546
3547 CURLYX and WHILEM are always paired: they're the moral
3548 equivalent of pp_enteriter anbd pp_iter.
3549
3550 The only time next could be null is if the node tree is
3551 corrupt. This was mentioned on p5p a few days ago.
3552
3553 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3554 So we'll assert that this is true:
3555 */
3556 assert(next);
30b2893d 3557 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3558 next += ARG(next);
cb434fcc
IZ
3559 /* XXXX Probably it is better to teach regpush to support
3560 parenfloor > PL_regsize... */
eb160463 3561 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3562 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3563
d8319b27
DM
3564 st->u.curlyx.cp = PL_savestack_ix;
3565 st->u.curlyx.outercc = st->cc;
a0374537
DM
3566 st->cc = st;
3567 /* these fields contain the state of the current curly.
3568 * they are accessed by subsequent WHILEMs;
3569 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3570 st->u.curlyx.parenfloor = parenfloor;
3571 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3572 st->u.curlyx.min = ARG1(scan);
3573 st->u.curlyx.max = ARG2(scan);
3574 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3575 st->u.curlyx.lastloc = 0;
a0374537
DM
3576 /* st->next and st->minmod are also read by WHILEM */
3577
3280af22 3578 PL_reginput = locinput;
95b24440
DM
3579 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3580 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3581 regcpblow(st->u.curlyx.cp);
3582 st->cc = st->u.curlyx.outercc;
95b24440 3583 saySAME(result);
a0d0e21e 3584 }
5f66b61c 3585 /* NOTREACHED */
a0d0e21e
LW
3586 case WHILEM: {
3587 /*
3588 * This is really hard to understand, because after we match
3589 * what we're trying to match, we must make sure the rest of
2c2d71f5 3590 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3591 * to go back UP the parse tree by recursing ever deeper. And
3592 * if it fails, we have to reset our parent's current state
3593 * that we can try again after backing off.
3594 */
3595
c2b7afd3
NC
3596 /* Dave says:
3597
3598 st->cc gets initialised by CURLYX ready for use by WHILEM.
3599 So again, unless somethings been corrupted, st->cc cannot
3600 be null at that point in WHILEM.
3601
3602 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3603 So we'll assert that this is true:
3604 */
3605 assert(st->cc);
d8319b27
DM
3606 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3607 st->u.whilem.cache_offset = 0;
3608 st->u.whilem.cache_bit = 0;
c277df42 3609
d8319b27 3610 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3611 PL_reginput = locinput;
a0d0e21e 3612
a3621e74 3613 DEBUG_EXECUTE_r(
9041c2e3 3614 PerlIO_printf(Perl_debug_log,
91f3b821 3615 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3616 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3617 (long)n, (long)st->cc->u.curlyx.min,
3618 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3619 );
4633a7c4 3620
a0d0e21e
LW
3621 /* If degenerate scan matches "", assume scan done. */
3622
d8319b27
DM
3623 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3624 st->u.whilem.savecc = st->cc;
3625 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3626 if (st->cc)
d8319b27 3627 st->ln = st->cc->u.curlyx.cur;
a3621e74 3628 DEBUG_EXECUTE_r(
c3464db5
DD
3629 PerlIO_printf(Perl_debug_log,
3630 "%*s empty match detected, try continuation...\n",
3280af22 3631 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3632 );
d8319b27 3633 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3634 /*** all unsaved local vars undefined at this point */
d8319b27 3635 st->cc = st->u.whilem.savecc;
95b24440 3636 if (result)
4633a7c4 3637 sayYES;
d8319b27
DM
3638 if (st->cc->u.curlyx.outercc)
3639 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3640 sayNO;
a0d0e21e
LW
3641 }
3642
3643 /* First just match a string of min scans. */
3644
d8319b27
DM
3645 if (n < st->cc->u.curlyx.min) {
3646 st->cc->u.curlyx.cur = n;
3647 st->cc->u.curlyx.lastloc = locinput;
3648 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3649 /*** all unsaved local vars undefined at this point */
3650 if (result)
4633a7c4 3651 sayYES;
d8319b27
DM
3652 st->cc->u.curlyx.cur = n - 1;
3653 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3654 sayNO;
a0d0e21e
LW
3655 }
3656
2c2d71f5
JH
3657 if (scan->flags) {
3658 /* Check whether we already were at this position.
3659 Postpone detection until we know the match is not
3660 *that* much linear. */
3661 if (!PL_reg_maxiter) {
3662 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3663 PL_reg_leftiter = PL_reg_maxiter;
3664 }
3665 if (PL_reg_leftiter-- == 0) {
a3b680e6 3666 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3667 if (PL_reg_poscache) {
eb160463 3668 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3669 Renew(PL_reg_poscache, size, char);
3670 PL_reg_poscache_size = size;
3671 }
3672 Zero(PL_reg_poscache, size, char);
3673 }
3674 else {
3675 PL_reg_poscache_size = size;
a02a5408 3676 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3677 }
a3621e74 3678 DEBUG_EXECUTE_r(
2c2d71f5
JH
3679 PerlIO_printf(Perl_debug_log,
3680 "%sDetected a super-linear match, switching on caching%s...\n",
3681 PL_colors[4], PL_colors[5])
3682 );
3683 }
3684 if (PL_reg_leftiter < 0) {
d8319b27 3685 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3686
d8319b27
DM
3687 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3688 + st->u.whilem.cache_offset * (scan->flags>>4);
3689 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3690 st->u.whilem.cache_offset /= 8;
3691 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3692 DEBUG_EXECUTE_r(
2c2d71f5
JH
3693 PerlIO_printf(Perl_debug_log,
3694 "%*s already tried at this position...\n",
3695 REPORT_CODE_OFF+PL_regindent*2, "")
3696 );
3ab3c9b4
HS
3697 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3698 /* cache records success */
c2b0868c
HS
3699 sayYES;
3700 else
3ab3c9b4 3701 /* cache records failure */
c2b0868c 3702 sayNO_SILENT;
2c2d71f5 3703 }
2c2d71f5
JH
3704 }
3705 }
3706
a0d0e21e
LW
3707 /* Prefer next over scan for minimal matching. */
3708
5d9a96ca 3709 if (st->cc->minmod) {
d8319b27
DM
3710 st->u.whilem.savecc = st->cc;
3711 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3712 if (st->cc)
d8319b27
DM
3713 st->ln = st->cc->u.curlyx.cur;
3714 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3715 REGCP_SET(st->u.whilem.lastcp);
3716 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3717 /*** all unsaved local vars undefined at this point */
d8319b27 3718 st->cc = st->u.whilem.savecc;
95b24440 3719 if (result) {
d8319b27 3720 regcpblow(st->u.whilem.cp);
3ab3c9b4 3721 CACHEsayYES; /* All done. */
5f05dabc 3722 }
d8319b27 3723 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3724 regcppop(rex);
d8319b27
DM
3725 if (st->cc->u.curlyx.outercc)
3726 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3727
d8319b27 3728 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3729 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3730 && !(PL_reg_flags & RF_warned)) {
3731 PL_reg_flags |= RF_warned;
9014280d 3732 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3733 "Complex regular subexpression recursion",
3734 REG_INFTY - 1);
c277df42 3735 }
3ab3c9b4 3736 CACHEsayNO;
c277df42 3737 }
a687059c 3738
a3621e74 3739 DEBUG_EXECUTE_r(
c3464db5
DD
3740 PerlIO_printf(Perl_debug_log,
3741 "%*s trying longer...\n",
3280af22 3742 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3743 );
a0d0e21e 3744 /* Try scanning more and see if it helps. */
3280af22 3745 PL_reginput = locinput;
d8319b27
DM
3746 st->cc->u.curlyx.cur = n;
3747 st->cc->u.curlyx.lastloc = locinput;
3748 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3749 REGCP_SET(st->u.whilem.lastcp);
3750 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3751 /*** all unsaved local vars undefined at this point */
3752 if (result) {
d8319b27 3753 regcpblow(st->u.whilem.cp);
3ab3c9b4 3754 CACHEsayYES;
5f05dabc 3755 }
d8319b27 3756 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3757 regcppop(rex);
d8319b27
DM
3758 st->cc->u.curlyx.cur = n - 1;
3759 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3760 CACHEsayNO;
a0d0e21e
LW
3761 }
3762
3763 /* Prefer scan over next for maximal matching. */
3764
d8319b27
DM
3765 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3766 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3767 st->cc->u.curlyx.cur = n;
3768 st->cc->u.curlyx.lastloc = locinput;
3769 REGCP_SET(st->u.whilem.lastcp);
3770 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3771 /*** all unsaved local vars undefined at this point */
3772 if (result) {
d8319b27 3773 regcpblow(st->u.whilem.cp);
3ab3c9b4 3774 CACHEsayYES;
5f05dabc 3775 }
d8319b27 3776 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3777 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3778 PL_reginput = locinput;
a3621e74 3779 DEBUG_EXECUTE_r(
c3464db5
DD
3780 PerlIO_printf(Perl_debug_log,
3781 "%*s failed, try continuation...\n",
3280af22 3782 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3783 );
3784 }
9041c2e3 3785 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3786 && !(PL_reg_flags & RF_warned)) {
3280af22 3787 PL_reg_flags |= RF_warned;
9014280d 3788 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3789 "Complex regular subexpression recursion",
3790 REG_INFTY - 1);
a0d0e21e
LW
3791 }
3792
3793 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
3794 st->u.whilem.savecc = st->cc;
3795 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3796 if (st->cc)
d8319b27
DM
3797 st->ln = st->cc->u.curlyx.cur;
3798 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3799 /*** all unsaved local vars undefined at this point */
d8319b27 3800 st->cc = st->u.whilem.savecc;
95b24440 3801 if (result)
3ab3c9b4 3802 CACHEsayYES;
d8319b27
DM
3803 if (st->cc->u.curlyx.outercc)
3804 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3805 st->cc->u.curlyx.cur = n - 1;
3806 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3807 CACHEsayNO;
a0d0e21e 3808 }
5f66b61c 3809 /* NOTREACHED */
9041c2e3 3810 case BRANCHJ:
c277df42
IZ
3811 next = scan + ARG(scan);
3812 if (next == scan)
3813 next = NULL;
3814 inner = NEXTOPER(NEXTOPER(scan));
3815 goto do_branch;
9041c2e3 3816 case BRANCH:
c277df42
IZ
3817 inner = NEXTOPER(scan);
3818 do_branch:
3819 {
e822a8b4
DM
3820 I32 type;
3821 type = OP(scan);
ae5031b3 3822 if (!next || OP(next) != type) /* No choice. */
c277df42 3823 next = inner; /* Avoid recursion. */
a0d0e21e 3824 else {
a3b680e6 3825 const I32 lastparen = *PL_reglastparen;
02db2b7b 3826 /* Put unwinding data on stack */
6136c704
AL
3827 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3828 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3829
5d9a96ca
DM
3830 uw->prev = st->unwind;
3831 st->unwind = unwind1;
e822a8b4 3832 uw->type = ((type == BRANCH)
02db2b7b
IZ
3833 ? RE_UNWIND_BRANCH
3834 : RE_UNWIND_BRANCHJ);
3835 uw->lastparen = lastparen;
3836 uw->next = next;
3837 uw->locinput = locinput;
3838 uw->nextchr = nextchr;
3a2830be 3839 uw->minmod = st->minmod;
02db2b7b
IZ
3840#ifdef DEBUGGING
3841 uw->regindent = ++PL_regindent;
3842#endif
c277df42 3843
02db2b7b
IZ
3844 REGCP_SET(uw->lastcp);
3845
3846 /* Now go into the first branch */
3847 next = inner;
a687059c 3848 }
a0d0e21e
LW
3849 }
3850 break;
3851 case MINMOD:
5d9a96ca 3852 st->minmod = 1;
a0d0e21e 3853 break;
c277df42
IZ
3854 case CURLYM:
3855 {
d8319b27 3856 st->u.curlym.l = st->u.curlym.matches = 0;
9041c2e3 3857
c277df42 3858 /* We suppose that the next guy does not need
0e788c72 3859 backtracking: in particular, it is of constant non-zero length,
c277df42 3860 and has no parenths to influence future backrefs. */
5d9a96ca 3861 st->ln = ARG1(scan); /* min to match */
c277df42 3862 n = ARG2(scan); /* max to match */
d8319b27
DM
3863 st->u.curlym.paren = scan->flags;
3864 if (st->u.curlym.paren) {
3865 if (st->u.curlym.paren > PL_regsize)
3866 PL_regsize = st->u.curlym.paren;
3867 if (st->u.curlym.paren > (I32)*PL_reglastparen)
3868 *PL_reglastparen = st->u.curlym.paren;
c277df42 3869 }
dc45a647 3870 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
d8319b27 3871 if (st->u.curlym.paren)
c277df42 3872 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3873 PL_reginput = locinput;
d8319b27
DM
3874 st->u.curlym.maxwanted = st->minmod ? st->ln : n;
3875 if (st->u.curlym.maxwanted) {
3876 while (PL_reginput < PL_regeol && st->u.curlym.matches < st->u.curlym.maxwanted) {
dad79028
DM
3877 /* resume to current state on success */
3878 st->u.yes.prev_yes_state = yes_state;
3879 yes_state = st;
95b24440 3880 REGMATCH(scan, CURLYM1);
dad79028 3881 yes_state = st->u.yes.prev_yes_state;
95b24440
DM
3882 /*** all unsaved local vars undefined at this point */
3883 if (!result)
6407bf3b 3884 break;
d8319b27
DM
3885 /* on first match, determine length, u.curlym.l */
3886 if (!st->u.curlym.matches++) {
6407bf3b
DM
3887 if (PL_reg_match_utf8) {
3888 char *s = locinput;
3889 while (s < PL_reginput) {
d8319b27 3890 st->u.curlym.l++;
6407bf3b
DM
3891 s += UTF8SKIP(s);
3892 }
3893 }
3894 else {
d8319b27 3895 st->u.curlym.l = PL_reginput - locinput;
6407bf3b 3896 }
d8319b27
DM
3897 if (st->u.curlym.l == 0) {
3898 st->u.curlym.matches = st->u.curlym.maxwanted;
6407bf3b
DM
3899 break;
3900 }
3901 }
3902 locinput = PL_reginput;
3903 }
3904 }
3905
3906 PL_reginput = locinput;
3907
5d9a96ca
DM
3908 if (st->minmod) {
3909 st->minmod = 0;
d8319b27 3910 if (st->ln && st->u.curlym.matches < st->ln)
c277df42 3911 sayNO;
cca55fe3 3912 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3913 regnode *text_node = next;
3914
cca55fe3 3915 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3916
d8319b27 3917 if (! HAS_TEXT(text_node)) st->u.curlym.c1 = st->u.curlym.c2 = -1000;
5f80c4cf 3918 else {
cca55fe3 3919 if (PL_regkind[(U8)OP(text_node)] == REF) {
d8319b27 3920 st->u.curlym.c1 = st->u.curlym.c2 = -1000;
44a68960 3921 goto assume_ok_MM;
cca55fe3 3922 }
d8319b27 3923 else { st->u.curlym.c1 = (U8)*STRING(text_node); }
af5decee 3924 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 3925 st->u.curlym.c2 = PL_fold[st->u.curlym.c1];
af5decee 3926 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 3927 st->u.curlym.c2 = PL_fold_locale[st->u.curlym.c1];
5f80c4cf 3928 else
d8319b27 3929 st->u.curlym.c2 = st->u.curlym.c1;
5f80c4cf 3930 }
a0ed51b3
LW
3931 }
3932 else
d8319b27 3933 st->u.curlym.c1 = st->u.curlym.c2 = -1000;
cca55fe3 3934 assume_ok_MM:
d8319b27 3935 REGCP_SET(st->u.curlym.lastcp);
5d9a96ca 3936 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
c277df42 3937 /* If it could work, try it. */
d8319b27
DM
3938 if (st->u.curlym.c1 == -1000 ||
3939 UCHARAT(PL_reginput) == st->u.curlym.c1 ||
3940 UCHARAT(PL_reginput) == st->u.curlym.c2)
c277df42 3941 {
d8319b27 3942 if (st->u.curlym.paren) {
5d9a96ca 3943 if (st->ln) {
d8319b27
DM
3944 PL_regstartp[st->u.curlym.paren] =
3945 HOPc(PL_reginput, -st->u.curlym.l) - PL_bostr;
3946 PL_regendp[st->u.curlym.paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3947 }
3948 else
d8319b27 3949 PL_regendp[st->u.curlym.paren] = -1;
c277df42 3950 }
dad79028
DM
3951 /* resume to current state on success */
3952 st->u.yes.prev_yes_state = yes_state;
3953 yes_state = st;
95b24440 3954 REGMATCH(next, CURLYM2);
dad79028 3955 yes_state = st->u.yes.prev_yes_state;
95b24440
DM
3956 /*** all unsaved local vars undefined at this point */
3957 if (result)
dad79028
DM
3958 /* XXX tmp sayYES; */
3959 sayYES_FINAL;
d8319b27 3960 REGCP_UNWIND(st->u.curlym.lastcp);
c277df42
IZ
3961 }
3962 /* Couldn't or didn't -- move forward. */
3280af22 3963 PL_reginput = locinput;
dad79028
DM
3964 /* resume to current state on success */
3965 st->u.yes.prev_yes_state = yes_state;
3966 yes_state = st;
95b24440 3967 REGMATCH(scan, CURLYM3);
dad79028 3968 yes_state = st->u.yes.prev_yes_state;
95b24440
DM
3969 /*** all unsaved local vars undefined at this point */
3970 if (result) {
5d9a96ca 3971 st->ln++;
3280af22 3972 locinput = PL_reginput;
c277df42
IZ
3973 }
3974 else
3975 sayNO;
3976 }
a0ed51b3
LW
3977 }
3978 else {
a3621e74 3979 DEBUG_EXECUTE_r(
5c0ca799 3980 PerlIO_printf(Perl_debug_log,
95b24440
DM
3981 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3982 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
d8319b27 3983 (IV) st->u.curlym.matches, (IV)st->u.curlym.l)
c277df42 3984 );
d8319b27 3985 if (st->u.curlym.matches >= st->ln) {
cca55fe3 3986 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3987 regnode *text_node = next;
3988
cca55fe3 3989 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3990
d8319b27 3991 if (! HAS_TEXT(text_node)) st->u.curlym.c1 = st->u.curlym.c2 = -1000;
5f80c4cf 3992 else {
cca55fe3 3993 if (PL_regkind[(U8)OP(text_node)] == REF) {
d8319b27 3994 st->u.curlym.c1 = st->u.curlym.c2 = -1000;
44a68960 3995 goto assume_ok_REG;
cca55fe3 3996 }
d8319b27 3997 else { st->u.curlym.c1 = (U8)*STRING(text_node); }
cca55fe3 3998
af5decee 3999 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 4000 st->u.curlym.c2 = PL_fold[st->u.curlym.c1];
af5decee 4001 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 4002 st->u.curlym.c2 = PL_fold_locale[st->u.curlym.c1];
5f80c4cf 4003 else
d8319b27 4004 st->u.curlym.c2 = st->u.curlym.c1;
5f80c4cf 4005 }
a0ed51b3
LW
4006 }
4007 else
d8319b27 4008 st->u.curlym.c1 = st->u.curlym.c2 = -1000;
c277df42 4009 }
cca55fe3 4010 assume_ok_REG:
d8319b27
DM
4011 REGCP_SET(st->u.curlym.lastcp);
4012 while (st->u.curlym.matches >= st->ln) {
c277df42 4013 /* If it could work, try it. */
d8319b27
DM
4014 if (st->u.curlym.c1 == -1000 ||
4015 UCHARAT(PL_reginput) == st->u.curlym.c1 ||
4016 UCHARAT(PL_reginput) == st->u.curlym.c2)
a0ed51b3 4017 {
a3621e74 4018 DEBUG_EXECUTE_r(
6407bf3b
DM
4019 PerlIO_printf(Perl_debug_log,
4020 "%*s trying tail with matches=%"IVdf"...\n",
4021 (int)(REPORT_CODE_OFF+PL_regindent*2),
d8319b27 4022 "", (IV)st->u.curlym.matches)
a0ed51b3 4023 );
d8319b27
DM
4024 if (st->u.curlym.paren) {
4025 if (st->u.curlym.matches) {
4026 PL_regstartp[st->u.curlym.paren]
4027 = HOPc(PL_reginput, -st->u.curlym.l) - PL_bostr;
4028 PL_regendp[st->u.curlym.paren] = PL_reginput - PL_bostr;
c277df42 4029 }
a0ed51b3 4030 else
d8319b27 4031 PL_regendp[st->u.curlym.paren] = -1;
c277df42 4032 }
dad79028
DM
4033 /* resume to current state on success */
4034 st->u.yes.prev_yes_state = yes_state;
4035 yes_state = st;
95b24440 4036 REGMATCH(next, CURLYM4);
dad79028 4037 yes_state = st->u.yes.prev_yes_state;
95b24440
DM
4038 /*** all unsaved local vars undefined at this point */
4039 if (result)
dad79028
DM
4040 /* XXX tmp sayYES; */
4041 sayYES_FINAL;
d8319b27 4042 REGCP_UNWIND(st->u.curlym.lastcp);
a0ed51b3 4043 }
c277df42 4044 /* Couldn't or didn't -- back up. */
d8319b27
DM
4045 st->u.curlym.matches--;
4046 locinput = HOPc(locinput, -st->u.curlym.l);
3280af22 4047 PL_reginput = locinput;
c277df42
IZ
4048 }
4049 }
4050 sayNO;
5f66b61c 4051 /* NOTREACHED */
c277df42
IZ
4052 break;
4053 }
4054 case CURLYN:
d8319b27
DM
4055 st->u.plus.paren = scan->flags; /* Which paren to set */
4056 if (st->u.plus.paren > PL_regsize)
4057 PL_regsize = st->u.plus.paren;
4058 if (st->u.plus.paren > (I32)*PL_reglastparen)
4059 *PL_reglastparen = st->u.plus.paren;
5d9a96ca 4060 st->ln = ARG1(scan); /* min to match */
c277df42 4061 n = ARG2(scan); /* max to match */
dc45a647 4062 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 4063 goto repeat;
a0d0e21e 4064 case CURLY:
d8319b27 4065 st->u.plus.paren = 0;
5d9a96ca 4066 st->ln = ARG1(scan); /* min to match */
a0d0e21e 4067 n = ARG2(scan); /* max to match */
dc45a647 4068 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
4069 goto repeat;
4070 case STAR:
5d9a96ca 4071 st->ln = 0;
c277df42 4072 n = REG_INFTY;
a0d0e21e 4073 scan = NEXTOPER(scan);
d8319b27 4074 st->u.plus.paren = 0;
a0d0e21e
LW
4075 goto repeat;
4076 case PLUS:
5d9a96ca 4077 st->ln = 1;
c277df42
IZ
4078 n = REG_INFTY;
4079 scan = NEXTOPER(scan);
d8319b27 4080 st->u.plus.paren = 0;
c277df42 4081 repeat:
a0d0e21e
LW
4082 /*
4083 * Lookahead to avoid useless match attempts
4084 * when we know what character comes next.
4085 */
5f80c4cf
JP
4086
4087 /*
4088 * Used to only do .*x and .*?x, but now it allows
4089 * for )'s, ('s and (?{ ... })'s to be in the way
4090 * of the quantifier and the EXACT-like node. -- japhy
4091 */
4092
cca55fe3 4093 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4094 U8 *s;
4095 regnode *text_node = next;
4096
cca55fe3 4097 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 4098
d8319b27 4099 if (! HAS_TEXT(text_node)) st->u.plus.c1 = st->u.plus.c2 = -1000;
5f80c4cf 4100 else {
cca55fe3 4101 if (PL_regkind[(U8)OP(text_node)] == REF) {
d8319b27 4102 st->u.plus.c1 = st->u.plus.c2 = -1000;
44a68960 4103 goto assume_ok_easy;
cca55fe3
JP
4104 }
4105 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
4106
4107 if (!UTF) {
d8319b27 4108 st->u.plus.c2 = st->u.plus.c1 = *s;
f65d3ee7 4109 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 4110 st->u.plus.c2 = PL_fold[st->u.plus.c1];
f65d3ee7 4111 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 4112 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
1aa99e6b 4113 }
5f80c4cf 4114 else { /* UTF */
f65d3ee7 4115 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4116 STRLEN ulen1, ulen2;
89ebb4a3
JH
4117 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4118 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4119
4120 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4121 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4122
d8319b27 4123 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4124 uniflags);
d8319b27 4125 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4126 uniflags);
5f80c4cf
JP
4127 }
4128 else {
d8319b27 4129 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4130 uniflags);
5f80c4cf 4131 }
1aa99e6b
IH
4132 }
4133 }
bbce6d69 4134 }
a0d0e21e 4135 else
d8319b27 4136 st->u.plus.c1 = st->u.plus.c2 = -1000;
cca55fe3 4137 assume_ok_easy:
3280af22 4138 PL_reginput = locinput;
5d9a96ca
DM
4139 if (st->minmod) {
4140 st->minmod = 0;
32fc9b6a 4141 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4633a7c4 4142 sayNO;
a0ed51b3 4143 locinput = PL_reginput;
d8319b27
DM
4144 REGCP_SET(st->u.plus.lastcp);
4145 if (st->u.plus.c1 != -1000) {
4146 st->u.plus.old = locinput;
4147 st->u.plus.count = 0;
0fe9bf95 4148
1aa99e6b 4149 if (n == REG_INFTY) {
d8319b27 4150 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4151 if (do_utf8)
d8319b27
DM
4152 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4153 st->u.plus.e--;
1aa99e6b
IH
4154 }
4155 else if (do_utf8) {
5d9a96ca 4156 int m = n - st->ln;
d8319b27
DM
4157 for (st->u.plus.e = locinput;
4158 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4159 st->u.plus.e += UTF8SKIP(st->u.plus.e);
1aa99e6b
IH
4160 }
4161 else {
d8319b27
DM
4162 st->u.plus.e = locinput + n - st->ln;
4163 if (st->u.plus.e >= PL_regeol)
4164 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4165 }
0fe9bf95
IZ
4166 while (1) {
4167 /* Find place 'next' could work */
1aa99e6b 4168 if (!do_utf8) {
d8319b27
DM
4169 if (st->u.plus.c1 == st->u.plus.c2) {
4170 while (locinput <= st->u.plus.e &&
4171 UCHARAT(locinput) != st->u.plus.c1)
1aa99e6b
IH
4172 locinput++;
4173 } else {
d8319b27
DM
4174 while (locinput <= st->u.plus.e
4175 && UCHARAT(locinput) != st->u.plus.c1
4176 && UCHARAT(locinput) != st->u.plus.c2)
1aa99e6b
IH
4177 locinput++;
4178 }
d8319b27 4179 st->u.plus.count = locinput - st->u.plus.old;
1aa99e6b
IH
4180 }
4181 else {
d8319b27 4182 if (st->u.plus.c1 == st->u.plus.c2) {
a3b680e6 4183 STRLEN len;
872c91ae
JH
4184 /* count initialised to
4185 * utf8_distance(old, locinput) */
d8319b27 4186 while (locinput <= st->u.plus.e &&
872c91ae 4187 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4188 UTF8_MAXBYTES, &len,
d8319b27 4189 uniflags) != (UV)st->u.plus.c1) {
1aa99e6b 4190 locinput += len;
d8319b27 4191 st->u.plus.count++;
b2f2f093 4192 }
1aa99e6b 4193 } else {
a3b680e6 4194 STRLEN len;
872c91ae
JH
4195 /* count initialised to
4196 * utf8_distance(old, locinput) */
d8319b27 4197 while (locinput <= st->u.plus.e) {
872c91ae 4198 UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4199 UTF8_MAXBYTES, &len,
041457d9 4200 uniflags);
d8319b27 4201 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
1aa99e6b 4202 break;
b2f2f093 4203 locinput += len;
d8319b27 4204 st->u.plus.count++;
1aa99e6b
IH
4205 }
4206 }
0fe9bf95 4207 }
d8319b27 4208 if (locinput > st->u.plus.e)
0fe9bf95
IZ
4209 sayNO;
4210 /* PL_reginput == old now */
d8319b27 4211 if (locinput != st->u.plus.old) {
5d9a96ca 4212 st->ln = 1; /* Did some */
32fc9b6a 4213 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
0fe9bf95
IZ
4214 sayNO;
4215 }
4216 /* PL_reginput == locinput now */
d8319b27 4217 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
95b24440 4218 /*** all unsaved local vars undefined at this point */
0fe9bf95 4219 PL_reginput = locinput; /* Could be reset... */
d8319b27 4220 REGCP_UNWIND(st->u.plus.lastcp);
0fe9bf95 4221 /* Couldn't or didn't -- move forward. */
d8319b27 4222 st->u.plus.old = locinput;
1aa99e6b
IH
4223 if (do_utf8)
4224 locinput += UTF8SKIP(locinput);
4225 else
4226 locinput++;
d8319b27 4227 st->u.plus.count = 1;
0fe9bf95
IZ
4228 }
4229 }
4230 else
5d9a96ca 4231 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
1aa99e6b 4232 UV c;
d8319b27 4233 if (st->u.plus.c1 != -1000) {
1aa99e6b 4234 if (do_utf8)
872c91ae 4235 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4236 UTF8_MAXBYTES, 0,
041457d9 4237 uniflags);
1aa99e6b 4238 else
9041c2e3 4239 c = UCHARAT(PL_reginput);
2390ecbc 4240 /* If it could work, try it. */
d8319b27 4241 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
2390ecbc 4242 {
d8319b27 4243 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
95b24440 4244 /*** all unsaved local vars undefined at this point */
d8319b27 4245 REGCP_UNWIND(st->u.plus.lastcp);
2390ecbc 4246 }
1aa99e6b 4247 }
a0d0e21e 4248 /* If it could work, try it. */
d8319b27 4249 else if (st->u.plus.c1 == -1000)
bbce6d69 4250 {
d8319b27 4251 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
95b24440 4252 /*** all unsaved local vars undefined at this point */
d8319b27 4253 REGCP_UNWIND(st->u.plus.lastcp);
bbce6d69 4254 }
c277df42 4255 /* Couldn't or didn't -- move forward. */
a0ed51b3 4256 PL_reginput = locinput;
32fc9b6a 4257 if (regrepeat(rex, scan, 1)) {
5d9a96ca 4258 st->ln++;
a0ed51b3
LW
4259 locinput = PL_reginput;
4260 }
4261 else
4633a7c4 4262 sayNO;
a0d0e21e
LW
4263 }
4264 }
4265 else {
32fc9b6a 4266 n = regrepeat(rex, scan, n);
a0ed51b3 4267 locinput = PL_reginput;
5d9a96ca 4268 if (st->ln < n && PL_regkind[(U8)OP(next)] == EOL &&
7fba1cd6 4269 (OP(next) != MEOL ||
15272685
HS
4270 OP(next) == SEOL || OP(next) == EOS))
4271 {
5d9a96ca 4272 st->ln = n; /* why back off? */
1aeab75a
GS
4273 /* ...because $ and \Z can match before *and* after
4274 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4275 We should back off by one in this case. */
4276 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
5d9a96ca 4277 st->ln--;
1aeab75a 4278 }
d8319b27 4279 REGCP_SET(st->u.plus.lastcp);
1d5c262f 4280 {
8fa7f367 4281 UV c = 0;
5d9a96ca 4282 while (n >= st->ln) {
d8319b27 4283 if (st->u.plus.c1 != -1000) {
1aa99e6b 4284 if (do_utf8)
872c91ae 4285 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4286 UTF8_MAXBYTES, 0,
041457d9 4287 uniflags);
1aa99e6b 4288 else
9041c2e3 4289 c = UCHARAT(PL_reginput);
1aa99e6b 4290 }
c277df42 4291 /* If it could work, try it. */
d8319b27 4292 if (st->u.plus.c1 == -1000 || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
c277df42 4293 {
d8319b27 4294 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
95b24440 4295 /*** all unsaved local vars undefined at this point */
d8319b27 4296 REGCP_UNWIND(st->u.plus.lastcp);
c277df42
IZ
4297 }
4298 /* Couldn't or didn't -- back up. */
4299 n--;
dfe13c55 4300 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4301 }
a0d0e21e
LW
4302 }
4303 }
4633a7c4 4304 sayNO;
c277df42 4305 break;
a0d0e21e 4306 case END:
3b0527fe 4307 if (locinput < reginfo->till) {
a3621e74 4308 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4309 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4310 PL_colors[4],
4311 (long)(locinput - PL_reg_starttry),
3b0527fe 4312 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4313 PL_colors[5]));
4314 sayNO_FINAL; /* Cannot match: too short. */
4315 }
4316 PL_reginput = locinput; /* put where regtry can find it */
4317 sayYES_FINAL; /* Success! */
dad79028
DM
4318
4319 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4320 DEBUG_EXECUTE_r(
4321 PerlIO_printf(Perl_debug_log,
4322 "%*s %ssubpattern success...%s\n",
4323 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4324 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4325 sayYES_FINAL; /* Success! */
4326
4327 case SUSPEND: /* (?>FOO) */
4328 st->u.ifmatch.wanted = 1;
9fe1d20c 4329 PL_reginput = locinput;
9041c2e3 4330 goto do_ifmatch;
dad79028
DM
4331
4332 case UNLESSM: /* -ve lookaround: (?!FOO), or with flags, (?<!foo) */
4333 st->u.ifmatch.wanted = 0;
4334 goto ifmatch_trivial_fail_test;
4335
4336 case IFMATCH: /* +ve lookaround: (?=FOO), or with flags, (?<=foo) */
4337 st->u.ifmatch.wanted = 1;
4338 ifmatch_trivial_fail_test:
a0ed51b3 4339 if (scan->flags) {
52657f30 4340 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4341 if (!s) {
4342 /* trivial fail */
4343 if (st->logical) {
4344 st->logical = 0;
4345 st->sw = 1 - st->u.ifmatch.wanted;
4346 }
4347 else if (st->u.ifmatch.wanted)
4348 sayNO;
4349 next = scan + ARG(scan);
4350 if (next == scan)
4351 next = NULL;
4352 break;
4353 }
efb30f32 4354 PL_reginput = s;
a0ed51b3
LW
4355 }
4356 else
4357 PL_reginput = locinput;
4358
c277df42 4359 do_ifmatch:
dad79028
DM
4360 /* resume to current state on success */
4361 st->u.yes.prev_yes_state = yes_state;
4362 yes_state = st;
4363 PUSH_STATE(newst, resume_IFMATCH);
4364 st = newst;
4365 next = NEXTOPER(NEXTOPER(scan));
4366 break;
4367
c277df42 4368 case LONGJMP:
c277df42
IZ
4369 next = scan + ARG(scan);
4370 if (next == scan)
4371 next = NULL;
a0d0e21e
LW
4372 break;
4373 default:
b900a521 4374 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4375 PTR2UV(scan), OP(scan));
cea2e8a9 4376 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4377 }
95b24440 4378
02db2b7b 4379 reenter:
a0d0e21e 4380 scan = next;
95b24440
DM
4381 continue;
4382 /* NOTREACHED */
4383
4384 /* simulate recursively calling regmatch(), but without actually
4385 * recursing - ie save the current state on the heap rather than on
4386 * the stack, then re-enter the loop. This avoids complex regexes
4387 * blowing the processor stack */
4388
4389 start_recurse:
4390 {
5d9a96ca
DM
4391 /* push new state */
4392 regmatch_state *oldst = st;
4393
4394 depth++;
4395
4396 /* grab the next free state slot */
4397 st++;
86545054 4398 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4399 st = S_push_slab(aTHX);
4400 PL_regmatch_state = st;
4401
4402 oldst->next = next;
4403 oldst->n = n;
4404 oldst->locinput = locinput;
5d9a96ca
DM
4405
4406 st->cc = oldst->cc;
95b24440
DM
4407 locinput = PL_reginput;
4408 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4409 st->minmod = 0;
4410 st->sw = 0;
4411 st->logical = 0;
4412 st->unwind = 0;
95b24440
DM
4413#ifdef DEBUGGING
4414 PL_regindent++;
4415#endif
4416 }
a0d0e21e 4417 }
a687059c 4418
aa283a38
DM
4419
4420
a0d0e21e
LW
4421 /*
4422 * We get here only if there's trouble -- normally "case END" is
4423 * the terminating point.
4424 */
cea2e8a9 4425 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4426 /*NOTREACHED*/
4633a7c4
LW
4427 sayNO;
4428
7821416a 4429yes_final:
77cb431f
DM
4430
4431 if (yes_state) {
4432 /* we have successfully completed a subexpression, but we must now
4433 * pop to the state marked by yes_state and continue from there */
4434
dad79028
DM
4435 /*XXX tmp for CURLYM*/
4436 regmatch_slab *oslab = PL_regmatch_slab;
4437 regmatch_state *ost = st, *oys=yes_state;
4438 int odepth = depth;
4439
77cb431f
DM
4440 assert(st != yes_state);
4441 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4442 || yes_state > SLAB_LAST(PL_regmatch_slab))
4443 {
4444 /* not in this slab, pop slab */
4445 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4446 PL_regmatch_slab = PL_regmatch_slab->prev;
4447 st = SLAB_LAST(PL_regmatch_slab);
4448 }
4449 depth -= (st - yes_state);
dad79028 4450 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE TO (%d)\n", depth));
77cb431f
DM
4451 st = yes_state;
4452 yes_state = st->u.yes.prev_yes_state;
4453 PL_regmatch_state = st;
4454
4455 switch (st->resume_state) {
4456 case resume_EVAL:
4457 if (st->u.eval.toggleutf)
4458 PL_reg_flags ^= RF_utf8;
4459 ReREFCNT_dec(rex);
4460 rex = st->u.eval.prev_rex;
4461 /* XXXX This is too dramatic a measure... */
4462 PL_reg_maxiter = 0;
4463 /* Restore parens of the caller without popping the
4464 * savestack */
4465 {
4466 I32 tmp = PL_savestack_ix;
4467 PL_savestack_ix = st->u.eval.lastcp;
4468 regcppop(rex);
4469 PL_savestack_ix = tmp;
4470 }
4471 PL_reginput = locinput;
4472 /* continue at the node following the (??{...}) */
4473 next = st->next;
4474 goto reenter;
4475
dad79028
DM
4476 case resume_IFMATCH:
4477 if (st->logical) {
4478 st->logical = 0;
4479 st->sw = st->u.ifmatch.wanted;
4480 }
4481 else if (!st->u.ifmatch.wanted)
4482 sayNO;
4483
4484 if (OP(st->scan) == SUSPEND)
4485 locinput = PL_reginput;
4486 else {
4487 locinput = PL_reginput = st->locinput;
4488 nextchr = UCHARAT(locinput);
4489 }
4490 next = st->scan + ARG(st->scan);
4491 if (next == st->scan)
4492 next = NULL;
4493 goto reenter;
4494
4495 /* XXX tmp don't handle yes_state yet */
4496 case resume_CURLYM1:
4497 case resume_CURLYM2:
4498 case resume_CURLYM3:
4499 case resume_CURLYM4:
4500 PL_regmatch_slab =oslab;
4501 st = ost;
4502 PL_regmatch_state = st;
4503 depth = odepth;
4504 yes_state = oys;
4505 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "XXX revering a CURLYM\n"));
4506 goto yes;
4507
77cb431f
DM
4508 default:
4509 Perl_croak(aTHX_ "unexpected yes reume state");
4510 }
4511 }
4512
a3621e74 4513 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4514 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4515yes:
4516#ifdef DEBUGGING
3280af22 4517 PL_regindent--;
4633a7c4 4518#endif
02db2b7b 4519
95b24440 4520 result = 1;
aa283a38 4521 /* XXX this is duplicate(ish) code to that in the do_no section.
77cb431f
DM
4522 * eventually a yes should just pop the stack back to the current
4523 * yes_state */
aa283a38
DM
4524 if (depth) {
4525 /* restore previous state and re-enter */
4526 POP_STATE;
4527
4528 switch (st->resume_state) {
4529 case resume_TRIE1:
4530 goto resume_point_TRIE1;
4531 case resume_TRIE2:
4532 goto resume_point_TRIE2;
aa283a38
DM
4533 case resume_CURLYX:
4534 goto resume_point_CURLYX;
4535 case resume_WHILEM1:
4536 goto resume_point_WHILEM1;
4537 case resume_WHILEM2:
4538 goto resume_point_WHILEM2;
4539 case resume_WHILEM3:
4540 goto resume_point_WHILEM3;
4541 case resume_WHILEM4:
4542 goto resume_point_WHILEM4;
4543 case resume_WHILEM5:
4544 goto resume_point_WHILEM5;
4545 case resume_WHILEM6:
4546 goto resume_point_WHILEM6;
4547 case resume_CURLYM1:
4548 goto resume_point_CURLYM1;
4549 case resume_CURLYM2:
4550 goto resume_point_CURLYM2;
4551 case resume_CURLYM3:
4552 goto resume_point_CURLYM3;
4553 case resume_CURLYM4:
4554 goto resume_point_CURLYM4;
aa283a38
DM
4555 case resume_PLUS1:
4556 goto resume_point_PLUS1;
4557 case resume_PLUS2:
4558 goto resume_point_PLUS2;
4559 case resume_PLUS3:
4560 goto resume_point_PLUS3;
4561 case resume_PLUS4:
4562 goto resume_point_PLUS4;
77cb431f 4563
dad79028 4564 case resume_IFMATCH:
77cb431f 4565 case resume_EVAL:
aa283a38
DM
4566 default:
4567 Perl_croak(aTHX_ "regexp resume memory corruption");
4568 }
4569 }
4570 goto final_exit;
4633a7c4
LW
4571
4572no:
a3621e74 4573 DEBUG_EXECUTE_r(
7821416a
IZ
4574 PerlIO_printf(Perl_debug_log,
4575 "%*s %sfailed...%s\n",
e4584336 4576 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4577 );
4578 goto do_no;
4579no_final:
4580do_no:
5d9a96ca
DM
4581 if (st->unwind) {
4582 re_unwind_t * const uw = SSPTRt(st->unwind,re_unwind_t);
02db2b7b
IZ
4583
4584 switch (uw->type) {
4585 case RE_UNWIND_BRANCH:
4586 case RE_UNWIND_BRANCHJ:
4587 {
6136c704 4588 re_unwind_branch_t * const uwb = &(uw->branch);
a3b680e6 4589 const I32 lastparen = uwb->lastparen;
9041c2e3 4590
02db2b7b
IZ
4591 REGCP_UNWIND(uwb->lastcp);
4592 for (n = *PL_reglastparen; n > lastparen; n--)
4593 PL_regendp[n] = -1;
4594 *PL_reglastparen = n;
4595 scan = next = uwb->next;
3a2830be 4596 st->minmod = uwb->minmod;
9041c2e3
NIS
4597 if ( !scan ||
4598 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b 4599 ? BRANCH : BRANCHJ) ) { /* Failure */
5d9a96ca 4600 st->unwind = uwb->prev;
02db2b7b
IZ
4601#ifdef DEBUGGING
4602 PL_regindent--;
4603#endif
4604 goto do_no;
4605 }
4606 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4607 if ((n = (uwb->type == RE_UNWIND_BRANCH
4608 ? NEXT_OFF(next) : ARG(next))))
4609 next += n;
4610 else
4611 next = NULL; /* XXXX Needn't unwinding in this case... */
4612 uwb->next = next;
4613 next = NEXTOPER(scan);
4614 if (uwb->type == RE_UNWIND_BRANCHJ)
4615 next = NEXTOPER(next);
4616 locinput = uwb->locinput;
4617 nextchr = uwb->nextchr;
4618#ifdef DEBUGGING
4619 PL_regindent = uwb->regindent;
4620#endif
4621
4622 goto reenter;
4623 }
5f66b61c 4624 /* NOTREACHED */
02db2b7b
IZ
4625 default:
4626 Perl_croak(aTHX_ "regexp unwind memory corruption");
4627 }
5f66b61c 4628 /* NOTREACHED */
02db2b7b 4629 }
aa283a38 4630
4633a7c4 4631#ifdef DEBUGGING
3280af22 4632 PL_regindent--;
4633a7c4 4633#endif
95b24440 4634 result = 0;
5d9a96ca 4635
aa283a38
DM
4636 if (depth) {
4637 /* there's a previous state to backtrack to */
4638 POP_STATE;
5d9a96ca 4639 switch (st->resume_state) {
95b24440
DM
4640 case resume_TRIE1:
4641 goto resume_point_TRIE1;
4642 case resume_TRIE2:
4643 goto resume_point_TRIE2;
aa283a38
DM
4644 case resume_EVAL:
4645 /* we have failed an (??{...}). Restore state to the outer re
4646 * then re-throw the failure */
4647 if (st->u.eval.toggleutf)
4648 PL_reg_flags ^= RF_utf8;
4649 ReREFCNT_dec(rex);
4650 rex = st->u.eval.prev_rex;
77cb431f 4651 yes_state = st->u.yes.prev_yes_state;
aa283a38
DM
4652
4653 /* XXXX This is too dramatic a measure... */
4654 PL_reg_maxiter = 0;
4655
4656 PL_reginput = locinput;
4657 REGCP_UNWIND(st->u.eval.lastcp);
4658 regcppop(rex);
4659 goto do_no;
4660
95b24440
DM
4661 case resume_CURLYX:
4662 goto resume_point_CURLYX;
4663 case resume_WHILEM1:
4664 goto resume_point_WHILEM1;
4665 case resume_WHILEM2:
4666 goto resume_point_WHILEM2;
4667 case resume_WHILEM3:
4668 goto resume_point_WHILEM3;
4669 case resume_WHILEM4:
4670 goto resume_point_WHILEM4;
4671 case resume_WHILEM5:
4672 goto resume_point_WHILEM5;
4673 case resume_WHILEM6:
4674 goto resume_point_WHILEM6;
4675 case resume_CURLYM1:
4676 goto resume_point_CURLYM1;
4677 case resume_CURLYM2:
4678 goto resume_point_CURLYM2;
4679 case resume_CURLYM3:
4680 goto resume_point_CURLYM3;
4681 case resume_CURLYM4:
4682 goto resume_point_CURLYM4;
4683 case resume_IFMATCH:
dad79028
DM
4684 yes_state = st->u.yes.prev_yes_state;
4685 if (st->logical) {
4686 st->logical = 0;
4687 st->sw = !st->u.ifmatch.wanted;
4688 }
4689 else if (st->u.ifmatch.wanted)
4690 sayNO;
4691
4692 assert(OP(scan) != SUSPEND); /* XXX DAPM tmp */
4693 locinput = PL_reginput = st->locinput;
4694 nextchr = UCHARAT(locinput);
4695 next = scan + ARG(scan);
4696 if (next == scan)
4697 next = NULL;
4698 goto reenter;
4699
95b24440
DM
4700 case resume_PLUS1:
4701 goto resume_point_PLUS1;
4702 case resume_PLUS2:
4703 goto resume_point_PLUS2;
4704 case resume_PLUS3:
4705 goto resume_point_PLUS3;
4706 case resume_PLUS4:
4707 goto resume_point_PLUS4;
95b24440
DM
4708 default:
4709 Perl_croak(aTHX_ "regexp resume memory corruption");
4710 }
95b24440 4711 }
aa283a38
DM
4712
4713final_exit:
4714
5d9a96ca
DM
4715 /* restore original high-water mark */
4716 PL_regmatch_slab = orig_slab;
4717 PL_regmatch_state = orig_state;
4718
4719 /* free all slabs above current one */
4720 if (orig_slab->next) {
ad65c075 4721 regmatch_slab *osl, *sl = orig_slab->next;
5d9a96ca
DM
4722 orig_slab->next = NULL;
4723 while (sl) {
ad65c075 4724 osl = sl;
5d9a96ca 4725 sl = sl->next;
ad65c075 4726 Safefree(osl);
5d9a96ca
DM
4727 }
4728 }
4729
95b24440
DM
4730 return result;
4731
a687059c
LW
4732}
4733
4734/*
4735 - regrepeat - repeatedly match something simple, report how many
4736 */
4737/*
4738 * [This routine now assumes that it will only match on things of length 1.
4739 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4740 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4741 */
76e3520e 4742STATIC I32
32fc9b6a 4743S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4744{
27da23d5 4745 dVAR;
a0d0e21e 4746 register char *scan;
a0d0e21e 4747 register I32 c;
3280af22 4748 register char *loceol = PL_regeol;
a0ed51b3 4749 register I32 hardcount = 0;
53c4c00c 4750 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4751
3280af22 4752 scan = PL_reginput;
faf11cac
HS
4753 if (max == REG_INFTY)
4754 max = I32_MAX;
4755 else if (max < loceol - scan)
7f596f4c 4756 loceol = scan + max;
a0d0e21e 4757 switch (OP(p)) {
22c35a8c 4758 case REG_ANY:
1aa99e6b 4759 if (do_utf8) {
ffc61ed2 4760 loceol = PL_regeol;
1aa99e6b 4761 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4762 scan += UTF8SKIP(scan);
4763 hardcount++;
4764 }
4765 } else {
4766 while (scan < loceol && *scan != '\n')
4767 scan++;
a0ed51b3
LW
4768 }
4769 break;
ffc61ed2 4770 case SANY:
def8e4ea
JH
4771 if (do_utf8) {
4772 loceol = PL_regeol;
a0804c9e 4773 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4774 scan += UTF8SKIP(scan);
4775 hardcount++;
4776 }
4777 }
4778 else
4779 scan = loceol;
a0ed51b3 4780 break;
f33976b4
DB
4781 case CANY:
4782 scan = loceol;
4783 break;
090f7165
JH
4784 case EXACT: /* length of string is 1 */
4785 c = (U8)*STRING(p);
4786 while (scan < loceol && UCHARAT(scan) == c)
4787 scan++;
bbce6d69 4788 break;
4789 case EXACTF: /* length of string is 1 */
cd439c50 4790 c = (U8)*STRING(p);
bbce6d69 4791 while (scan < loceol &&
22c35a8c 4792 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4793 scan++;
4794 break;
4795 case EXACTFL: /* length of string is 1 */
3280af22 4796 PL_reg_flags |= RF_tainted;
cd439c50 4797 c = (U8)*STRING(p);
bbce6d69 4798 while (scan < loceol &&
22c35a8c 4799 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4800 scan++;
4801 break;
4802 case ANYOF:
ffc61ed2
JH
4803 if (do_utf8) {
4804 loceol = PL_regeol;
cfc92286 4805 while (hardcount < max && scan < loceol &&
32fc9b6a 4806 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4807 scan += UTF8SKIP(scan);
4808 hardcount++;
4809 }
4810 } else {
32fc9b6a 4811 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4812 scan++;
4813 }
a0d0e21e
LW
4814 break;
4815 case ALNUM:
1aa99e6b 4816 if (do_utf8) {
ffc61ed2 4817 loceol = PL_regeol;
1a4fad37 4818 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4819 while (hardcount < max && scan < loceol &&
3568d838 4820 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4821 scan += UTF8SKIP(scan);
4822 hardcount++;
4823 }
4824 } else {
4825 while (scan < loceol && isALNUM(*scan))
4826 scan++;
a0ed51b3
LW
4827 }
4828 break;
bbce6d69 4829 case ALNUML:
3280af22 4830 PL_reg_flags |= RF_tainted;
1aa99e6b 4831 if (do_utf8) {
ffc61ed2 4832 loceol = PL_regeol;
1aa99e6b
IH
4833 while (hardcount < max && scan < loceol &&
4834 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4835 scan += UTF8SKIP(scan);
4836 hardcount++;
4837 }
4838 } else {
4839 while (scan < loceol && isALNUM_LC(*scan))
4840 scan++;
a0ed51b3
LW
4841 }
4842 break;
a0d0e21e 4843 case NALNUM:
1aa99e6b 4844 if (do_utf8) {
ffc61ed2 4845 loceol = PL_regeol;
1a4fad37 4846 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4847 while (hardcount < max && scan < loceol &&
3568d838 4848 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4849 scan += UTF8SKIP(scan);
4850 hardcount++;
4851 }
4852 } else {
4853 while (scan < loceol && !isALNUM(*scan))
4854 scan++;
a0ed51b3
LW
4855 }
4856 break;
bbce6d69 4857 case NALNUML:
3280af22 4858 PL_reg_flags |= RF_tainted;
1aa99e6b 4859 if (do_utf8) {
ffc61ed2 4860 loceol = PL_regeol;
1aa99e6b
IH
4861 while (hardcount < max && scan < loceol &&
4862 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4863 scan += UTF8SKIP(scan);
4864 hardcount++;
4865 }
4866 } else {
4867 while (scan < loceol && !isALNUM_LC(*scan))
4868 scan++;
a0ed51b3
LW
4869 }
4870 break;
a0d0e21e 4871 case SPACE:
1aa99e6b 4872 if (do_utf8) {
ffc61ed2 4873 loceol = PL_regeol;
1a4fad37 4874 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4875 while (hardcount < max && scan < loceol &&
3568d838
JH
4876 (*scan == ' ' ||
4877 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4878 scan += UTF8SKIP(scan);
4879 hardcount++;
4880 }
4881 } else {
4882 while (scan < loceol && isSPACE(*scan))
4883 scan++;
a0ed51b3
LW
4884 }
4885 break;
bbce6d69 4886 case SPACEL:
3280af22 4887 PL_reg_flags |= RF_tainted;
1aa99e6b 4888 if (do_utf8) {
ffc61ed2 4889 loceol = PL_regeol;
1aa99e6b 4890 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4891 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4892 scan += UTF8SKIP(scan);
4893 hardcount++;
4894 }
4895 } else {
4896 while (scan < loceol && isSPACE_LC(*scan))
4897 scan++;
a0ed51b3
LW
4898 }
4899 break;
a0d0e21e 4900 case NSPACE:
1aa99e6b 4901 if (do_utf8) {
ffc61ed2 4902 loceol = PL_regeol;
1a4fad37 4903 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4904 while (hardcount < max && scan < loceol &&
3568d838
JH
4905 !(*scan == ' ' ||
4906 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4907 scan += UTF8SKIP(scan);
4908 hardcount++;
4909 }
4910 } else {
4911 while (scan < loceol && !isSPACE(*scan))
4912 scan++;
4913 break;
a0ed51b3 4914 }
bbce6d69 4915 case NSPACEL:
3280af22 4916 PL_reg_flags |= RF_tainted;
1aa99e6b 4917 if (do_utf8) {
ffc61ed2 4918 loceol = PL_regeol;
1aa99e6b 4919 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4920 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4921 scan += UTF8SKIP(scan);
4922 hardcount++;
4923 }
4924 } else {
4925 while (scan < loceol && !isSPACE_LC(*scan))
4926 scan++;
a0ed51b3
LW
4927 }
4928 break;
a0d0e21e 4929 case DIGIT:
1aa99e6b 4930 if (do_utf8) {
ffc61ed2 4931 loceol = PL_regeol;
1a4fad37 4932 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4933 while (hardcount < max && scan < loceol &&
3568d838 4934 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4935 scan += UTF8SKIP(scan);
4936 hardcount++;
4937 }
4938 } else {
4939 while (scan < loceol && isDIGIT(*scan))
4940 scan++;
a0ed51b3
LW
4941 }
4942 break;
a0d0e21e 4943 case NDIGIT:
1aa99e6b 4944 if (do_utf8) {
ffc61ed2 4945 loceol = PL_regeol;
1a4fad37 4946 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4947 while (hardcount < max && scan < loceol &&
3568d838 4948 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4949 scan += UTF8SKIP(scan);
4950 hardcount++;
4951 }
4952 } else {
4953 while (scan < loceol && !isDIGIT(*scan))
4954 scan++;
a0ed51b3
LW
4955 }
4956 break;
a0d0e21e
LW
4957 default: /* Called on something of 0 width. */
4958 break; /* So match right here or not at all. */
4959 }
a687059c 4960
a0ed51b3
LW
4961 if (hardcount)
4962 c = hardcount;
4963 else
4964 c = scan - PL_reginput;
3280af22 4965 PL_reginput = scan;
a687059c 4966
a3621e74 4967 DEBUG_r({
ab74612d 4968 SV *re_debug_flags = NULL;
6136c704 4969 SV * const prop = sv_newmortal();
a3621e74
YO
4970 GET_RE_DEBUG_FLAGS;
4971 DEBUG_EXECUTE_r({
32fc9b6a 4972 regprop(prog, prop, p);
9041c2e3
NIS
4973 PerlIO_printf(Perl_debug_log,
4974 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3f7c398e 4975 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
c277df42 4976 });
a3621e74 4977 });
9041c2e3 4978
a0d0e21e 4979 return(c);
a687059c
LW
4980}
4981
c277df42
IZ
4982
4983/*
ffc61ed2
JH
4984- regclass_swash - prepare the utf8 swash
4985*/
4986
4987SV *
32fc9b6a 4988Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4989{
97aff369 4990 dVAR;
9e55ce06
JH
4991 SV *sw = NULL;
4992 SV *si = NULL;
4993 SV *alt = NULL;
32fc9b6a 4994 const struct reg_data *data = prog ? prog->data : NULL;
ffc61ed2 4995
4f639d21 4996 if (data && data->count) {
a3b680e6 4997 const U32 n = ARG(node);
ffc61ed2 4998
4f639d21
DM
4999 if (data->what[n] == 's') {
5000 SV * const rv = (SV*)data->data[n];
890ce7af 5001 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5002 SV **const ary = AvARRAY(av);
9e55ce06 5003 SV **a, **b;
9041c2e3 5004
711a919c 5005 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5006 * documentation of these array elements. */
5007
b11f357e 5008 si = *ary;
8f7f7219 5009 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5010 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5011
ffc61ed2
JH
5012 if (a)
5013 sw = *a;
5014 else if (si && doinit) {
5015 sw = swash_init("utf8", "", si, 1, 0);
5016 (void)av_store(av, 1, sw);
5017 }
9e55ce06
JH
5018 if (b)
5019 alt = *b;
ffc61ed2
JH
5020 }
5021 }
5022
9e55ce06
JH
5023 if (listsvp)
5024 *listsvp = si;
5025 if (altsvp)
5026 *altsvp = alt;
ffc61ed2
JH
5027
5028 return sw;
5029}
5030
5031/*
ba7b4546 5032 - reginclass - determine if a character falls into a character class
832705d4
JH
5033
5034 The n is the ANYOF regnode, the p is the target string, lenp
5035 is pointer to the maximum length of how far to go in the p
5036 (if the lenp is zero, UTF8SKIP(p) is used),
5037 do_utf8 tells whether the target string is in UTF-8.
5038
bbce6d69 5039 */
5040
76e3520e 5041STATIC bool
32fc9b6a 5042S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5043{
27da23d5 5044 dVAR;
a3b680e6 5045 const char flags = ANYOF_FLAGS(n);
bbce6d69 5046 bool match = FALSE;
cc07378b 5047 UV c = *p;
ae9ddab8 5048 STRLEN len = 0;
9e55ce06 5049 STRLEN plen;
1aa99e6b 5050
19f67299
TS
5051 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5052 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5053 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5054 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5055 if (len == (STRLEN)-1)
5056 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5057 }
bbce6d69 5058
0f0076b4 5059 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5060 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5061 if (lenp)
5062 *lenp = 0;
ffc61ed2 5063 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5064 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5065 match = TRUE;
bbce6d69 5066 }
3568d838 5067 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5068 match = TRUE;
ffc61ed2 5069 if (!match) {
9e55ce06 5070 AV *av;
32fc9b6a 5071 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5072
5073 if (sw) {
3568d838 5074 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5075 match = TRUE;
5076 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5077 if (!match && lenp && av) {
5078 I32 i;
9e55ce06 5079 for (i = 0; i <= av_len(av); i++) {
890ce7af 5080 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5081 STRLEN len;
890ce7af 5082 const char * const s = SvPV_const(sv, len);
9e55ce06 5083
061b10df 5084 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5085 *lenp = len;
5086 match = TRUE;
5087 break;
5088 }
5089 }
5090 }
5091 if (!match) {
89ebb4a3 5092 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5093 STRLEN tmplen;
5094
9e55ce06
JH
5095 to_utf8_fold(p, tmpbuf, &tmplen);
5096 if (swash_fetch(sw, tmpbuf, do_utf8))
5097 match = TRUE;
5098 }
ffc61ed2
JH
5099 }
5100 }
bbce6d69 5101 }
9e55ce06 5102 if (match && lenp && *lenp == 0)
0f0076b4 5103 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5104 }
1aa99e6b 5105 if (!match && c < 256) {
ffc61ed2
JH
5106 if (ANYOF_BITMAP_TEST(n, c))
5107 match = TRUE;
5108 else if (flags & ANYOF_FOLD) {
eb160463 5109 U8 f;
a0ed51b3 5110
ffc61ed2
JH
5111 if (flags & ANYOF_LOCALE) {
5112 PL_reg_flags |= RF_tainted;
5113 f = PL_fold_locale[c];
5114 }
5115 else
5116 f = PL_fold[c];
5117 if (f != c && ANYOF_BITMAP_TEST(n, f))
5118 match = TRUE;
5119 }
5120
5121 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5122 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5123 if (
5124 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5125 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5126 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5127 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5128 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5129 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5130 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5131 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5132 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5133 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5134 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5135 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5136 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5137 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5138 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5139 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5140 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5141 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5142 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5143 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5144 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5145 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5146 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5147 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5148 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5149 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5150 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5151 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5152 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5153 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5154 ) /* How's that for a conditional? */
5155 {
5156 match = TRUE;
5157 }
a0ed51b3 5158 }
a0ed51b3
LW
5159 }
5160
a0ed51b3
LW
5161 return (flags & ANYOF_INVERT) ? !match : match;
5162}
161b471a 5163
dfe13c55 5164STATIC U8 *
5f66b61c 5165S_reghop3(U8 *s, I32 off, U8* lim)
9041c2e3 5166{
97aff369 5167 dVAR;
a0ed51b3 5168 if (off >= 0) {
1aa99e6b 5169 while (off-- && s < lim) {
ffc61ed2 5170 /* XXX could check well-formedness here */
a0ed51b3 5171 s += UTF8SKIP(s);
ffc61ed2 5172 }
a0ed51b3
LW
5173 }
5174 else {
5175 while (off++) {
1aa99e6b 5176 if (s > lim) {
a0ed51b3 5177 s--;
ffc61ed2 5178 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5179 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5180 s--;
ffc61ed2
JH
5181 }
5182 /* XXX could check well-formedness here */
a0ed51b3
LW
5183 }
5184 }
5185 }
5186 return s;
5187}
161b471a 5188
dfe13c55 5189STATIC U8 *
5f66b61c 5190S_reghopmaybe3(U8* s, I32 off, U8* lim)
a0ed51b3 5191{
97aff369 5192 dVAR;
a0ed51b3 5193 if (off >= 0) {
1aa99e6b 5194 while (off-- && s < lim) {
ffc61ed2 5195 /* XXX could check well-formedness here */
a0ed51b3 5196 s += UTF8SKIP(s);
ffc61ed2 5197 }
a0ed51b3
LW
5198 if (off >= 0)
5199 return 0;
5200 }
5201 else {
5202 while (off++) {
1aa99e6b 5203 if (s > lim) {
a0ed51b3 5204 s--;
ffc61ed2 5205 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5206 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5207 s--;
ffc61ed2
JH
5208 }
5209 /* XXX could check well-formedness here */
a0ed51b3
LW
5210 }
5211 else
5212 break;
5213 }
5214 if (off <= 0)
5215 return 0;
5216 }
5217 return s;
5218}
51371543 5219
51371543 5220static void
acfe0abc 5221restore_pos(pTHX_ void *arg)
51371543 5222{
97aff369 5223 dVAR;
097eb12c 5224 regexp * const rex = (regexp *)arg;
51371543
GS
5225 if (PL_reg_eval_set) {
5226 if (PL_reg_oldsaved) {
4f639d21
DM
5227 rex->subbeg = PL_reg_oldsaved;
5228 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5229#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5230 rex->saved_copy = PL_nrs;
ed252734 5231#endif
4f639d21 5232 RX_MATCH_COPIED_on(rex);
51371543
GS
5233 }
5234 PL_reg_magic->mg_len = PL_reg_oldpos;
5235 PL_reg_eval_set = 0;
5236 PL_curpm = PL_reg_oldcurpm;
5237 }
5238}
33b8afdf
JH
5239
5240STATIC void
5241S_to_utf8_substr(pTHX_ register regexp *prog)
5242{
33b8afdf 5243 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5244 SV* const sv = newSVsv(prog->float_substr);
5245 prog->float_utf8 = sv;
33b8afdf
JH
5246 sv_utf8_upgrade(sv);
5247 if (SvTAIL(prog->float_substr))
5248 SvTAIL_on(sv);
5249 if (prog->float_substr == prog->check_substr)
5250 prog->check_utf8 = sv;
5251 }
5252 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5253 SV* const sv = newSVsv(prog->anchored_substr);
5254 prog->anchored_utf8 = sv;
33b8afdf
JH
5255 sv_utf8_upgrade(sv);
5256 if (SvTAIL(prog->anchored_substr))
5257 SvTAIL_on(sv);
5258 if (prog->anchored_substr == prog->check_substr)
5259 prog->check_utf8 = sv;
5260 }
5261}
5262
5263STATIC void
5264S_to_byte_substr(pTHX_ register regexp *prog)
5265{
97aff369 5266 dVAR;
33b8afdf 5267 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5268 SV* sv = newSVsv(prog->float_utf8);
5269 prog->float_substr = sv;
33b8afdf
JH
5270 if (sv_utf8_downgrade(sv, TRUE)) {
5271 if (SvTAIL(prog->float_utf8))
5272 SvTAIL_on(sv);
5273 } else {
5274 SvREFCNT_dec(sv);
5275 prog->float_substr = sv = &PL_sv_undef;
5276 }
5277 if (prog->float_utf8 == prog->check_utf8)
5278 prog->check_substr = sv;
5279 }
5280 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5281 SV* sv = newSVsv(prog->anchored_utf8);
5282 prog->anchored_substr = sv;
33b8afdf
JH
5283 if (sv_utf8_downgrade(sv, TRUE)) {
5284 if (SvTAIL(prog->anchored_utf8))
5285 SvTAIL_on(sv);
5286 } else {
5287 SvREFCNT_dec(sv);
5288 prog->anchored_substr = sv = &PL_sv_undef;
5289 }
5290 if (prog->anchored_utf8 == prog->check_utf8)
5291 prog->check_substr = sv;
5292 }
5293}
66610fdd
RGS
5294
5295/*
5296 * Local variables:
5297 * c-indentation-style: bsd
5298 * c-basic-offset: 4
5299 * indent-tabs-mode: t
5300 * End:
5301 *
37442d52
RGS
5302 * ex: set ts=8 sts=4 sw=4 noet:
5303 */