This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
disable 'split /(?{ split "" })/' test until recursive split is fixed
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
32fc9b6a 110#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
52657f30
AL
119#define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
120 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
121 : (U8*)(pos + off)))
122#define HOPBACKc(pos, off) ((char*) \
123 ((PL_reg_match_utf8) \
124 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
efb30f32
HS
125 : (pos - off >= PL_bostr) \
126 ? (U8*)(pos - off) \
52657f30 127 : (U8*)NULL) \
efb30f32 128)
efb30f32 129
1aa99e6b 130#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c 131#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 132#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 133
1a4fad37
AL
134#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
135 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
136#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
137#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
138#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
139#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 140
5f80c4cf 141/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
142#define JUMPABLE(rn) ( \
143 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
144 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
145 OP(rn) == PLUS || OP(rn) == MINMOD || \
146 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
147)
148
cca55fe3
JP
149#define HAS_TEXT(rn) ( \
150 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
151)
e2d8ce26 152
a84d97b6
HS
153/*
154 Search for mandatory following text node; for lookahead, the text must
155 follow but for lookbehind (rn->flags != 0) we skip to the next step.
156*/
cca55fe3 157#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 158 while (JUMPABLE(rn)) \
a84d97b6 159 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 160 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
161 else if (OP(rn) == PLUS) \
162 rn = NEXTOPER(rn); \
a84d97b6
HS
163 else if (OP(rn) == IFMATCH) \
164 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 165 else rn += NEXT_OFF(rn); \
5f80c4cf 166} STMT_END
74750237 167
acfe0abc 168static void restore_pos(pTHX_ void *arg);
51371543 169
76e3520e 170STATIC CHECKPOINT
cea2e8a9 171S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 172{
97aff369 173 dVAR;
a3b680e6 174 const int retval = PL_savestack_ix;
b1ce53c5 175#define REGCP_PAREN_ELEMS 4
a3b680e6 176 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
177 int p;
178
e49a9654
IH
179 if (paren_elems_to_push < 0)
180 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
181
a01268b5 182#define REGCP_OTHER_ELEMS 6
4b3c1a47 183 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 184 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 185/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
186 SSPUSHINT(PL_regendp[p]);
187 SSPUSHINT(PL_regstartp[p]);
3280af22 188 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
189 SSPUSHINT(p);
190 }
b1ce53c5 191/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
192 SSPUSHINT(PL_regsize);
193 SSPUSHINT(*PL_reglastparen);
a01268b5 194 SSPUSHINT(*PL_reglastcloseparen);
3280af22 195 SSPUSHPTR(PL_reginput);
41123dfd
JH
196#define REGCP_FRAME_ELEMS 2
197/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
198 * are needed for the regexp context stack bookkeeping. */
199 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 200 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 201
a0d0e21e
LW
202 return retval;
203}
204
c277df42 205/* These are needed since we do not localize EVAL nodes: */
a3621e74 206# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 207 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 208 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 209
a3621e74 210# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 211 PerlIO_printf(Perl_debug_log, \
faccc32b 212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 213 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 214
76e3520e 215STATIC char *
097eb12c 216S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 217{
97aff369 218 dVAR;
b1ce53c5 219 I32 i;
a0d0e21e 220 char *input;
b1ce53c5 221
a3621e74
YO
222 GET_RE_DEBUG_FLAGS_DECL;
223
b1ce53c5 224 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 225 i = SSPOPINT;
b1ce53c5
JH
226 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
227 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 228 input = (char *) SSPOPPTR;
a01268b5 229 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
230 *PL_reglastparen = SSPOPINT;
231 PL_regsize = SSPOPINT;
b1ce53c5
JH
232
233 /* Now restore the parentheses context. */
41123dfd
JH
234 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
235 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 236 I32 tmps;
097eb12c 237 U32 paren = (U32)SSPOPINT;
3280af22 238 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
239 PL_regstartp[paren] = SSPOPINT;
240 tmps = SSPOPINT;
3280af22
NIS
241 if (paren <= *PL_reglastparen)
242 PL_regendp[paren] = tmps;
a3621e74 243 DEBUG_EXECUTE_r(
c3464db5 244 PerlIO_printf(Perl_debug_log,
b900a521 245 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 246 (UV)paren, (IV)PL_regstartp[paren],
b900a521 247 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 248 (IV)PL_regendp[paren],
3280af22 249 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 250 );
a0d0e21e 251 }
a3621e74 252 DEBUG_EXECUTE_r(
4f639d21 253 if ((I32)(*PL_reglastparen + 1) <= rex->nparens) {
c3464db5 254 PerlIO_printf(Perl_debug_log,
faccc32b 255 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 256 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
257 }
258 );
daf18116 259#if 1
dafc8851
JH
260 /* It would seem that the similar code in regtry()
261 * already takes care of this, and in fact it is in
262 * a better location to since this code can #if 0-ed out
263 * but the code in regtry() is needed or otherwise tests
264 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
265 * (as of patchlevel 7877) will fail. Then again,
266 * this code seems to be necessary or otherwise
267 * building DynaLoader will fail:
268 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
269 * --jhi */
097eb12c
AL
270 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
271 if (i > PL_regsize)
272 PL_regstartp[i] = -1;
273 PL_regendp[i] = -1;
a0d0e21e 274 }
dafc8851 275#endif
a0d0e21e
LW
276 return input;
277}
278
02db2b7b 279#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 280
95b24440 281#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
282 if (paren) { \
283 if (n) { \
284 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
285 PL_regendp[paren] = input - PL_bostr; \
286 } \
287 else \
288 PL_regendp[paren] = -1; \
289 } \
95b24440
DM
290 REGMATCH(next, where); \
291 if (result) \
29d1e993
HS
292 sayYES; \
293 if (paren && n) \
294 PL_regendp[paren] = -1; \
295}
296
297
a687059c 298/*
e50aee73 299 * pregexec and friends
a687059c
LW
300 */
301
302/*
c277df42 303 - pregexec - match a regexp against a string
a687059c 304 */
c277df42 305I32
864dbfa3 306Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 307 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
308/* strend: pointer to null at end of string */
309/* strbeg: real beginning of string */
310/* minend: end of match must be >=minend after stringarg. */
311/* nosave: For optimizations. */
312{
313 return
9041c2e3 314 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
315 nosave ? 0 : REXEC_COPY_STR);
316}
0f5d15d6 317
22e551b9 318
9041c2e3 319/*
cad2e5aa
JH
320 * Need to implement the following flags for reg_anch:
321 *
322 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
323 * USE_INTUIT_ML
324 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
325 * INTUIT_AUTORITATIVE_ML
326 * INTUIT_ONCE_NOML - Intuit can match in one location only.
327 * INTUIT_ONCE_ML
328 *
329 * Another flag for this function: SECOND_TIME (so that float substrs
330 * with giant delta may be not rechecked).
331 */
332
333/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
334
3f7c398e 335/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
336 Otherwise, only SvCUR(sv) is used to get strbeg. */
337
338/* XXXX We assume that strpos is strbeg unless sv. */
339
6eb5f6b9
JH
340/* XXXX Some places assume that there is a fixed substring.
341 An update may be needed if optimizer marks as "INTUITable"
342 RExen without fixed substrings. Similarly, it is assumed that
343 lengths of all the strings are no more than minlen, thus they
344 cannot come from lookahead.
345 (Or minlen should take into account lookahead.) */
346
2c2d71f5
JH
347/* A failure to find a constant substring means that there is no need to make
348 an expensive call to REx engine, thus we celebrate a failure. Similarly,
349 finding a substring too deep into the string means that less calls to
30944b6d
IZ
350 regtry() should be needed.
351
352 REx compiler's optimizer found 4 possible hints:
353 a) Anchored substring;
354 b) Fixed substring;
355 c) Whether we are anchored (beginning-of-line or \G);
356 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 357 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
358 string which does not contradict any of them.
359 */
2c2d71f5 360
6eb5f6b9
JH
361/* Most of decisions we do here should have been done at compile time.
362 The nodes of the REx which we used for the search should have been
363 deleted from the finite automaton. */
364
cad2e5aa
JH
365char *
366Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
367 char *strend, U32 flags, re_scream_pos_data *data)
368{
97aff369 369 dVAR;
b7953727 370 register I32 start_shift = 0;
cad2e5aa 371 /* Should be nonnegative! */
b7953727 372 register I32 end_shift = 0;
2c2d71f5
JH
373 register char *s;
374 register SV *check;
a1933d95 375 char *strbeg;
cad2e5aa 376 char *t;
a3b680e6 377 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 378 I32 ml_anch;
bd61b366
SS
379 register char *other_last = NULL; /* other substr checked before this */
380 char *check_at = NULL; /* check substr found at this pos */
1df70142 381 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 382#ifdef DEBUGGING
890ce7af
AL
383 const char * const i_strpos = strpos;
384 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 385#endif
a3621e74
YO
386
387 GET_RE_DEBUG_FLAGS_DECL;
388
a30b2f1f 389 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 390
b8d68ded 391 if (prog->reganch & ROPT_UTF8) {
a3621e74 392 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
393 "UTF-8 regex...\n"));
394 PL_reg_flags |= RF_utf8;
395 }
396
a3621e74 397 DEBUG_EXECUTE_r({
1df70142 398 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
399 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
400 strpos;
1df70142 401 const int len = PL_reg_match_utf8 ?
b8d68ded 402 strlen(s) : strend - strpos;
2a782b5b
JH
403 if (!PL_colorset)
404 reginitcolors();
b8d68ded 405 if (PL_reg_match_utf8)
a3621e74 406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 407 "UTF-8 target...\n"));
2a782b5b 408 PerlIO_printf(Perl_debug_log,
a0288114 409 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 410 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
411 prog->precomp,
412 PL_colors[1],
413 (strlen(prog->precomp) > 60 ? "..." : ""),
414 PL_colors[0],
415 (int)(len > 60 ? 60 : len),
416 s, PL_colors[1],
417 (len > 60 ? "..." : "")
418 );
419 });
cad2e5aa 420
c344f387
JH
421 /* CHR_DIST() would be more correct here but it makes things slow. */
422 if (prog->minlen > strend - strpos) {
a3621e74 423 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 424 "String too short... [re_intuit_start]\n"));
cad2e5aa 425 goto fail;
2c2d71f5 426 }
a1933d95 427 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 428 PL_regeol = strend;
33b8afdf
JH
429 if (do_utf8) {
430 if (!prog->check_utf8 && prog->check_substr)
431 to_utf8_substr(prog);
432 check = prog->check_utf8;
433 } else {
434 if (!prog->check_substr && prog->check_utf8)
435 to_byte_substr(prog);
436 check = prog->check_substr;
437 }
438 if (check == &PL_sv_undef) {
a3621e74 439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
440 "Non-utf string cannot match utf check string\n"));
441 goto fail;
442 }
2c2d71f5 443 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
444 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
445 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 446 && !multiline ) ); /* Check after \n? */
cad2e5aa 447
7e25d62c
JH
448 if (!ml_anch) {
449 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
450 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 451 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
452 && sv && !SvROK(sv)
453 && (strpos != strbeg)) {
a3621e74 454 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
455 goto fail;
456 }
457 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 458 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 459 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
460 I32 slen;
461
1aa99e6b 462 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
463 if (SvTAIL(check)) {
464 slen = SvCUR(check); /* >= 1 */
cad2e5aa 465
9041c2e3 466 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 467 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 468 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 469 goto fail_finish;
cad2e5aa
JH
470 }
471 /* Now should match s[0..slen-2] */
472 slen--;
3f7c398e 473 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 474 || (slen > 1
3f7c398e 475 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 476 report_neq:
a3621e74 477 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
478 goto fail_finish;
479 }
cad2e5aa 480 }
3f7c398e 481 else if (*SvPVX_const(check) != *s
653099ff 482 || ((slen = SvCUR(check)) > 1
3f7c398e 483 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 484 goto report_neq;
c315bfe8 485 check_at = s;
2c2d71f5 486 goto success_at_start;
7e25d62c 487 }
cad2e5aa 488 }
2c2d71f5 489 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 490 s = strpos;
2c2d71f5 491 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 492 end_shift = prog->minlen - start_shift -
653099ff 493 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 494 if (!ml_anch) {
a3b680e6 495 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 496 - (SvTAIL(check) != 0);
a3b680e6 497 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
498
499 if (end_shift < eshift)
500 end_shift = eshift;
501 }
cad2e5aa 502 }
2c2d71f5 503 else { /* Can match at random position */
cad2e5aa
JH
504 ml_anch = 0;
505 s = strpos;
2c2d71f5
JH
506 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
507 /* Should be nonnegative! */
508 end_shift = prog->minlen - start_shift -
653099ff 509 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
510 }
511
2c2d71f5 512#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 513 if (end_shift < 0)
6bbae5e6 514 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
515#endif
516
2c2d71f5
JH
517 restart:
518 /* Find a possible match in the region s..strend by looking for
519 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 520 if (flags & REXEC_SCREAM) {
cad2e5aa 521 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 522 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 523
2c2d71f5
JH
524 if (PL_screamfirst[BmRARE(check)] >= 0
525 || ( BmRARE(check) == '\n'
526 && (BmPREVIOUS(check) == SvCUR(check) - 1)
527 && SvTAIL(check) ))
9041c2e3 528 s = screaminstr(sv, check,
2c2d71f5 529 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 530 else
2c2d71f5 531 goto fail_finish;
4addbd3b
HS
532 /* we may be pointing at the wrong string */
533 if (s && RX_MATCH_COPIED(prog))
3f7c398e 534 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
535 if (data)
536 *data->scream_olds = s;
537 }
f33976b4 538 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
539 s = fbm_instr((U8*)(s + start_shift),
540 (U8*)(strend - end_shift),
7fba1cd6 541 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 542 else
1aa99e6b
IH
543 s = fbm_instr(HOP3(s, start_shift, strend),
544 HOP3(strend, -end_shift, strbeg),
7fba1cd6 545 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
546
547 /* Update the count-of-usability, remove useless subpatterns,
548 unshift s. */
2c2d71f5 549
a0288114 550 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 551 (s ? "Found" : "Did not find"),
33b8afdf 552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 553 PL_colors[0],
7b0972df 554 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 555 SvPVX_const(check),
2c2d71f5
JH
556 PL_colors[1], (SvTAIL(check) ? "$" : ""),
557 (s ? " at offset " : "...\n") ) );
558
559 if (!s)
560 goto fail_finish;
561
6eb5f6b9
JH
562 check_at = s;
563
2c2d71f5 564 /* Finish the diagnostic message */
a3621e74 565 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
566
567 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
568 Start with the other substr.
569 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 570 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
571 *always* match. Probably should be marked during compile...
572 Probably it is right to do no SCREAM here...
573 */
574
33b8afdf 575 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 576 /* Take into account the "other" substring. */
2c2d71f5
JH
577 /* XXXX May be hopelessly wrong for UTF... */
578 if (!other_last)
6eb5f6b9 579 other_last = strpos;
33b8afdf 580 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
581 do_other_anchored:
582 {
890ce7af
AL
583 char * const last = HOP3c(s, -start_shift, strbeg);
584 char *last1, *last2;
2c2d71f5 585 char *s1 = s;
33b8afdf 586 SV* must;
2c2d71f5 587
2c2d71f5
JH
588 t = s - prog->check_offset_max;
589 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 590 && (!do_utf8
1aa99e6b 591 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 592 && t > strpos)))
30944b6d 593 /* EMPTY */;
2c2d71f5
JH
594 else
595 t = strpos;
1aa99e6b 596 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
597 if (t < other_last) /* These positions already checked */
598 t = other_last;
1aa99e6b 599 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
600 if (last < last1)
601 last1 = last;
602 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
603 /* On end-of-str: see comment below. */
33b8afdf
JH
604 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
605 if (must == &PL_sv_undef) {
606 s = (char*)NULL;
a3621e74 607 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
608 }
609 else
610 s = fbm_instr(
611 (unsigned char*)t,
612 HOP3(HOP3(last1, prog->anchored_offset, strend)
613 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
614 must,
7fba1cd6 615 multiline ? FBMrf_MULTILINE : 0
33b8afdf 616 );
a3621e74 617 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 618 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
619 (s ? "Found" : "Contradicts"),
620 PL_colors[0],
33b8afdf
JH
621 (int)(SvCUR(must)
622 - (SvTAIL(must)!=0)),
3f7c398e 623 SvPVX_const(must),
33b8afdf 624 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
625 if (!s) {
626 if (last1 >= last2) {
a3621e74 627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
628 ", giving up...\n"));
629 goto fail_finish;
630 }
a3621e74 631 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 632 ", trying floating at offset %ld...\n",
1aa99e6b
IH
633 (long)(HOP3c(s1, 1, strend) - i_strpos)));
634 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
635 s = HOP3c(last, 1, strend);
2c2d71f5
JH
636 goto restart;
637 }
638 else {
a3621e74 639 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 640 (long)(s - i_strpos)));
1aa99e6b
IH
641 t = HOP3c(s, -prog->anchored_offset, strbeg);
642 other_last = HOP3c(s, 1, strend);
30944b6d 643 s = s1;
2c2d71f5
JH
644 if (t == strpos)
645 goto try_at_start;
2c2d71f5
JH
646 goto try_at_offset;
647 }
30944b6d 648 }
2c2d71f5
JH
649 }
650 else { /* Take into account the floating substring. */
33b8afdf
JH
651 char *last, *last1;
652 char *s1 = s;
653 SV* must;
654
655 t = HOP3c(s, -start_shift, strbeg);
656 last1 = last =
657 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
658 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
659 last = HOP3c(t, prog->float_max_offset, strend);
660 s = HOP3c(t, prog->float_min_offset, strend);
661 if (s < other_last)
662 s = other_last;
2c2d71f5 663 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
664 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
665 /* fbm_instr() takes into account exact value of end-of-str
666 if the check is SvTAIL(ed). Since false positives are OK,
667 and end-of-str is not later than strend we are OK. */
668 if (must == &PL_sv_undef) {
669 s = (char*)NULL;
a3621e74 670 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
671 }
672 else
2c2d71f5 673 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
674 (unsigned char*)last + SvCUR(must)
675 - (SvTAIL(must)!=0),
7fba1cd6 676 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 677 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
678 (s ? "Found" : "Contradicts"),
679 PL_colors[0],
680 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 681 SvPVX_const(must),
33b8afdf
JH
682 PL_colors[1], (SvTAIL(must) ? "$" : "")));
683 if (!s) {
684 if (last1 == last) {
a3621e74 685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
686 ", giving up...\n"));
687 goto fail_finish;
2c2d71f5 688 }
a3621e74 689 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
690 ", trying anchored starting at offset %ld...\n",
691 (long)(s1 + 1 - i_strpos)));
692 other_last = last;
693 s = HOP3c(t, 1, strend);
694 goto restart;
695 }
696 else {
a3621e74 697 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
698 (long)(s - i_strpos)));
699 other_last = s; /* Fix this later. --Hugo */
700 s = s1;
701 if (t == strpos)
702 goto try_at_start;
703 goto try_at_offset;
704 }
2c2d71f5 705 }
cad2e5aa 706 }
2c2d71f5
JH
707
708 t = s - prog->check_offset_max;
2c2d71f5 709 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 710 && (!do_utf8
1aa99e6b
IH
711 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
712 && t > strpos))) {
2c2d71f5
JH
713 /* Fixed substring is found far enough so that the match
714 cannot start at strpos. */
715 try_at_offset:
cad2e5aa 716 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
717 /* Eventually fbm_*() should handle this, but often
718 anchored_offset is not 0, so this check will not be wasted. */
719 /* XXXX In the code below we prefer to look for "^" even in
720 presence of anchored substrings. And we search even
721 beyond the found float position. These pessimizations
722 are historical artefacts only. */
723 find_anchor:
2c2d71f5 724 while (t < strend - prog->minlen) {
cad2e5aa 725 if (*t == '\n') {
4ee3650e 726 if (t < check_at - prog->check_offset_min) {
33b8afdf 727 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
728 /* Since we moved from the found position,
729 we definitely contradict the found anchored
30944b6d
IZ
730 substr. Due to the above check we do not
731 contradict "check" substr.
732 Thus we can arrive here only if check substr
733 is float. Redo checking for "other"=="fixed".
734 */
9041c2e3 735 strpos = t + 1;
a3621e74 736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 737 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
738 goto do_other_anchored;
739 }
4ee3650e
GS
740 /* We don't contradict the found floating substring. */
741 /* XXXX Why not check for STCLASS? */
cad2e5aa 742 s = t + 1;
a3621e74 743 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 744 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
745 goto set_useful;
746 }
4ee3650e
GS
747 /* Position contradicts check-string */
748 /* XXXX probably better to look for check-string
749 than for "\n", so one should lower the limit for t? */
a3621e74 750 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 751 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 752 other_last = strpos = s = t + 1;
cad2e5aa
JH
753 goto restart;
754 }
755 t++;
756 }
a3621e74 757 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 758 PL_colors[0], PL_colors[1]));
2c2d71f5 759 goto fail_finish;
cad2e5aa 760 }
f5952150 761 else {
a3621e74 762 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 763 PL_colors[0], PL_colors[1]));
f5952150 764 }
cad2e5aa
JH
765 s = t;
766 set_useful:
33b8afdf 767 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
768 }
769 else {
f5952150 770 /* The found string does not prohibit matching at strpos,
2c2d71f5 771 - no optimization of calling REx engine can be performed,
f5952150
GS
772 unless it was an MBOL and we are not after MBOL,
773 or a future STCLASS check will fail this. */
2c2d71f5
JH
774 try_at_start:
775 /* Even in this situation we may use MBOL flag if strpos is offset
776 wrt the start of the string. */
05b4157f 777 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 778 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
779 /* May be due to an implicit anchor of m{.*foo} */
780 && !(prog->reganch & ROPT_IMPLICIT))
781 {
cad2e5aa
JH
782 t = strpos;
783 goto find_anchor;
784 }
a3621e74 785 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 786 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 787 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 788 );
2c2d71f5 789 success_at_start:
30944b6d 790 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
791 && (do_utf8 ? (
792 prog->check_utf8 /* Could be deleted already */
793 && --BmUSEFUL(prog->check_utf8) < 0
794 && (prog->check_utf8 == prog->float_utf8)
795 ) : (
796 prog->check_substr /* Could be deleted already */
797 && --BmUSEFUL(prog->check_substr) < 0
798 && (prog->check_substr == prog->float_substr)
799 )))
66e933ab 800 {
cad2e5aa 801 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 802 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
803 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
804 if (do_utf8 ? prog->check_substr : prog->check_utf8)
805 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
806 prog->check_substr = prog->check_utf8 = NULL; /* disable */
807 prog->float_substr = prog->float_utf8 = NULL; /* clear */
808 check = NULL; /* abort */
cad2e5aa 809 s = strpos;
3cf5c195
IZ
810 /* XXXX This is a remnant of the old implementation. It
811 looks wasteful, since now INTUIT can use many
6eb5f6b9 812 other heuristics. */
cad2e5aa
JH
813 prog->reganch &= ~RE_USE_INTUIT;
814 }
815 else
816 s = strpos;
817 }
818
6eb5f6b9
JH
819 /* Last resort... */
820 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
821 if (prog->regstclass) {
822 /* minlen == 0 is possible if regstclass is \b or \B,
823 and the fixed substr is ''$.
824 Since minlen is already taken into account, s+1 is before strend;
825 accidentally, minlen >= 1 guaranties no false positives at s + 1
826 even for \b or \B. But (minlen? 1 : 0) below assumes that
827 regstclass does not come from lookahead... */
828 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
829 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 830 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 831 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 832 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 833 : 1);
a3b680e6 834 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 835 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 836 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
837 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
838 cl_l, strend)
839 : strend);
6eb5f6b9
JH
840
841 t = s;
06b5626a 842 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
843 if (!s) {
844#ifdef DEBUGGING
cbbf8932 845 const char *what = NULL;
6eb5f6b9
JH
846#endif
847 if (endpos == strend) {
a3621e74 848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
849 "Could not match STCLASS...\n") );
850 goto fail;
851 }
a3621e74 852 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 853 "This position contradicts STCLASS...\n") );
653099ff
GS
854 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
855 goto fail;
6eb5f6b9 856 /* Contradict one of substrings */
33b8afdf
JH
857 if (prog->anchored_substr || prog->anchored_utf8) {
858 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 859 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 860 hop_and_restart:
1aa99e6b 861 s = HOP3c(t, 1, strend);
66e933ab
GS
862 if (s + start_shift + end_shift > strend) {
863 /* XXXX Should be taken into account earlier? */
a3621e74 864 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
865 "Could not match STCLASS...\n") );
866 goto fail;
867 }
5e39e1e5
HS
868 if (!check)
869 goto giveup;
a3621e74 870 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 871 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
872 what, (long)(s + start_shift - i_strpos)) );
873 goto restart;
874 }
66e933ab 875 /* Have both, check_string is floating */
6eb5f6b9
JH
876 if (t + start_shift >= check_at) /* Contradicts floating=check */
877 goto retry_floating_check;
878 /* Recheck anchored substring, but not floating... */
9041c2e3 879 s = check_at;
5e39e1e5
HS
880 if (!check)
881 goto giveup;
a3621e74 882 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 883 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
884 (long)(other_last - i_strpos)) );
885 goto do_other_anchored;
886 }
60e71179
GS
887 /* Another way we could have checked stclass at the
888 current position only: */
889 if (ml_anch) {
890 s = t = t + 1;
5e39e1e5
HS
891 if (!check)
892 goto giveup;
a3621e74 893 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 894 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 895 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 896 goto try_at_offset;
66e933ab 897 }
33b8afdf 898 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 899 goto fail;
6eb5f6b9
JH
900 /* Check is floating subtring. */
901 retry_floating_check:
902 t = check_at - start_shift;
a3621e74 903 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
904 goto hop_and_restart;
905 }
b7953727 906 if (t != s) {
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 908 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
909 (long)(t - i_strpos), (long)(s - i_strpos))
910 );
911 }
912 else {
a3621e74 913 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
914 "Does not contradict STCLASS...\n");
915 );
916 }
6eb5f6b9 917 }
5e39e1e5 918 giveup:
a3621e74 919 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
920 PL_colors[4], (check ? "Guessed" : "Giving up"),
921 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 922 return s;
2c2d71f5
JH
923
924 fail_finish: /* Substring not found */
33b8afdf
JH
925 if (prog->check_substr || prog->check_utf8) /* could be removed already */
926 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 927 fail:
a3621e74 928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 929 PL_colors[4], PL_colors[5]));
bd61b366 930 return NULL;
cad2e5aa 931}
9661b544 932
6eb5f6b9 933/* We know what class REx starts with. Try to find this position... */
3c3eec57 934STATIC char *
097eb12c 935S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, I32 norun)
a687059c 936{
27da23d5 937 dVAR;
1df70142 938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 939 char *m;
d8093b23 940 STRLEN ln;
5dab1207 941 STRLEN lnc;
078c425b 942 register STRLEN uskip;
d8093b23
G
943 unsigned int c1;
944 unsigned int c2;
6eb5f6b9
JH
945 char *e;
946 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 947 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 948
6eb5f6b9
JH
949 /* We know what class it must start with. */
950 switch (OP(c)) {
6eb5f6b9 951 case ANYOF:
388cc4de 952 if (do_utf8) {
078c425b 953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
388cc4de
HS
958 if (tmp && (norun || regtry(prog, s)))
959 goto got_it;
960 else
961 tmp = doevery;
962 }
963 else
964 tmp = 1;
078c425b 965 s += uskip;
388cc4de
HS
966 }
967 }
968 else {
969 while (s < strend) {
970 STRLEN skip = 1;
971
32fc9b6a 972 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
977 if (tmp && (norun || regtry(prog, s)))
978 goto got_it;
979 else
980 tmp = doevery;
981 }
982 else
983 tmp = 1;
984 s += skip;
985 }
a0d0e21e 986 }
6eb5f6b9 987 break;
f33976b4
DB
988 case CANY:
989 while (s < strend) {
990 if (tmp && (norun || regtry(prog, s)))
991 goto got_it;
992 else
993 tmp = doevery;
994 s++;
995 }
996 break;
6eb5f6b9 997 case EXACTF:
5dab1207
NIS
998 m = STRING(c);
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1001 if (UTF) {
a2a2844f 1002 STRLEN ulen1, ulen2;
5dab1207 1003 U8 *sm = (U8 *) m;
89ebb4a3
JH
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1007
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1010
89ebb4a3 1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1012 0, uniflags);
89ebb4a3 1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1014 0, uniflags);
5dab1207
NIS
1015 lnc = 0;
1016 while (sm < ((U8 *) m + ln)) {
1017 lnc++;
1018 sm += UTF8SKIP(sm);
1019 }
1aa99e6b
IH
1020 }
1021 else {
1022 c1 = *(U8*)m;
1023 c2 = PL_fold[c1];
1024 }
6eb5f6b9
JH
1025 goto do_exactf;
1026 case EXACTFL:
5dab1207
NIS
1027 m = STRING(c);
1028 ln = STR_LEN(c);
1029 lnc = (I32) ln;
d8093b23 1030 c1 = *(U8*)m;
6eb5f6b9
JH
1031 c2 = PL_fold_locale[c1];
1032 do_exactf:
db12adc6 1033 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1034
6eb5f6b9
JH
1035 if (norun && e < s)
1036 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1037
60a8b682
JH
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
7f16dd3d
JH
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
60a8b682 1048
1aa99e6b 1049 if (do_utf8) {
575cac57 1050 UV c, f;
89ebb4a3 1051 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1052 STRLEN len, foldlen;
4ad0818d 1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1054 if (c1 == c2) {
5dab1207
NIS
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1aa99e6b 1057 while (s <= e) {
89ebb4a3 1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1059 uniflags);
80aecb99
JH
1060 if ( c == c1
1061 && (ln == len ||
66423254 1062 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1063 m, (char **)0, ln, (bool)UTF))
55da9344 1064 && (norun || regtry(prog, s)) )
1aa99e6b 1065 goto got_it;
80aecb99 1066 else {
1df70142 1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1068 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1070 if ( f != c
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
66423254
JH
1073 !ibcmp_utf8((char *) foldbuf,
1074 (char **)0, foldlen, do_utf8,
d07ddd77 1075 m,
eb160463 1076 (char **)0, ln, (bool)UTF))
80aecb99
JH
1077 && (norun || regtry(prog, s)) )
1078 goto got_it;
1079 }
1aa99e6b
IH
1080 s += len;
1081 }
09091399
JH
1082 }
1083 else {
1aa99e6b 1084 while (s <= e) {
89ebb4a3 1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1086 uniflags);
80aecb99 1087
60a8b682 1088 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
880bd946
JH
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1098
1099 if ( (c == c1 || c == c2)
1100 && (ln == len ||
66423254 1101 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1102 m, (char **)0, ln, (bool)UTF))
55da9344 1103 && (norun || regtry(prog, s)) )
1aa99e6b 1104 goto got_it;
80aecb99 1105 else {
1df70142 1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1107 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1109 if ( f != c
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
a6872d42 1112 !ibcmp_utf8((char *) foldbuf,
66423254 1113 (char **)0, foldlen, do_utf8,
d07ddd77 1114 m,
eb160463 1115 (char **)0, ln, (bool)UTF))
80aecb99
JH
1116 && (norun || regtry(prog, s)) )
1117 goto got_it;
1118 }
1aa99e6b
IH
1119 s += len;
1120 }
09091399 1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 if (c1 == c2)
1125 while (s <= e) {
1126 if ( *(U8*)s == c1
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
1130 && (norun || regtry(prog, s)) )
1131 goto got_it;
1132 s++;
1133 }
1134 else
1135 while (s <= e) {
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1138 ? ibcmp(s, m, ln)
1139 : ibcmp_locale(s, m, ln)))
1140 && (norun || regtry(prog, s)) )
1141 goto got_it;
1142 s++;
1143 }
b3c9acc1
IZ
1144 }
1145 break;
bbce6d69 1146 case BOUNDL:
3280af22 1147 PL_reg_flags |= RF_tainted;
bbce6d69 1148 /* FALL THROUGH */
a0d0e21e 1149 case BOUND:
ffc61ed2 1150 if (do_utf8) {
12d33761 1151 if (s == PL_bostr)
ffc61ed2
JH
1152 tmp = '\n';
1153 else {
6136c704 1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1156 }
1157 tmp = ((OP(c) == BOUND ?
9041c2e3 1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1159 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1161 if (tmp == !(OP(c) == BOUND ?
3568d838 1162 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1163 isALNUM_LC_utf8((U8*)s)))
1164 {
1165 tmp = !tmp;
1166 if ((norun || regtry(prog, s)))
1167 goto got_it;
1168 }
078c425b 1169 s += uskip;
a687059c 1170 }
a0d0e21e 1171 }
667bb95a 1172 else {
12d33761 1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1176 if (tmp ==
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1178 tmp = !tmp;
1179 if ((norun || regtry(prog, s)))
1180 goto got_it;
1181 }
1182 s++;
a0ed51b3 1183 }
a0ed51b3 1184 }
6eb5f6b9 1185 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1186 goto got_it;
1187 break;
bbce6d69 1188 case NBOUNDL:
3280af22 1189 PL_reg_flags |= RF_tainted;
bbce6d69 1190 /* FALL THROUGH */
a0d0e21e 1191 case NBOUND:
ffc61ed2 1192 if (do_utf8) {
12d33761 1193 if (s == PL_bostr)
ffc61ed2
JH
1194 tmp = '\n';
1195 else {
6136c704 1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1198 }
1199 tmp = ((OP(c) == NBOUND ?
9041c2e3 1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1201 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1203 if (tmp == !(OP(c) == NBOUND ?
3568d838 1204 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1205 isALNUM_LC_utf8((U8*)s)))
1206 tmp = !tmp;
1207 else if ((norun || regtry(prog, s)))
1208 goto got_it;
078c425b 1209 s += uskip;
ffc61ed2 1210 }
a0d0e21e 1211 }
667bb95a 1212 else {
12d33761 1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1217 if (tmp ==
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1219 tmp = !tmp;
1220 else if ((norun || regtry(prog, s)))
1221 goto got_it;
1222 s++;
1223 }
a0ed51b3 1224 }
6eb5f6b9 1225 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1226 goto got_it;
1227 break;
a0d0e21e 1228 case ALNUM:
ffc61ed2 1229 if (do_utf8) {
1a4fad37 1230 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1233 if (tmp && (norun || regtry(prog, s)))
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
bbce6d69 1238 else
ffc61ed2 1239 tmp = 1;
078c425b 1240 s += uskip;
bbce6d69 1241 }
bbce6d69 1242 }
ffc61ed2
JH
1243 else {
1244 while (s < strend) {
1245 if (isALNUM(*s)) {
1246 if (tmp && (norun || regtry(prog, s)))
1247 goto got_it;
1248 else
1249 tmp = doevery;
1250 }
a0ed51b3 1251 else
ffc61ed2
JH
1252 tmp = 1;
1253 s++;
a0ed51b3 1254 }
a0ed51b3
LW
1255 }
1256 break;
bbce6d69 1257 case ALNUML:
3280af22 1258 PL_reg_flags |= RF_tainted;
ffc61ed2 1259 if (do_utf8) {
078c425b 1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1261 if (isALNUM_LC_utf8((U8*)s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a687059c 1267 else
ffc61ed2 1268 tmp = 1;
078c425b 1269 s += uskip;
a0d0e21e 1270 }
a0d0e21e 1271 }
ffc61ed2
JH
1272 else {
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
1275 if (tmp && (norun || regtry(prog, s)))
1276 goto got_it;
1277 else
1278 tmp = doevery;
1279 }
a0ed51b3 1280 else
ffc61ed2
JH
1281 tmp = 1;
1282 s++;
a0ed51b3 1283 }
a0ed51b3
LW
1284 }
1285 break;
a0d0e21e 1286 case NALNUM:
ffc61ed2 1287 if (do_utf8) {
1a4fad37 1288 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1291 if (tmp && (norun || regtry(prog, s)))
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
bbce6d69 1296 else
ffc61ed2 1297 tmp = 1;
078c425b 1298 s += uskip;
bbce6d69 1299 }
bbce6d69 1300 }
ffc61ed2
JH
1301 else {
1302 while (s < strend) {
1303 if (!isALNUM(*s)) {
1304 if (tmp && (norun || regtry(prog, s)))
1305 goto got_it;
1306 else
1307 tmp = doevery;
1308 }
a0ed51b3 1309 else
ffc61ed2
JH
1310 tmp = 1;
1311 s++;
a0ed51b3 1312 }
a0ed51b3
LW
1313 }
1314 break;
bbce6d69 1315 case NALNUML:
3280af22 1316 PL_reg_flags |= RF_tainted;
ffc61ed2 1317 if (do_utf8) {
078c425b 1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1319 if (!isALNUM_LC_utf8((U8*)s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a687059c 1325 else
ffc61ed2 1326 tmp = 1;
078c425b 1327 s += uskip;
a687059c 1328 }
a0d0e21e 1329 }
ffc61ed2
JH
1330 else {
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
1333 if (tmp && (norun || regtry(prog, s)))
1334 goto got_it;
1335 else
1336 tmp = doevery;
1337 }
a0ed51b3 1338 else
ffc61ed2
JH
1339 tmp = 1;
1340 s++;
a0ed51b3 1341 }
a0ed51b3
LW
1342 }
1343 break;
a0d0e21e 1344 case SPACE:
ffc61ed2 1345 if (do_utf8) {
1a4fad37 1346 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1349 if (tmp && (norun || regtry(prog, s)))
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0d0e21e 1354 else
ffc61ed2 1355 tmp = 1;
078c425b 1356 s += uskip;
2304df62 1357 }
a0d0e21e 1358 }
ffc61ed2
JH
1359 else {
1360 while (s < strend) {
1361 if (isSPACE(*s)) {
1362 if (tmp && (norun || regtry(prog, s)))
1363 goto got_it;
1364 else
1365 tmp = doevery;
1366 }
a0ed51b3 1367 else
ffc61ed2
JH
1368 tmp = 1;
1369 s++;
a0ed51b3 1370 }
a0ed51b3
LW
1371 }
1372 break;
bbce6d69 1373 case SPACEL:
3280af22 1374 PL_reg_flags |= RF_tainted;
ffc61ed2 1375 if (do_utf8) {
078c425b 1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1378 if (tmp && (norun || regtry(prog, s)))
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
bbce6d69 1383 else
ffc61ed2 1384 tmp = 1;
078c425b 1385 s += uskip;
bbce6d69 1386 }
bbce6d69 1387 }
ffc61ed2
JH
1388 else {
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
1391 if (tmp && (norun || regtry(prog, s)))
1392 goto got_it;
1393 else
1394 tmp = doevery;
1395 }
a0ed51b3 1396 else
ffc61ed2
JH
1397 tmp = 1;
1398 s++;
a0ed51b3 1399 }
a0ed51b3
LW
1400 }
1401 break;
a0d0e21e 1402 case NSPACE:
ffc61ed2 1403 if (do_utf8) {
1a4fad37 1404 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1407 if (tmp && (norun || regtry(prog, s)))
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0d0e21e 1412 else
ffc61ed2 1413 tmp = 1;
078c425b 1414 s += uskip;
a687059c 1415 }
a0d0e21e 1416 }
ffc61ed2
JH
1417 else {
1418 while (s < strend) {
1419 if (!isSPACE(*s)) {
1420 if (tmp && (norun || regtry(prog, s)))
1421 goto got_it;
1422 else
1423 tmp = doevery;
1424 }
a0ed51b3 1425 else
ffc61ed2
JH
1426 tmp = 1;
1427 s++;
a0ed51b3 1428 }
a0ed51b3
LW
1429 }
1430 break;
bbce6d69 1431 case NSPACEL:
3280af22 1432 PL_reg_flags |= RF_tainted;
ffc61ed2 1433 if (do_utf8) {
078c425b 1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1436 if (tmp && (norun || regtry(prog, s)))
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
bbce6d69 1441 else
ffc61ed2 1442 tmp = 1;
078c425b 1443 s += uskip;
bbce6d69 1444 }
bbce6d69 1445 }
ffc61ed2
JH
1446 else {
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
1449 if (tmp && (norun || regtry(prog, s)))
1450 goto got_it;
1451 else
1452 tmp = doevery;
1453 }
a0ed51b3 1454 else
ffc61ed2
JH
1455 tmp = 1;
1456 s++;
a0ed51b3 1457 }
a0ed51b3
LW
1458 }
1459 break;
a0d0e21e 1460 case DIGIT:
ffc61ed2 1461 if (do_utf8) {
1a4fad37 1462 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1465 if (tmp && (norun || regtry(prog, s)))
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0d0e21e 1470 else
ffc61ed2 1471 tmp = 1;
078c425b 1472 s += uskip;
2b69d0c2 1473 }
a0d0e21e 1474 }
ffc61ed2
JH
1475 else {
1476 while (s < strend) {
1477 if (isDIGIT(*s)) {
1478 if (tmp && (norun || regtry(prog, s)))
1479 goto got_it;
1480 else
1481 tmp = doevery;
1482 }
a0ed51b3 1483 else
ffc61ed2
JH
1484 tmp = 1;
1485 s++;
a0ed51b3 1486 }
a0ed51b3
LW
1487 }
1488 break;
b8c5462f
JH
1489 case DIGITL:
1490 PL_reg_flags |= RF_tainted;
ffc61ed2 1491 if (do_utf8) {
078c425b 1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1493 if (isDIGIT_LC_utf8((U8*)s)) {
1494 if (tmp && (norun || regtry(prog, s)))
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2 1500 tmp = 1;
078c425b 1501 s += uskip;
b8c5462f 1502 }
b8c5462f 1503 }
ffc61ed2
JH
1504 else {
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
1507 if (tmp && (norun || regtry(prog, s)))
1508 goto got_it;
1509 else
1510 tmp = doevery;
1511 }
b8c5462f 1512 else
ffc61ed2
JH
1513 tmp = 1;
1514 s++;
b8c5462f 1515 }
b8c5462f
JH
1516 }
1517 break;
a0d0e21e 1518 case NDIGIT:
ffc61ed2 1519 if (do_utf8) {
1a4fad37 1520 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1523 if (tmp && (norun || regtry(prog, s)))
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0d0e21e 1528 else
ffc61ed2 1529 tmp = 1;
078c425b 1530 s += uskip;
a687059c 1531 }
a0d0e21e 1532 }
ffc61ed2
JH
1533 else {
1534 while (s < strend) {
1535 if (!isDIGIT(*s)) {
1536 if (tmp && (norun || regtry(prog, s)))
1537 goto got_it;
1538 else
1539 tmp = doevery;
1540 }
a0ed51b3 1541 else
ffc61ed2
JH
1542 tmp = 1;
1543 s++;
a0ed51b3 1544 }
a0ed51b3
LW
1545 }
1546 break;
b8c5462f
JH
1547 case NDIGITL:
1548 PL_reg_flags |= RF_tainted;
ffc61ed2 1549 if (do_utf8) {
078c425b 1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1551 if (!isDIGIT_LC_utf8((U8*)s)) {
1552 if (tmp && (norun || regtry(prog, s)))
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
b8c5462f 1557 else
ffc61ed2 1558 tmp = 1;
078c425b 1559 s += uskip;
b8c5462f 1560 }
a0ed51b3 1561 }
ffc61ed2
JH
1562 else {
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
1565 if (tmp && (norun || regtry(prog, s)))
1566 goto got_it;
1567 else
1568 tmp = doevery;
1569 }
cf93c79d 1570 else
ffc61ed2
JH
1571 tmp = 1;
1572 s++;
b8c5462f 1573 }
b8c5462f
JH
1574 }
1575 break;
b3c9acc1 1576 default:
3c3eec57
GS
1577 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1578 break;
d6a28714 1579 }
6eb5f6b9
JH
1580 return 0;
1581 got_it:
1582 return s;
1583}
1584
1585/*
1586 - regexec_flags - match a regexp against a string
1587 */
1588I32
1589Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1590 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1591/* strend: pointer to null at end of string */
1592/* strbeg: real beginning of string */
1593/* minend: end of match must be >=minend after stringarg. */
1594/* data: May be used for some additional optimizations. */
1595/* nosave: For optimizations. */
1596{
97aff369 1597 dVAR;
6eb5f6b9
JH
1598 register char *s;
1599 register regnode *c;
1600 register char *startpos = stringarg;
6eb5f6b9
JH
1601 I32 minlen; /* must match at least this many chars */
1602 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1603 I32 end_shift = 0; /* Same for the end. */ /* CC */
1604 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1605 char *scream_olds = NULL;
6eb5f6b9 1606 SV* oreplsv = GvSV(PL_replgv);
1df70142 1607 const bool do_utf8 = DO_UTF8(sv);
2757e526 1608 I32 multiline;
2a782b5b 1609#ifdef DEBUGGING
2757e526
JH
1610 SV* dsv0;
1611 SV* dsv1;
2a782b5b 1612#endif
a3621e74
YO
1613
1614 GET_RE_DEBUG_FLAGS_DECL;
1615
9d4ba2ae 1616 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1617
1618 /* Be paranoid... */
1619 if (prog == NULL || startpos == NULL) {
1620 Perl_croak(aTHX_ "NULL regexp parameter");
1621 return 0;
1622 }
1623
2757e526
JH
1624 multiline = prog->reganch & PMf_MULTILINE;
1625
1626#ifdef DEBUGGING
1627 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1628 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1629#endif
1630
bac06658
JH
1631#ifdef DEBUGGING
1632 PL_regnarrate = DEBUG_r_TEST;
1633#endif
1634
1635 RX_MATCH_UTF8_set(prog, do_utf8);
1636
6eb5f6b9 1637 minlen = prog->minlen;
61a36c01 1638 if (strend - startpos < minlen) {
a3621e74 1639 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1640 "String too short [regexec_flags]...\n"));
1641 goto phooey;
1aa99e6b 1642 }
6eb5f6b9 1643
6eb5f6b9
JH
1644 /* Check validity of program. */
1645 if (UCHARAT(prog->program) != REG_MAGIC) {
1646 Perl_croak(aTHX_ "corrupted regexp program");
1647 }
1648
1649 PL_reg_flags = 0;
1650 PL_reg_eval_set = 0;
1651 PL_reg_maxiter = 0;
1652
1653 if (prog->reganch & ROPT_UTF8)
1654 PL_reg_flags |= RF_utf8;
1655
1656 /* Mark beginning of line for ^ and lookbehind. */
1657 PL_regbol = startpos;
1658 PL_bostr = strbeg;
1659 PL_reg_sv = sv;
1660
1661 /* Mark end of line for $ (and such) */
1662 PL_regeol = strend;
1663
1664 /* see how far we have to get to not match where we matched before */
1665 PL_regtill = startpos+minend;
1666
6eb5f6b9
JH
1667 /* If there is a "must appear" string, look for it. */
1668 s = startpos;
1669
1670 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1671 MAGIC *mg;
1672
1673 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1674 PL_reg_ganch = startpos;
1675 else if (sv && SvTYPE(sv) >= SVt_PVMG
1676 && SvMAGIC(sv)
14befaf4
DM
1677 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1678 && mg->mg_len >= 0) {
6eb5f6b9
JH
1679 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1680 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1681 if (s > PL_reg_ganch)
6eb5f6b9
JH
1682 goto phooey;
1683 s = PL_reg_ganch;
1684 }
1685 }
1686 else /* pos() not defined */
1687 PL_reg_ganch = strbeg;
1688 }
1689
a0714e2c 1690 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1691 re_scream_pos_data d;
1692
1693 d.scream_olds = &scream_olds;
1694 d.scream_pos = &scream_pos;
1695 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1696 if (!s) {
a3621e74 1697 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1698 goto phooey; /* not present */
3fa9c3d7 1699 }
6eb5f6b9
JH
1700 }
1701
a3621e74 1702 DEBUG_EXECUTE_r({
1df70142
AL
1703 const char * const s0 = UTF
1704 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1705 UNI_DISPLAY_REGEX)
1706 : prog->precomp;
1707 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1708 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1709 UNI_DISPLAY_REGEX) : startpos;
1df70142 1710 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1711 if (!PL_colorset)
1712 reginitcolors();
1713 PerlIO_printf(Perl_debug_log,
a0288114 1714 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1715 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1716 len0, len0, s0,
2a782b5b 1717 PL_colors[1],
9e55ce06 1718 len0 > 60 ? "..." : "",
2a782b5b 1719 PL_colors[0],
9e55ce06
JH
1720 (int)(len1 > 60 ? 60 : len1),
1721 s1, PL_colors[1],
1722 (len1 > 60 ? "..." : "")
2a782b5b
JH
1723 );
1724 });
6eb5f6b9
JH
1725
1726 /* Simplest case: anchored match need be tried only once. */
1727 /* [unless only anchor is BOL and multiline is set] */
1728 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1729 if (s == startpos && regtry(prog, startpos))
1730 goto got_it;
7fba1cd6 1731 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1732 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1733 {
1734 char *end;
1735
1736 if (minlen)
1737 dontbother = minlen - 1;
1aa99e6b 1738 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1739 /* for multiline we only have to try after newlines */
33b8afdf 1740 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1741 if (s == startpos)
1742 goto after_try;
1743 while (1) {
1744 if (regtry(prog, s))
1745 goto got_it;
1746 after_try:
1747 if (s >= end)
1748 goto phooey;
1749 if (prog->reganch & RE_USE_INTUIT) {
1750 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1751 if (!s)
1752 goto phooey;
1753 }
1754 else
1755 s++;
1756 }
1757 } else {
1758 if (s > startpos)
1759 s--;
1760 while (s < end) {
1761 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1762 if (regtry(prog, s))
1763 goto got_it;
1764 }
1765 }
1766 }
1767 }
1768 goto phooey;
1769 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1770 if (regtry(prog, PL_reg_ganch))
1771 goto got_it;
1772 goto phooey;
1773 }
1774
1775 /* Messy cases: unanchored match. */
33b8afdf 1776 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1777 /* we have /x+whatever/ */
1778 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1779 char ch;
bf93d4cc
GS
1780#ifdef DEBUGGING
1781 int did_match = 0;
1782#endif
33b8afdf
JH
1783 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1784 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1785 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1786
1aa99e6b 1787 if (do_utf8) {
6eb5f6b9
JH
1788 while (s < strend) {
1789 if (*s == ch) {
a3621e74 1790 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1791 if (regtry(prog, s)) goto got_it;
1792 s += UTF8SKIP(s);
1793 while (s < strend && *s == ch)
1794 s += UTF8SKIP(s);
1795 }
1796 s += UTF8SKIP(s);
1797 }
1798 }
1799 else {
1800 while (s < strend) {
1801 if (*s == ch) {
a3621e74 1802 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1803 if (regtry(prog, s)) goto got_it;
1804 s++;
1805 while (s < strend && *s == ch)
1806 s++;
1807 }
1808 s++;
1809 }
1810 }
a3621e74 1811 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1812 PerlIO_printf(Perl_debug_log,
b7953727
JH
1813 "Did not find anchored character...\n")
1814 );
6eb5f6b9 1815 }
a0714e2c
SS
1816 else if (prog->anchored_substr != NULL
1817 || prog->anchored_utf8 != NULL
1818 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1819 && prog->float_max_offset < strend - s)) {
1820 SV *must;
1821 I32 back_max;
1822 I32 back_min;
1823 char *last;
6eb5f6b9 1824 char *last1; /* Last position checked before */
bf93d4cc
GS
1825#ifdef DEBUGGING
1826 int did_match = 0;
1827#endif
33b8afdf
JH
1828 if (prog->anchored_substr || prog->anchored_utf8) {
1829 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1830 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1831 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1832 back_max = back_min = prog->anchored_offset;
1833 } else {
1834 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1835 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1836 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1837 back_max = prog->float_max_offset;
1838 back_min = prog->float_min_offset;
1839 }
1840 if (must == &PL_sv_undef)
1841 /* could not downgrade utf8 check substring, so must fail */
1842 goto phooey;
1843
1844 last = HOP3c(strend, /* Cannot start after this */
1845 -(I32)(CHR_SVLEN(must)
1846 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1847
1848 if (s > PL_bostr)
1849 last1 = HOPc(s, -1);
1850 else
1851 last1 = s - 1; /* bogus */
1852
a0288114 1853 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1854 check_substr==must. */
1855 scream_pos = -1;
1856 dontbother = end_shift;
1857 strend = HOPc(strend, -dontbother);
1858 while ( (s <= last) &&
9041c2e3 1859 ((flags & REXEC_SCREAM)
1aa99e6b 1860 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1861 end_shift, &scream_pos, 0))
1aa99e6b 1862 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1863 (unsigned char*)strend, must,
7fba1cd6 1864 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1865 /* we may be pointing at the wrong string */
1866 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1867 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1868 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1869 if (HOPc(s, -back_max) > last1) {
1870 last1 = HOPc(s, -back_min);
1871 s = HOPc(s, -back_max);
1872 }
1873 else {
52657f30 1874 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1875
1876 last1 = HOPc(s, -back_min);
52657f30 1877 s = t;
6eb5f6b9 1878 }
1aa99e6b 1879 if (do_utf8) {
6eb5f6b9
JH
1880 while (s <= last1) {
1881 if (regtry(prog, s))
1882 goto got_it;
1883 s += UTF8SKIP(s);
1884 }
1885 }
1886 else {
1887 while (s <= last1) {
1888 if (regtry(prog, s))
1889 goto got_it;
1890 s++;
1891 }
1892 }
1893 }
a3621e74 1894 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1895 PerlIO_printf(Perl_debug_log,
a0288114 1896 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1897 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1898 ? "anchored" : "floating"),
1899 PL_colors[0],
1900 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1901 SvPVX_const(must),
b7953727
JH
1902 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1903 );
6eb5f6b9
JH
1904 goto phooey;
1905 }
155aba94 1906 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1907 if (minlen) {
1908 I32 op = (U8)OP(prog->regstclass);
66e933ab 1909 /* don't bother with what can't match */
f14c76ed
RGS
1910 if (PL_regkind[op] != EXACT && op != CANY)
1911 strend = HOPc(strend, -(minlen - 1));
1912 }
a3621e74 1913 DEBUG_EXECUTE_r({
ffc61ed2 1914 SV *prop = sv_newmortal();
cfd0369c
NC
1915 const char *s0;
1916 const char *s1;
9e55ce06
JH
1917 int len0;
1918 int len1;
1919
32fc9b6a 1920 regprop(prog, prop, c);
9e55ce06 1921 s0 = UTF ?
3f7c398e 1922 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1923 UNI_DISPLAY_REGEX) :
cfd0369c 1924 SvPVX_const(prop);
9e55ce06
JH
1925 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1926 s1 = UTF ?
c728cb41 1927 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1928 len1 = UTF ? SvCUR(dsv1) : strend - s;
1929 PerlIO_printf(Perl_debug_log,
a0288114 1930 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1931 len0, len0, s0,
1932 len1, len1, s1);
ffc61ed2 1933 });
06b5626a 1934 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1935 goto got_it;
a3621e74 1936 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1937 }
1938 else {
1939 dontbother = 0;
a0714e2c 1940 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1941 /* Trim the end. */
d6a28714 1942 char *last;
33b8afdf
JH
1943 SV* float_real;
1944
1945 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1946 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1947 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1948
1949 if (flags & REXEC_SCREAM) {
33b8afdf 1950 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1951 end_shift, &scream_pos, 1); /* last one */
1952 if (!last)
ffc61ed2 1953 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1954 /* we may be pointing at the wrong string */
1955 else if (RX_MATCH_COPIED(prog))
3f7c398e 1956 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1957 }
d6a28714
JH
1958 else {
1959 STRLEN len;
cfd0369c 1960 const char * const little = SvPV_const(float_real, len);
d6a28714 1961
33b8afdf 1962 if (SvTAIL(float_real)) {
d6a28714
JH
1963 if (memEQ(strend - len + 1, little, len - 1))
1964 last = strend - len + 1;
7fba1cd6 1965 else if (!multiline)
9041c2e3 1966 last = memEQ(strend - len, little, len)
bd61b366 1967 ? strend - len : NULL;
b8c5462f 1968 else
d6a28714
JH
1969 goto find_last;
1970 } else {
1971 find_last:
9041c2e3 1972 if (len)
d6a28714 1973 last = rninstr(s, strend, little, little + len);
b8c5462f 1974 else
a0288114 1975 last = strend; /* matching "$" */
b8c5462f 1976 }
b8c5462f 1977 }
bf93d4cc 1978 if (last == NULL) {
a3621e74 1979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1980 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1981 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1982 goto phooey; /* Should not happen! */
1983 }
d6a28714
JH
1984 dontbother = strend - last + prog->float_min_offset;
1985 }
1986 if (minlen && (dontbother < minlen))
1987 dontbother = minlen - 1;
1988 strend -= dontbother; /* this one's always in bytes! */
1989 /* We don't know much -- general case. */
1aa99e6b 1990 if (do_utf8) {
d6a28714
JH
1991 for (;;) {
1992 if (regtry(prog, s))
1993 goto got_it;
1994 if (s >= strend)
1995 break;
b8c5462f 1996 s += UTF8SKIP(s);
d6a28714
JH
1997 };
1998 }
1999 else {
2000 do {
2001 if (regtry(prog, s))
2002 goto got_it;
2003 } while (s++ < strend);
2004 }
2005 }
2006
2007 /* Failure. */
2008 goto phooey;
2009
2010got_it:
2011 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2012
2013 if (PL_reg_eval_set) {
2014 /* Preserve the current value of $^R */
2015 if (oreplsv != GvSV(PL_replgv))
2016 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2017 restored, the value remains
2018 the same. */
4f639d21 2019 restore_pos(aTHX_ prog);
d6a28714
JH
2020 }
2021
2022 /* make sure $`, $&, $', and $digit will work later */
2023 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2024 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2025 if (flags & REXEC_COPY_STR) {
2026 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2027#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2028 if ((SvIsCOW(sv)
2029 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2030 if (DEBUG_C_TEST) {
2031 PerlIO_printf(Perl_debug_log,
2032 "Copy on write: regexp capture, type %d\n",
2033 (int) SvTYPE(sv));
2034 }
2035 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2036 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2037 assert (SvPOKp(prog->saved_copy));
2038 } else
2039#endif
2040 {
2041 RX_MATCH_COPIED_on(prog);
2042 s = savepvn(strbeg, i);
2043 prog->subbeg = s;
2044 }
d6a28714 2045 prog->sublen = i;
d6a28714
JH
2046 }
2047 else {
2048 prog->subbeg = strbeg;
2049 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2050 }
2051 }
9041c2e3 2052
d6a28714
JH
2053 return 1;
2054
2055phooey:
a3621e74 2056 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2057 PL_colors[4], PL_colors[5]));
d6a28714 2058 if (PL_reg_eval_set)
4f639d21 2059 restore_pos(aTHX_ prog);
d6a28714
JH
2060 return 0;
2061}
2062
2063/*
2064 - regtry - try match at specific point
2065 */
2066STATIC I32 /* 0 failure, 1 success */
2067S_regtry(pTHX_ regexp *prog, char *startpos)
2068{
97aff369 2069 dVAR;
d6a28714
JH
2070 register I32 *sp;
2071 register I32 *ep;
2072 CHECKPOINT lastcp;
a3621e74 2073 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2074
02db2b7b
IZ
2075#ifdef DEBUGGING
2076 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2077#endif
d6a28714
JH
2078 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2079 MAGIC *mg;
2080
2081 PL_reg_eval_set = RS_init;
a3621e74 2082 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2083 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2084 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2085 ));
e8347627 2086 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2087 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2088 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2089 SAVETMPS;
2090 /* Apparently this is not needed, judging by wantarray. */
e8347627 2091 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2092 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2093
2094 if (PL_reg_sv) {
2095 /* Make $_ available to executed code. */
2096 if (PL_reg_sv != DEFSV) {
59f00321 2097 SAVE_DEFSV;
d6a28714 2098 DEFSV = PL_reg_sv;
b8c5462f 2099 }
d6a28714 2100
9041c2e3 2101 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2102 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2103 /* prepare for quick setting of pos */
d300d9fa
NC
2104#ifdef PERL_OLD_COPY_ON_WRITE
2105 if (SvIsCOW(sv))
2106 sv_force_normal_flags(sv, 0);
2107#endif
2108 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
2109 &PL_vtbl_mglob, NULL, 0);
d6a28714 2110 mg->mg_len = -1;
b8c5462f 2111 }
d6a28714
JH
2112 PL_reg_magic = mg;
2113 PL_reg_oldpos = mg->mg_len;
4f639d21 2114 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2115 }
09687e5a 2116 if (!PL_reg_curpm) {
a02a5408 2117 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2118#ifdef USE_ITHREADS
2119 {
2120 SV* repointer = newSViv(0);
577e12cc 2121 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2122 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2123 av_push(PL_regex_padav,repointer);
2124 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2125 PL_regex_pad = AvARRAY(PL_regex_padav);
2126 }
2127#endif
2128 }
aaa362c4 2129 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2130 PL_reg_oldcurpm = PL_curpm;
2131 PL_curpm = PL_reg_curpm;
2132 if (RX_MATCH_COPIED(prog)) {
2133 /* Here is a serious problem: we cannot rewrite subbeg,
2134 since it may be needed if this match fails. Thus
2135 $` inside (?{}) could fail... */
2136 PL_reg_oldsaved = prog->subbeg;
2137 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2138#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2139 PL_nrs = prog->saved_copy;
2140#endif
d6a28714
JH
2141 RX_MATCH_COPIED_off(prog);
2142 }
2143 else
bd61b366 2144 PL_reg_oldsaved = NULL;
d6a28714
JH
2145 prog->subbeg = PL_bostr;
2146 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2147 }
973dddac 2148 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2149 PL_reginput = startpos;
2150 PL_regstartp = prog->startp;
2151 PL_regendp = prog->endp;
2152 PL_reglastparen = &prog->lastparen;
a01268b5 2153 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2154 prog->lastparen = 0;
03994de8 2155 prog->lastcloseparen = 0;
d6a28714 2156 PL_regsize = 0;
a3621e74 2157 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2158 if (PL_reg_start_tmpl <= prog->nparens) {
2159 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2160 if(PL_reg_start_tmp)
2161 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2162 else
a02a5408 2163 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2164 }
2165
2166 /* XXXX What this code is doing here?!!! There should be no need
2167 to do this again and again, PL_reglastparen should take care of
3dd2943c 2168 this! --ilya*/
dafc8851
JH
2169
2170 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2171 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2172 * PL_reglastparen), is not needed at all by the test suite
2173 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2174 * enough, for building DynaLoader, or otherwise this
2175 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2176 * will happen. Meanwhile, this code *is* needed for the
2177 * above-mentioned test suite tests to succeed. The common theme
2178 * on those tests seems to be returning null fields from matches.
2179 * --jhi */
dafc8851 2180#if 1
d6a28714
JH
2181 sp = prog->startp;
2182 ep = prog->endp;
2183 if (prog->nparens) {
097eb12c 2184 register I32 i;
eb160463 2185 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2186 *++sp = -1;
2187 *++ep = -1;
2188 }
2189 }
dafc8851 2190#endif
02db2b7b 2191 REGCP_SET(lastcp);
4f639d21 2192 if (regmatch(prog, prog->program + 1)) {
d6a28714
JH
2193 prog->endp[0] = PL_reginput - PL_bostr;
2194 return 1;
2195 }
02db2b7b 2196 REGCP_UNWIND(lastcp);
d6a28714
JH
2197 return 0;
2198}
2199
02db2b7b
IZ
2200#define RE_UNWIND_BRANCH 1
2201#define RE_UNWIND_BRANCHJ 2
2202
2203union re_unwind_t;
2204
2205typedef struct { /* XX: makes sense to enlarge it... */
2206 I32 type;
2207 I32 prev;
2208 CHECKPOINT lastcp;
2209} re_unwind_generic_t;
2210
2211typedef struct {
2212 I32 type;
2213 I32 prev;
2214 CHECKPOINT lastcp;
2215 I32 lastparen;
2216 regnode *next;
2217 char *locinput;
2218 I32 nextchr;
3a2830be 2219 int minmod;
02db2b7b
IZ
2220#ifdef DEBUGGING
2221 int regindent;
2222#endif
2223} re_unwind_branch_t;
2224
2225typedef union re_unwind_t {
2226 I32 type;
2227 re_unwind_generic_t generic;
2228 re_unwind_branch_t branch;
2229} re_unwind_t;
2230
8ba1375e
MJD
2231#define sayYES goto yes
2232#define sayNO goto no
e0f9d4a8 2233#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2234#define sayYES_FINAL goto yes_final
2235#define sayYES_LOUD goto yes_loud
2236#define sayNO_FINAL goto no_final
2237#define sayNO_SILENT goto do_no
2238#define saySAME(x) if (x) goto yes; else goto no
2239
3ab3c9b4
HS
2240#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2241#define POSCACHE_SEEN 1 /* we know what we're caching */
2242#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2243#define CACHEsayYES STMT_START { \
d8319b27 2244 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
3ab3c9b4
HS
2245 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2246 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2247 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2248 /* cache records failure, but this is success */ \
2249 DEBUG_r( \
2250 PerlIO_printf(Perl_debug_log, \
2251 "%*s (remove success from failure cache)\n", \
2252 REPORT_CODE_OFF+PL_regindent*2, "") \
2253 ); \
d8319b27 2254 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2255 } \
2256 } \
2257 sayYES; \
2258} STMT_END
2259#define CACHEsayNO STMT_START { \
d8319b27 2260 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
3ab3c9b4
HS
2261 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2262 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2263 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2264 /* cache records success, but this is failure */ \
2265 DEBUG_r( \
2266 PerlIO_printf(Perl_debug_log, \
2267 "%*s (remove failure from success cache)\n", \
2268 REPORT_CODE_OFF+PL_regindent*2, "") \
2269 ); \
d8319b27 2270 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2271 } \
2272 } \
2273 sayNO; \
2274} STMT_END
2275
a3621e74
YO
2276/* this is used to determine how far from the left messages like
2277 'failed...' are printed. Currently 29 makes these messages line
2278 up with the opcode they refer to. Earlier perls used 25 which
2279 left these messages outdented making reviewing a debug output
2280 quite difficult.
2281*/
2282#define REPORT_CODE_OFF 29
2283
2284
2285/* Make sure there is a test for this +1 options in re_tests */
2286#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2287
5d9a96ca
DM
2288/* grab a new slab and return the first slot in it */
2289
2290STATIC regmatch_state *
2291S_push_slab(pTHX)
2292{
2293 regmatch_slab *s = PL_regmatch_slab->next;
2294 if (!s) {
2295 Newx(s, 1, regmatch_slab);
2296 s->prev = PL_regmatch_slab;
2297 s->next = NULL;
2298 PL_regmatch_slab->next = s;
2299 }
2300 PL_regmatch_slab = s;
2301 return &s->states[0];
2302}
5b47454d 2303
95b24440
DM
2304/* simulate a recursive call to regmatch */
2305
2306#define REGMATCH(ns, where) \
5d9a96ca
DM
2307 st->scan = scan; \
2308 scan = (ns); \
2309 st->resume_state = resume_##where; \
95b24440
DM
2310 goto start_recurse; \
2311 resume_point_##where:
2312
aa283a38
DM
2313
2314/* push a new regex state. Set newst to point to it */
2315
2316#define PUSH_STATE(newst, resume) \
2317 depth++; \
2318 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2319 st->scan = scan; \
2320 st->next = next; \
2321 st->n = n; \
2322 st->locinput = locinput; \
2323 st->resume_state = resume; \
2324 newst = st+1; \
2325 if (newst > &(PL_regmatch_slab->states[PERL_REGMATCH_SLAB_SLOTS-1])) \
2326 newst = S_push_slab(aTHX); \
2327 PL_regmatch_state = newst; \
2328 newst->cc = 0; \
2329 newst->minmod = 0; \
2330 newst->sw = 0; \
2331 newst->logical = 0; \
2332 newst->unwind = 0; \
2333 locinput = PL_reginput; \
2334 nextchr = UCHARAT(locinput);
2335
2336#define POP_STATE \
2337 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2338 depth--; \
2339 st--; \
2340 if (st < &PL_regmatch_slab->states[0]) { \
2341 PL_regmatch_slab = PL_regmatch_slab->prev; \
2342 st = &PL_regmatch_slab->states[PERL_REGMATCH_SLAB_SLOTS-1]; \
2343 } \
2344 PL_regmatch_state = st; \
2345 scan = st->scan; \
2346 next = st->next; \
2347 n = st->n; \
2348 locinput = st->locinput; \
2349 nextchr = UCHARAT(locinput);
2350
d6a28714
JH
2351/*
2352 - regmatch - main matching routine
2353 *
2354 * Conceptually the strategy is simple: check to see whether the current
2355 * node matches, call self recursively to see whether the rest matches,
2356 * and then act accordingly. In practice we make some effort to avoid
2357 * recursion, in particular by going through "ordinary" nodes (that don't
2358 * need to know whether the rest of the match failed) by a loop instead of
2359 * by recursion.
2360 */
2361/* [lwall] I've hoisted the register declarations to the outer block in order to
2362 * maybe save a little bit of pushing and popping on the stack. It also takes
2363 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2364 *
2365 * This function used to be heavily recursive, but since this had the
2366 * effect of blowing the CPU stack on complex regexes, it has been
2367 * restructured to be iterative, and to save state onto the heap rather
2368 * than the stack. Essentially whereever regmatch() used to be called, it
2369 * pushes the current state, notes where to return, then jumps back into
2370 * the main loop.
2371 *
2372 * Originally the structure of this function used to look something like
2373
2374 S_regmatch() {
2375 int a = 1, b = 2;
2376 ...
2377 while (scan != NULL) {
5d9a96ca 2378 a++; // do stuff with a and b
95b24440
DM
2379 ...
2380 switch (OP(scan)) {
2381 case FOO: {
2382 int local = 3;
2383 ...
2384 if (regmatch(...)) // recurse
2385 goto yes;
2386 }
2387 ...
2388 }
2389 }
2390 yes:
2391 return 1;
2392 }
2393
2394 * Now it looks something like this:
2395
5d9a96ca 2396 typedef struct {
95b24440
DM
2397 int a, b, local;
2398 int resume_state;
5d9a96ca 2399 } regmatch_state;
95b24440
DM
2400
2401 S_regmatch() {
5d9a96ca
DM
2402 regmatch_state *st = new();
2403 int depth=0;
2404 st->a++; // do stuff with a and b
95b24440
DM
2405 ...
2406 while (scan != NULL) {
2407 ...
2408 switch (OP(scan)) {
2409 case FOO: {
5d9a96ca 2410 st->local = 3;
95b24440 2411 ...
5d9a96ca
DM
2412 st->scan = scan;
2413 scan = ...;
2414 st->resume_state = resume_FOO;
2415 goto start_recurse; // recurse
95b24440 2416
5d9a96ca
DM
2417 resume_point_FOO:
2418 if (result)
95b24440
DM
2419 goto yes;
2420 }
2421 ...
2422 }
5d9a96ca
DM
2423 start_recurse:
2424 st = new(); push a new state
2425 st->a = 1; st->b = 2;
2426 depth++;
95b24440 2427 }
5d9a96ca 2428 yes:
95b24440 2429 result = 1;
5d9a96ca
DM
2430 if (depth--) {
2431 st = pop();
95b24440
DM
2432 switch (resume_state) {
2433 case resume_FOO:
2434 goto resume_point_FOO;
2435 ...
2436 }
2437 }
2438 return result
2439 }
2440
2441 * WARNING: this means that any line in this function that contains a
2442 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2443 * regmatch() using gotos instead. Thus the values of any local variables
2444 * not saved in the regmatch_state structure will have been lost when
2445 * execution resumes on the next line .
5d9a96ca
DM
2446 *
2447 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2448 * PL_regmatch_state always points to the currently active state, and
2449 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2450 * The first time regmatch is called, the first slab is allocated, and is
2451 * never freed until interpreter desctruction. When the slab is full,
2452 * a new one is allocated chained to the end. At exit from regmatch, slabs
2453 * allocated since entry are freed.
d6a28714 2454 */
95b24440
DM
2455
2456
d6a28714 2457STATIC I32 /* 0 failure, 1 success */
4f639d21 2458S_regmatch(pTHX_ regexp *rex, regnode *prog)
d6a28714 2459{
27da23d5 2460 dVAR;
95b24440 2461 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2462 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2463
5d9a96ca
DM
2464 regmatch_slab *orig_slab;
2465 regmatch_state *orig_state;
a3621e74 2466
5d9a96ca
DM
2467 /* the current state. This is a cached copy of PL_regmatch_state */
2468 register regmatch_state *st;
95b24440 2469
5d9a96ca
DM
2470 /* cache heavy used fields of st in registers */
2471 register regnode *scan;
2472 register regnode *next;
2473 register I32 n = 0; /* initialize to shut up compiler warning */
2474 register char *locinput = PL_reginput;
95b24440 2475
5d9a96ca
DM
2476 /* these variables are NOT saved during a recusive RFEGMATCH: */
2477 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2478 bool result; /* return value of S_regmatch */
2479 regnode *inner; /* Next node in internal branch. */
2480 int depth = 0; /* depth of recursion */
aa283a38
DM
2481 regmatch_state *newst; /* when pushing a state, this is the new one */
2482 regmatch_state *cur_eval = NULL; /* most recent (??{}) state */
95b24440
DM
2483
2484#ifdef DEBUGGING
ab74612d 2485 SV *re_debug_flags = NULL;
a3621e74 2486 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2487 PL_regindent++;
2488#endif
2489
5d9a96ca
DM
2490 /* on first ever call to regmatch, allocate first slab */
2491 if (!PL_regmatch_slab) {
2492 Newx(PL_regmatch_slab, 1, regmatch_slab);
2493 PL_regmatch_slab->prev = NULL;
2494 PL_regmatch_slab->next = NULL;
2495 PL_regmatch_state = &PL_regmatch_slab->states[0] - 1;
2496 }
2497
2498 /* remember current high-water mark for exit */
2499 /* XXX this should be done with SAVE* instead */
2500 orig_slab = PL_regmatch_slab;
2501 orig_state = PL_regmatch_state;
2502
2503 /* grab next free state slot */
2504 st = ++PL_regmatch_state;
2505 if (st > &(PL_regmatch_slab->states[PERL_REGMATCH_SLAB_SLOTS-1]))
2506 st = PL_regmatch_state = S_push_slab(aTHX);
2507
2508 st->minmod = 0;
2509 st->sw = 0;
2510 st->logical = 0;
2511 st->unwind = 0;
2512 st->cc = NULL;
d6a28714
JH
2513 /* Note that nextchr is a byte even in UTF */
2514 nextchr = UCHARAT(locinput);
2515 scan = prog;
2516 while (scan != NULL) {
8ba1375e 2517
a3621e74 2518 DEBUG_EXECUTE_r( {
6136c704 2519 SV * const prop = sv_newmortal();
1df70142
AL
2520 const int docolor = *PL_colors[0];
2521 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2522 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2523 /* The part of the string before starttry has one color
2524 (pref0_len chars), between starttry and current
2525 position another one (pref_len - pref0_len chars),
2526 after the current position the third one.
2527 We assume that pref0_len <= pref_len, otherwise we
2528 decrease pref0_len. */
9041c2e3 2529 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2530 ? (5 + taill) - l : locinput - PL_bostr;
2531 int pref0_len;
d6a28714 2532
df1ffd02 2533 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2534 pref_len++;
2535 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2536 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2537 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2538 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2539 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2540 l--;
d6a28714
JH
2541 if (pref0_len < 0)
2542 pref0_len = 0;
2543 if (pref0_len > pref_len)
2544 pref0_len = pref_len;
32fc9b6a 2545 regprop(rex, prop, scan);
2a782b5b 2546 {
1df70142 2547 const char * const s0 =
f14c76ed 2548 do_utf8 && OP(scan) != CANY ?
95b24440 2549 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2550 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2551 locinput - pref_len;
1df70142
AL
2552 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2553 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2554 pv_uni_display(PERL_DEBUG_PAD(1),
2555 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2556 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2557 locinput - pref_len + pref0_len;
1df70142
AL
2558 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2559 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2560 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2561 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2562 locinput;
1df70142 2563 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2564 PerlIO_printf(Perl_debug_log,
2565 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2566 (IV)(locinput - PL_bostr),
2567 PL_colors[4],
2568 len0, s0,
2569 PL_colors[5],
2570 PL_colors[2],
2571 len1, s1,
2572 PL_colors[3],
2573 (docolor ? "" : "> <"),
2574 PL_colors[0],
2575 len2, s2,
2576 PL_colors[1],
2577 15 - l - pref_len + 1,
2578 "",
4f639d21 2579 (IV)(scan - rex->program), PL_regindent*2, "",
3f7c398e 2580 SvPVX_const(prop));
2a782b5b
JH
2581 }
2582 });
d6a28714
JH
2583
2584 next = scan + NEXT_OFF(scan);
2585 if (next == scan)
2586 next = NULL;
2587
2588 switch (OP(scan)) {
2589 case BOL:
7fba1cd6 2590 if (locinput == PL_bostr)
d6a28714
JH
2591 {
2592 /* regtill = regbol; */
b8c5462f
JH
2593 break;
2594 }
d6a28714
JH
2595 sayNO;
2596 case MBOL:
12d33761
HS
2597 if (locinput == PL_bostr ||
2598 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2599 {
b8c5462f
JH
2600 break;
2601 }
d6a28714
JH
2602 sayNO;
2603 case SBOL:
c2a73568 2604 if (locinput == PL_bostr)
b8c5462f 2605 break;
d6a28714
JH
2606 sayNO;
2607 case GPOS:
2608 if (locinput == PL_reg_ganch)
2609 break;
2610 sayNO;
2611 case EOL:
d6a28714
JH
2612 goto seol;
2613 case MEOL:
d6a28714 2614 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2615 sayNO;
b8c5462f 2616 break;
d6a28714
JH
2617 case SEOL:
2618 seol:
2619 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2620 sayNO;
d6a28714 2621 if (PL_regeol - locinput > 1)
b8c5462f 2622 sayNO;
b8c5462f 2623 break;
d6a28714
JH
2624 case EOS:
2625 if (PL_regeol != locinput)
b8c5462f 2626 sayNO;
d6a28714 2627 break;
ffc61ed2 2628 case SANY:
d6a28714 2629 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2630 sayNO;
f33976b4
DB
2631 if (do_utf8) {
2632 locinput += PL_utf8skip[nextchr];
2633 if (locinput > PL_regeol)
2634 sayNO;
2635 nextchr = UCHARAT(locinput);
2636 }
2637 else
2638 nextchr = UCHARAT(++locinput);
2639 break;
2640 case CANY:
2641 if (!nextchr && locinput >= PL_regeol)
2642 sayNO;
b8c5462f 2643 nextchr = UCHARAT(++locinput);
a0d0e21e 2644 break;
ffc61ed2 2645 case REG_ANY:
1aa99e6b
IH
2646 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2647 sayNO;
2648 if (do_utf8) {
b8c5462f 2649 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2650 if (locinput > PL_regeol)
2651 sayNO;
a0ed51b3 2652 nextchr = UCHARAT(locinput);
a0ed51b3 2653 }
1aa99e6b
IH
2654 else
2655 nextchr = UCHARAT(++locinput);
a0ed51b3 2656 break;
a3621e74
YO
2657
2658
2659
2660 /*
2661 traverse the TRIE keeping track of all accepting states
2662 we transition through until we get to a failing node.
2663
a3621e74
YO
2664
2665 */
5b47454d 2666 case TRIE:
a3621e74
YO
2667 case TRIEF:
2668 case TRIEFL:
2669 {
a3621e74
YO
2670 U8 *uc = ( U8* )locinput;
2671 U32 state = 1;
2672 U16 charid = 0;
2673 U32 base = 0;
2674 UV uvc = 0;
2675 STRLEN len = 0;
2676 STRLEN foldlen = 0;
a3621e74
YO
2677 U8 *uscan = (U8*)NULL;
2678 STRLEN bufflen=0;
95b24440 2679 SV *sv_accept_buff = NULL;
5b47454d
DM
2680 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2681 trie_type = do_utf8 ?
2682 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2683 : trie_plain;
2684
7087a21c
NC
2685 /* what trie are we using right now */
2686 reg_trie_data *trie
32fc9b6a 2687 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
d8319b27 2688 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2689 result = 0;
a3621e74
YO
2690
2691 while ( state && uc <= (U8*)PL_regeol ) {
2692
5b47454d 2693 if (trie->states[ state ].wordnum) {
d8319b27 2694 if (!st->u.trie.accepted ) {
5b47454d
DM
2695 ENTER;
2696 SAVETMPS;
2697 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2698 sv_accept_buff=newSV(bufflen *
2699 sizeof(reg_trie_accepted) - 1);
2700 SvCUR_set(sv_accept_buff,
2701 sizeof(reg_trie_accepted));
2702 SvPOK_on(sv_accept_buff);
2703 sv_2mortal(sv_accept_buff);
d8319b27 2704 st->u.trie.accept_buff =
5b47454d
DM
2705 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2706 }
2707 else {
d8319b27 2708 if (st->u.trie.accepted >= bufflen) {
5b47454d 2709 bufflen *= 2;
d8319b27 2710 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2711 SvGROW(sv_accept_buff,
2712 bufflen * sizeof(reg_trie_accepted));
2713 }
2714 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2715 + sizeof(reg_trie_accepted));
2716 }
d8319b27
DM
2717 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2718 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2719 ++st->u.trie.accepted;
5b47454d 2720 }
a3621e74
YO
2721
2722 base = trie->states[ state ].trans.base;
2723
2724 DEBUG_TRIE_EXECUTE_r(
2725 PerlIO_printf( Perl_debug_log,
e4584336 2726 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2727 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27 2728 (UV)state, (UV)base, (UV)st->u.trie.accepted );
a3621e74
YO
2729 );
2730
2731 if ( base ) {
5b47454d
DM
2732 switch (trie_type) {
2733 case trie_uft8_fold:
a3621e74
YO
2734 if ( foldlen>0 ) {
2735 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2736 foldlen -= len;
2737 uscan += len;
2738 len=0;
2739 } else {
1df70142 2740 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2741 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2742 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2743 foldlen -= UNISKIP( uvc );
2744 uscan = foldbuf + UNISKIP( uvc );
2745 }
5b47454d
DM
2746 break;
2747 case trie_utf8:
2748 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2749 &len, uniflags );
2750 break;
2751 case trie_plain:
e4584336 2752 uvc = (UV)*uc;
a3621e74
YO
2753 len = 1;
2754 }
2755
5b47454d
DM
2756 if (uvc < 256) {
2757 charid = trie->charmap[ uvc ];
2758 }
2759 else {
2760 charid = 0;
2761 if (trie->widecharmap) {
2762 SV** svpp = (SV**)NULL;
2763 svpp = hv_fetch(trie->widecharmap,
2764 (char*)&uvc, sizeof(UV), 0);
2765 if (svpp)
2766 charid = (U16)SvIV(*svpp);
2767 }
2768 }
a3621e74 2769
5b47454d
DM
2770 if (charid &&
2771 (base + charid > trie->uniquecharcount )
2772 && (base + charid - 1 - trie->uniquecharcount
2773 < trie->lasttrans)
2774 && trie->trans[base + charid - 1 -
2775 trie->uniquecharcount].check == state)
2776 {
2777 state = trie->trans[base + charid - 1 -
2778 trie->uniquecharcount ].next;
2779 }
2780 else {
2781 state = 0;
2782 }
2783 uc += len;
2784
2785 }
2786 else {
a3621e74
YO
2787 state = 0;
2788 }
2789 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2790 PerlIO_printf( Perl_debug_log,
2791 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2792 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2793 );
2794 }
d8319b27 2795 if (!st->u.trie.accepted )
a3621e74 2796 sayNO;
a3621e74
YO
2797
2798 /*
2799 There was at least one accepting state that we
2800 transitioned through. Presumably the number of accepting
2801 states is going to be low, typically one or two. So we
2802 simply scan through to find the one with lowest wordnum.
2803 Once we find it, we swap the last state into its place
2804 and decrement the size. We then try to match the rest of
2805 the pattern at the point where the word ends, if we
2806 succeed then we end the loop, otherwise the loop
2807 eventually terminates once all of the accepting states
2808 have been tried.
2809 */
a3621e74 2810
d8319b27 2811 if ( st->u.trie.accepted == 1 ) {
a3621e74 2812 DEBUG_EXECUTE_r({
097eb12c 2813 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
a3621e74
YO
2814 PerlIO_printf( Perl_debug_log,
2815 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2816 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2817 st->u.trie.accept_buff[ 0 ].wordnum,
cfd0369c 2818 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2819 PL_colors[5] );
2820 });
d8319b27 2821 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
2822 /* in this case we free tmps/leave before we call regmatch
2823 as we wont be using accept_buff again. */
2824 FREETMPS;
2825 LEAVE;
95b24440
DM
2826 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2827 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2828 } else {
2829 DEBUG_EXECUTE_r(
e4584336 2830 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 2831 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
2832 PL_colors[5] );
2833 );
d8319b27 2834 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
2835 U32 best = 0;
2836 U32 cur;
d8319b27 2837 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
2838 DEBUG_TRIE_EXECUTE_r(
2839 PerlIO_printf( Perl_debug_log,
2840 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2841 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
2842 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2843 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 2844 );
a3621e74 2845
d8319b27
DM
2846 if (st->u.trie.accept_buff[cur].wordnum <
2847 st->u.trie.accept_buff[best].wordnum)
e822a8b4 2848 best = cur;
a3621e74
YO
2849 }
2850 DEBUG_EXECUTE_r({
87830502 2851 reg_trie_data * const trie = (reg_trie_data*)
32fc9b6a 2852 rex->data->data[ARG(scan)];
d8319b27 2853 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2854 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2855 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2856 st->u.trie.accept_buff[best].wordnum,
cfd0369c 2857 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2858 PL_colors[5] );
2859 });
d8319b27
DM
2860 if ( best<st->u.trie.accepted ) {
2861 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2862 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2863 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2864 best = st->u.trie.accepted;
a3621e74 2865 }
d8319b27 2866 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
2867
2868 /*
2869 as far as I can tell we only need the SAVETMPS/FREETMPS
2870 for re's with EVAL in them but I'm leaving them in for
2871 all until I can be sure.
2872 */
2873 SAVETMPS;
95b24440
DM
2874 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2875 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2876 FREETMPS;
2877 }
2878 FREETMPS;
2879 LEAVE;
2880 }
2881
95b24440 2882 if (result) {
a3621e74
YO
2883 sayYES;
2884 } else {
2885 sayNO;
2886 }
2887 }
2888 /* unreached codepoint */
95b24440
DM
2889 case EXACT: {
2890 char *s = STRING(scan);
5d9a96ca 2891 st->ln = STR_LEN(scan);
eb160463 2892 if (do_utf8 != UTF) {
bc517b45 2893 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2894 char *l = locinput;
5d9a96ca 2895 const char *e = s + st->ln;
a72c7584 2896
5ff6fc6d
JH
2897 if (do_utf8) {
2898 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2899 while (s < e) {
a3b680e6 2900 STRLEN ulen;
1aa99e6b 2901 if (l >= PL_regeol)
5ff6fc6d
JH
2902 sayNO;
2903 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2904 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2905 uniflags))
5ff6fc6d 2906 sayNO;
bc517b45 2907 l += ulen;
5ff6fc6d 2908 s ++;
1aa99e6b 2909 }
5ff6fc6d
JH
2910 }
2911 else {
2912 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2913 while (s < e) {
a3b680e6 2914 STRLEN ulen;
1aa99e6b
IH
2915 if (l >= PL_regeol)
2916 sayNO;
5ff6fc6d 2917 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2918 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2919 uniflags))
1aa99e6b 2920 sayNO;
bc517b45 2921 s += ulen;
a72c7584 2922 l ++;
1aa99e6b 2923 }
5ff6fc6d 2924 }
1aa99e6b
IH
2925 locinput = l;
2926 nextchr = UCHARAT(locinput);
2927 break;
2928 }
bc517b45 2929 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2930 /* Inline the first character, for speed. */
2931 if (UCHARAT(s) != nextchr)
2932 sayNO;
5d9a96ca 2933 if (PL_regeol - locinput < st->ln)
d6a28714 2934 sayNO;
5d9a96ca 2935 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2936 sayNO;
5d9a96ca 2937 locinput += st->ln;
d6a28714
JH
2938 nextchr = UCHARAT(locinput);
2939 break;
95b24440 2940 }
d6a28714 2941 case EXACTFL:
b8c5462f
JH
2942 PL_reg_flags |= RF_tainted;
2943 /* FALL THROUGH */
95b24440
DM
2944 case EXACTF: {
2945 char *s = STRING(scan);
5d9a96ca 2946 st->ln = STR_LEN(scan);
d6a28714 2947
d07ddd77
JH
2948 if (do_utf8 || UTF) {
2949 /* Either target or the pattern are utf8. */
d6a28714 2950 char *l = locinput;
d07ddd77 2951 char *e = PL_regeol;
bc517b45 2952
5d9a96ca 2953 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2954 l, &e, 0, do_utf8)) {
5486206c
JH
2955 /* One more case for the sharp s:
2956 * pack("U0U*", 0xDF) =~ /ss/i,
2957 * the 0xC3 0x9F are the UTF-8
2958 * byte sequence for the U+00DF. */
2959 if (!(do_utf8 &&
2960 toLOWER(s[0]) == 's' &&
5d9a96ca 2961 st->ln >= 2 &&
5486206c
JH
2962 toLOWER(s[1]) == 's' &&
2963 (U8)l[0] == 0xC3 &&
2964 e - l >= 2 &&
2965 (U8)l[1] == 0x9F))
2966 sayNO;
2967 }
d07ddd77
JH
2968 locinput = e;
2969 nextchr = UCHARAT(locinput);
2970 break;
a0ed51b3 2971 }
d6a28714 2972
bc517b45
JH
2973 /* Neither the target and the pattern are utf8. */
2974
d6a28714
JH
2975 /* Inline the first character, for speed. */
2976 if (UCHARAT(s) != nextchr &&
2977 UCHARAT(s) != ((OP(scan) == EXACTF)
2978 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2979 sayNO;
5d9a96ca 2980 if (PL_regeol - locinput < st->ln)
b8c5462f 2981 sayNO;
5d9a96ca
DM
2982 if (st->ln > 1 && (OP(scan) == EXACTF
2983 ? ibcmp(s, locinput, st->ln)
2984 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2985 sayNO;
5d9a96ca 2986 locinput += st->ln;
d6a28714 2987 nextchr = UCHARAT(locinput);
a0d0e21e 2988 break;
95b24440 2989 }
d6a28714 2990 case ANYOF:
ffc61ed2 2991 if (do_utf8) {
9e55ce06
JH
2992 STRLEN inclasslen = PL_regeol - locinput;
2993
32fc9b6a 2994 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2995 sayNO_ANYOF;
ffc61ed2
JH
2996 if (locinput >= PL_regeol)
2997 sayNO;
0f0076b4 2998 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2999 nextchr = UCHARAT(locinput);
e0f9d4a8 3000 break;
ffc61ed2
JH
3001 }
3002 else {
3003 if (nextchr < 0)
3004 nextchr = UCHARAT(locinput);
32fc9b6a 3005 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3006 sayNO_ANYOF;
ffc61ed2
JH
3007 if (!nextchr && locinput >= PL_regeol)
3008 sayNO;
3009 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3010 break;
3011 }
3012 no_anyof:
3013 /* If we might have the case of the German sharp s
3014 * in a casefolding Unicode character class. */
3015
ebc501f0
JH
3016 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3017 locinput += SHARP_S_SKIP;
e0f9d4a8 3018 nextchr = UCHARAT(locinput);
ffc61ed2 3019 }
e0f9d4a8
JH
3020 else
3021 sayNO;
b8c5462f 3022 break;
d6a28714 3023 case ALNUML:
b8c5462f
JH
3024 PL_reg_flags |= RF_tainted;
3025 /* FALL THROUGH */
d6a28714 3026 case ALNUM:
b8c5462f 3027 if (!nextchr)
4633a7c4 3028 sayNO;
ffc61ed2 3029 if (do_utf8) {
1a4fad37 3030 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3031 if (!(OP(scan) == ALNUM
3568d838 3032 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3033 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3034 {
3035 sayNO;
a0ed51b3 3036 }
b8c5462f 3037 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3038 nextchr = UCHARAT(locinput);
3039 break;
3040 }
ffc61ed2 3041 if (!(OP(scan) == ALNUM
d6a28714 3042 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3043 sayNO;
b8c5462f 3044 nextchr = UCHARAT(++locinput);
a0d0e21e 3045 break;
d6a28714 3046 case NALNUML:
b8c5462f
JH
3047 PL_reg_flags |= RF_tainted;
3048 /* FALL THROUGH */
d6a28714
JH
3049 case NALNUM:
3050 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3051 sayNO;
ffc61ed2 3052 if (do_utf8) {
1a4fad37 3053 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3054 if (OP(scan) == NALNUM
3568d838 3055 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3056 : isALNUM_LC_utf8((U8*)locinput))
3057 {
b8c5462f 3058 sayNO;
d6a28714 3059 }
b8c5462f
JH
3060 locinput += PL_utf8skip[nextchr];
3061 nextchr = UCHARAT(locinput);
3062 break;
3063 }
ffc61ed2 3064 if (OP(scan) == NALNUM
d6a28714 3065 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3066 sayNO;
76e3520e 3067 nextchr = UCHARAT(++locinput);
a0d0e21e 3068 break;
d6a28714
JH
3069 case BOUNDL:
3070 case NBOUNDL:
3280af22 3071 PL_reg_flags |= RF_tainted;
bbce6d69 3072 /* FALL THROUGH */
d6a28714
JH
3073 case BOUND:
3074 case NBOUND:
3075 /* was last char in word? */
ffc61ed2 3076 if (do_utf8) {
12d33761 3077 if (locinput == PL_bostr)
5d9a96ca 3078 st->ln = '\n';
ffc61ed2 3079 else {
a3b680e6 3080 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3081
4ad0818d 3082 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3083 }
3084 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3085 st->ln = isALNUM_uni(st->ln);
1a4fad37 3086 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3087 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3088 }
3089 else {
5d9a96ca 3090 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3091 n = isALNUM_LC_utf8((U8*)locinput);
3092 }
a0ed51b3 3093 }
d6a28714 3094 else {
5d9a96ca 3095 st->ln = (locinput != PL_bostr) ?
12d33761 3096 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3097 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3098 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3099 n = isALNUM(nextchr);
3100 }
3101 else {
5d9a96ca 3102 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3103 n = isALNUM_LC(nextchr);
3104 }
d6a28714 3105 }
5d9a96ca 3106 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3107 OP(scan) == BOUNDL))
3108 sayNO;
a0ed51b3 3109 break;
d6a28714 3110 case SPACEL:
3280af22 3111 PL_reg_flags |= RF_tainted;
bbce6d69 3112 /* FALL THROUGH */
d6a28714 3113 case SPACE:
9442cb0e 3114 if (!nextchr)
4633a7c4 3115 sayNO;
1aa99e6b 3116 if (do_utf8) {
fd400ab9 3117 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3118 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3119 if (!(OP(scan) == SPACE
3568d838 3120 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3121 : isSPACE_LC_utf8((U8*)locinput)))
3122 {
3123 sayNO;
3124 }
3125 locinput += PL_utf8skip[nextchr];
3126 nextchr = UCHARAT(locinput);
3127 break;
d6a28714 3128 }
ffc61ed2
JH
3129 if (!(OP(scan) == SPACE
3130 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3131 sayNO;
3132 nextchr = UCHARAT(++locinput);
3133 }
3134 else {
3135 if (!(OP(scan) == SPACE
3136 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3137 sayNO;
3138 nextchr = UCHARAT(++locinput);
a0ed51b3 3139 }
a0ed51b3 3140 break;
d6a28714 3141 case NSPACEL:
3280af22 3142 PL_reg_flags |= RF_tainted;
bbce6d69 3143 /* FALL THROUGH */
d6a28714 3144 case NSPACE:
9442cb0e 3145 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3146 sayNO;
1aa99e6b 3147 if (do_utf8) {
1a4fad37 3148 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3149 if (OP(scan) == NSPACE
3568d838 3150 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3151 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3152 {
3153 sayNO;
3154 }
3155 locinput += PL_utf8skip[nextchr];
3156 nextchr = UCHARAT(locinput);
3157 break;
a0ed51b3 3158 }
ffc61ed2 3159 if (OP(scan) == NSPACE
d6a28714 3160 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3161 sayNO;
76e3520e 3162 nextchr = UCHARAT(++locinput);
a0d0e21e 3163 break;
d6a28714 3164 case DIGITL:
a0ed51b3
LW
3165 PL_reg_flags |= RF_tainted;
3166 /* FALL THROUGH */
d6a28714 3167 case DIGIT:
9442cb0e 3168 if (!nextchr)
a0ed51b3 3169 sayNO;
1aa99e6b 3170 if (do_utf8) {
1a4fad37 3171 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3172 if (!(OP(scan) == DIGIT
3568d838 3173 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3174 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3175 {
a0ed51b3 3176 sayNO;
dfe13c55 3177 }
6f06b55f 3178 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3179 nextchr = UCHARAT(locinput);
3180 break;
3181 }
ffc61ed2 3182 if (!(OP(scan) == DIGIT
9442cb0e 3183 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3184 sayNO;
3185 nextchr = UCHARAT(++locinput);
3186 break;
d6a28714 3187 case NDIGITL:
b8c5462f
JH
3188 PL_reg_flags |= RF_tainted;
3189 /* FALL THROUGH */
d6a28714 3190 case NDIGIT:
9442cb0e 3191 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3192 sayNO;
1aa99e6b 3193 if (do_utf8) {
1a4fad37 3194 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3195 if (OP(scan) == NDIGIT
3568d838 3196 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3197 : isDIGIT_LC_utf8((U8*)locinput))
3198 {
a0ed51b3 3199 sayNO;
9442cb0e 3200 }
6f06b55f 3201 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3202 nextchr = UCHARAT(locinput);
3203 break;
3204 }
ffc61ed2 3205 if (OP(scan) == NDIGIT
9442cb0e 3206 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3207 sayNO;
3208 nextchr = UCHARAT(++locinput);
3209 break;
3210 case CLUMP:
b7c83a7e 3211 if (locinput >= PL_regeol)
a0ed51b3 3212 sayNO;
b7c83a7e 3213 if (do_utf8) {
1a4fad37 3214 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3215 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3216 sayNO;
3217 locinput += PL_utf8skip[nextchr];
3218 while (locinput < PL_regeol &&
3219 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3220 locinput += UTF8SKIP(locinput);
3221 if (locinput > PL_regeol)
3222 sayNO;
eb08e2da
JH
3223 }
3224 else
3225 locinput++;
a0ed51b3
LW
3226 nextchr = UCHARAT(locinput);
3227 break;
c8756f30 3228 case REFFL:
3280af22 3229 PL_reg_flags |= RF_tainted;
c8756f30 3230 /* FALL THROUGH */
c277df42 3231 case REF:
95b24440
DM
3232 case REFF: {
3233 char *s;
c277df42 3234 n = ARG(scan); /* which paren pair */
5d9a96ca 3235 st->ln = PL_regstartp[n];
2c2d71f5 3236 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3237 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3238 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3239 if (st->ln == PL_regendp[n])
a0d0e21e 3240 break;
a0ed51b3 3241
5d9a96ca 3242 s = PL_bostr + st->ln;
1aa99e6b 3243 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3244 char *l = locinput;
a3b680e6 3245 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3246 /*
3247 * Note that we can't do the "other character" lookup trick as
3248 * in the 8-bit case (no pun intended) because in Unicode we
3249 * have to map both upper and title case to lower case.
3250 */
3251 if (OP(scan) == REFF) {
3252 while (s < e) {
a3b680e6
AL
3253 STRLEN ulen1, ulen2;
3254 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3255 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3256
a0ed51b3
LW
3257 if (l >= PL_regeol)
3258 sayNO;
a2a2844f
JH
3259 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3260 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3261 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3262 sayNO;
a2a2844f
JH
3263 s += ulen1;
3264 l += ulen2;
a0ed51b3
LW
3265 }
3266 }
3267 locinput = l;
3268 nextchr = UCHARAT(locinput);
3269 break;
3270 }
3271
a0d0e21e 3272 /* Inline the first character, for speed. */
76e3520e 3273 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3274 (OP(scan) == REF ||
3275 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3276 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3277 sayNO;
5d9a96ca
DM
3278 st->ln = PL_regendp[n] - st->ln;
3279 if (locinput + st->ln > PL_regeol)
4633a7c4 3280 sayNO;
5d9a96ca
DM
3281 if (st->ln > 1 && (OP(scan) == REF
3282 ? memNE(s, locinput, st->ln)
c8756f30 3283 : (OP(scan) == REFF
5d9a96ca
DM
3284 ? ibcmp(s, locinput, st->ln)
3285 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3286 sayNO;
5d9a96ca 3287 locinput += st->ln;
76e3520e 3288 nextchr = UCHARAT(locinput);
a0d0e21e 3289 break;
95b24440 3290 }
a0d0e21e
LW
3291
3292 case NOTHING:
c277df42 3293 case TAIL:
a0d0e21e
LW
3294 break;
3295 case BACK:
3296 break;
c277df42
IZ
3297 case EVAL:
3298 {
c277df42 3299 SV *ret;
8e5e9ebe 3300 {
4aabdb9b
DM
3301 /* execute the code in the {...} */
3302 dSP;
6136c704 3303 SV ** const before = SP;
4aabdb9b
DM
3304 OP_4tree * const oop = PL_op;
3305 COP * const ocurcop = PL_curcop;
3306 PAD *old_comppad;
4aabdb9b
DM
3307
3308 n = ARG(scan);
32fc9b6a 3309 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3310 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3311 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3312 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3313
8e5e9ebe
RGS
3314 CALLRUNOPS(aTHX); /* Scalar context. */
3315 SPAGAIN;
3316 if (SP == before)
075aa684 3317 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3318 else {
3319 ret = POPs;
3320 PUTBACK;
3321 }
4aabdb9b
DM
3322
3323 PL_op = oop;
3324 PAD_RESTORE_LOCAL(old_comppad);
3325 PL_curcop = ocurcop;
3326 if (!st->logical) {
3327 /* /(?{...})/ */
3328 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3329 break;
3330 }
8e5e9ebe 3331 }
4aabdb9b
DM
3332 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3333 regexp *re;
4aabdb9b 3334 {
4f639d21
DM
3335 /* extract RE object from returned value; compiling if
3336 * necessary */
3337
6136c704 3338 MAGIC *mg = NULL;
4aabdb9b 3339 SV *sv;
faf82a0b
AE
3340 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3341 mg = mg_find(sv, PERL_MAGIC_qr);
3342 else if (SvSMAGICAL(ret)) {
3343 if (SvGMAGICAL(ret))
3344 sv_unmagic(ret, PERL_MAGIC_qr);
3345 else
3346 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3347 }
faf82a0b 3348
0f5d15d6
IZ
3349 if (mg) {
3350 re = (regexp *)mg->mg_obj;
df0003d4 3351 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3352 }
3353 else {
3354 STRLEN len;
6136c704 3355 const char * const t = SvPV_const(ret, len);
0f5d15d6 3356 PMOP pm;
a3b680e6 3357 const I32 osize = PL_regsize;
0f5d15d6 3358
5fcd1c1b 3359 Zero(&pm, 1, PMOP);
4aabdb9b 3360 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3361 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3362 if (!(SvFLAGS(ret)
faf82a0b
AE
3363 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3364 | SVs_GMG)))
14befaf4
DM
3365 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3366 PERL_MAGIC_qr,0,0);
0f5d15d6 3367 PL_regsize = osize;
0f5d15d6 3368 }
4aabdb9b 3369 }
aa283a38
DM
3370
3371 /* run the pattern returned from (??{...}) */
3372
4aabdb9b
DM
3373 DEBUG_EXECUTE_r(
3374 PerlIO_printf(Perl_debug_log,
3375 "Entering embedded \"%s%.60s%s%s\"\n",
3376 PL_colors[0],
3377 re->precomp,
3378 PL_colors[1],
3379 (strlen(re->precomp) > 60 ? "..." : ""))
3380 );
2c2d71f5 3381
4aabdb9b
DM
3382 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3383 REGCP_SET(st->u.eval.lastcp);
4aabdb9b
DM
3384 *PL_reglastparen = 0;
3385 *PL_reglastcloseparen = 0;
4aabdb9b 3386 PL_reginput = locinput;
4aabdb9b
DM
3387
3388 /* XXXX This is too dramatic a measure... */
3389 PL_reg_maxiter = 0;
3390
5d9a96ca 3391 st->logical = 0;
aa283a38
DM
3392 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3393 ((re->reganch & ROPT_UTF8) != 0);
3394 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3395 st->u.eval.prev_rex = rex;
aa283a38 3396 rex = re;
aa283a38
DM
3397
3398 st->u.eval.prev_eval = cur_eval;
3399 st->u.eval.prev_slab = PL_regmatch_slab;
3400 st->u.eval.depth = depth;
3401 cur_eval = st;
3402 PUSH_STATE(newst, resume_EVAL);
3403 st = newst;
3404
3405 /* now continue from first node in postoned RE */
3406 next = re->program + 1;
3407 break;
4aabdb9b 3408 /* NOTREACHED */
a0ed51b3 3409 }
4aabdb9b
DM
3410 /* /(?(?{...})X|Y)/ */
3411 st->sw = SvTRUE(ret);
3412 st->logical = 0;
c277df42
IZ
3413 break;
3414 }
a0d0e21e 3415 case OPEN:
c277df42 3416 n = ARG(scan); /* which paren pair */
3280af22
NIS
3417 PL_reg_start_tmp[n] = locinput;
3418 if (n > PL_regsize)
3419 PL_regsize = n;
a0d0e21e
LW
3420 break;
3421 case CLOSE:
c277df42 3422 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3423 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3424 PL_regendp[n] = locinput - PL_bostr;
eb160463 3425 if (n > (I32)*PL_reglastparen)
3280af22 3426 *PL_reglastparen = n;
a01268b5 3427 *PL_reglastcloseparen = n;
a0d0e21e 3428 break;
c277df42
IZ
3429 case GROUPP:
3430 n = ARG(scan); /* which paren pair */
5d9a96ca 3431 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3432 break;
3433 case IFTHEN:
2c2d71f5 3434 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3435 if (st->sw)
c277df42
IZ
3436 next = NEXTOPER(NEXTOPER(scan));
3437 else {
3438 next = scan + ARG(scan);
3439 if (OP(next) == IFTHEN) /* Fake one. */
3440 next = NEXTOPER(NEXTOPER(next));
3441 }
3442 break;
3443 case LOGICAL:
5d9a96ca 3444 st->logical = scan->flags;
c277df42 3445 break;
2ab05381 3446/*******************************************************************
a0374537
DM
3447 cc points to the regmatch_state associated with the most recent CURLYX.
3448 This struct contains info about the innermost (...)* loop (an
3449 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3450
3451 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3452
95b24440 3453 1) After matching Y, regnode for CURLYX is processed;