This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
eliminate PL_reg_re
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
32fc9b6a 110#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
52657f30
AL
119#define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
120 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
121 : (U8*)(pos + off)))
122#define HOPBACKc(pos, off) ((char*) \
123 ((PL_reg_match_utf8) \
124 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
efb30f32
HS
125 : (pos - off >= PL_bostr) \
126 ? (U8*)(pos - off) \
52657f30 127 : (U8*)NULL) \
efb30f32 128)
efb30f32 129
1aa99e6b 130#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c 131#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 132#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 133
1a4fad37
AL
134#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
135 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
136#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
137#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
138#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
139#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 140
5f80c4cf 141/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
142#define JUMPABLE(rn) ( \
143 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
144 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
145 OP(rn) == PLUS || OP(rn) == MINMOD || \
146 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
147)
148
cca55fe3
JP
149#define HAS_TEXT(rn) ( \
150 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
151)
e2d8ce26 152
a84d97b6
HS
153/*
154 Search for mandatory following text node; for lookahead, the text must
155 follow but for lookbehind (rn->flags != 0) we skip to the next step.
156*/
cca55fe3 157#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 158 while (JUMPABLE(rn)) \
a84d97b6 159 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 160 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
161 else if (OP(rn) == PLUS) \
162 rn = NEXTOPER(rn); \
a84d97b6
HS
163 else if (OP(rn) == IFMATCH) \
164 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 165 else rn += NEXT_OFF(rn); \
5f80c4cf 166} STMT_END
74750237 167
acfe0abc 168static void restore_pos(pTHX_ void *arg);
51371543 169
76e3520e 170STATIC CHECKPOINT
cea2e8a9 171S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 172{
97aff369 173 dVAR;
a3b680e6 174 const int retval = PL_savestack_ix;
b1ce53c5 175#define REGCP_PAREN_ELEMS 4
a3b680e6 176 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
177 int p;
178
e49a9654
IH
179 if (paren_elems_to_push < 0)
180 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
181
a01268b5 182#define REGCP_OTHER_ELEMS 6
4b3c1a47 183 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 184 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 185/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
186 SSPUSHINT(PL_regendp[p]);
187 SSPUSHINT(PL_regstartp[p]);
3280af22 188 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
189 SSPUSHINT(p);
190 }
b1ce53c5 191/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
192 SSPUSHINT(PL_regsize);
193 SSPUSHINT(*PL_reglastparen);
a01268b5 194 SSPUSHINT(*PL_reglastcloseparen);
3280af22 195 SSPUSHPTR(PL_reginput);
41123dfd
JH
196#define REGCP_FRAME_ELEMS 2
197/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
198 * are needed for the regexp context stack bookkeeping. */
199 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 200 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 201
a0d0e21e
LW
202 return retval;
203}
204
c277df42 205/* These are needed since we do not localize EVAL nodes: */
a3621e74 206# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 207 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 208 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 209
a3621e74 210# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 211 PerlIO_printf(Perl_debug_log, \
faccc32b 212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 213 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 214
76e3520e 215STATIC char *
097eb12c 216S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 217{
97aff369 218 dVAR;
b1ce53c5 219 I32 i;
a0d0e21e 220 char *input;
b1ce53c5 221
a3621e74
YO
222 GET_RE_DEBUG_FLAGS_DECL;
223
b1ce53c5 224 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 225 i = SSPOPINT;
b1ce53c5
JH
226 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
227 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 228 input = (char *) SSPOPPTR;
a01268b5 229 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
230 *PL_reglastparen = SSPOPINT;
231 PL_regsize = SSPOPINT;
b1ce53c5
JH
232
233 /* Now restore the parentheses context. */
41123dfd
JH
234 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
235 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 236 I32 tmps;
097eb12c 237 U32 paren = (U32)SSPOPINT;
3280af22 238 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
239 PL_regstartp[paren] = SSPOPINT;
240 tmps = SSPOPINT;
3280af22
NIS
241 if (paren <= *PL_reglastparen)
242 PL_regendp[paren] = tmps;
a3621e74 243 DEBUG_EXECUTE_r(
c3464db5 244 PerlIO_printf(Perl_debug_log,
b900a521 245 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 246 (UV)paren, (IV)PL_regstartp[paren],
b900a521 247 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 248 (IV)PL_regendp[paren],
3280af22 249 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 250 );
a0d0e21e 251 }
a3621e74 252 DEBUG_EXECUTE_r(
4f639d21 253 if ((I32)(*PL_reglastparen + 1) <= rex->nparens) {
c3464db5 254 PerlIO_printf(Perl_debug_log,
faccc32b 255 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 256 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
257 }
258 );
daf18116 259#if 1
dafc8851
JH
260 /* It would seem that the similar code in regtry()
261 * already takes care of this, and in fact it is in
262 * a better location to since this code can #if 0-ed out
263 * but the code in regtry() is needed or otherwise tests
264 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
265 * (as of patchlevel 7877) will fail. Then again,
266 * this code seems to be necessary or otherwise
267 * building DynaLoader will fail:
268 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
269 * --jhi */
097eb12c
AL
270 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
271 if (i > PL_regsize)
272 PL_regstartp[i] = -1;
273 PL_regendp[i] = -1;
a0d0e21e 274 }
dafc8851 275#endif
a0d0e21e
LW
276 return input;
277}
278
02db2b7b 279#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 280
95b24440 281#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
282 if (paren) { \
283 if (n) { \
284 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
285 PL_regendp[paren] = input - PL_bostr; \
286 } \
287 else \
288 PL_regendp[paren] = -1; \
289 } \
95b24440
DM
290 REGMATCH(next, where); \
291 if (result) \
29d1e993
HS
292 sayYES; \
293 if (paren && n) \
294 PL_regendp[paren] = -1; \
295}
296
297
a687059c 298/*
e50aee73 299 * pregexec and friends
a687059c
LW
300 */
301
302/*
c277df42 303 - pregexec - match a regexp against a string
a687059c 304 */
c277df42 305I32
864dbfa3 306Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 307 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
308/* strend: pointer to null at end of string */
309/* strbeg: real beginning of string */
310/* minend: end of match must be >=minend after stringarg. */
311/* nosave: For optimizations. */
312{
313 return
9041c2e3 314 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
315 nosave ? 0 : REXEC_COPY_STR);
316}
0f5d15d6 317
22e551b9 318
9041c2e3 319/*
cad2e5aa
JH
320 * Need to implement the following flags for reg_anch:
321 *
322 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
323 * USE_INTUIT_ML
324 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
325 * INTUIT_AUTORITATIVE_ML
326 * INTUIT_ONCE_NOML - Intuit can match in one location only.
327 * INTUIT_ONCE_ML
328 *
329 * Another flag for this function: SECOND_TIME (so that float substrs
330 * with giant delta may be not rechecked).
331 */
332
333/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
334
3f7c398e 335/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
336 Otherwise, only SvCUR(sv) is used to get strbeg. */
337
338/* XXXX We assume that strpos is strbeg unless sv. */
339
6eb5f6b9
JH
340/* XXXX Some places assume that there is a fixed substring.
341 An update may be needed if optimizer marks as "INTUITable"
342 RExen without fixed substrings. Similarly, it is assumed that
343 lengths of all the strings are no more than minlen, thus they
344 cannot come from lookahead.
345 (Or minlen should take into account lookahead.) */
346
2c2d71f5
JH
347/* A failure to find a constant substring means that there is no need to make
348 an expensive call to REx engine, thus we celebrate a failure. Similarly,
349 finding a substring too deep into the string means that less calls to
30944b6d
IZ
350 regtry() should be needed.
351
352 REx compiler's optimizer found 4 possible hints:
353 a) Anchored substring;
354 b) Fixed substring;
355 c) Whether we are anchored (beginning-of-line or \G);
356 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 357 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
358 string which does not contradict any of them.
359 */
2c2d71f5 360
6eb5f6b9
JH
361/* Most of decisions we do here should have been done at compile time.
362 The nodes of the REx which we used for the search should have been
363 deleted from the finite automaton. */
364
cad2e5aa
JH
365char *
366Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
367 char *strend, U32 flags, re_scream_pos_data *data)
368{
97aff369 369 dVAR;
b7953727 370 register I32 start_shift = 0;
cad2e5aa 371 /* Should be nonnegative! */
b7953727 372 register I32 end_shift = 0;
2c2d71f5
JH
373 register char *s;
374 register SV *check;
a1933d95 375 char *strbeg;
cad2e5aa 376 char *t;
a3b680e6 377 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 378 I32 ml_anch;
bd61b366
SS
379 register char *other_last = NULL; /* other substr checked before this */
380 char *check_at = NULL; /* check substr found at this pos */
1df70142 381 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 382#ifdef DEBUGGING
890ce7af
AL
383 const char * const i_strpos = strpos;
384 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 385#endif
a3621e74
YO
386
387 GET_RE_DEBUG_FLAGS_DECL;
388
a30b2f1f 389 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 390
b8d68ded 391 if (prog->reganch & ROPT_UTF8) {
a3621e74 392 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
393 "UTF-8 regex...\n"));
394 PL_reg_flags |= RF_utf8;
395 }
396
a3621e74 397 DEBUG_EXECUTE_r({
1df70142 398 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
399 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
400 strpos;
1df70142 401 const int len = PL_reg_match_utf8 ?
b8d68ded 402 strlen(s) : strend - strpos;
2a782b5b
JH
403 if (!PL_colorset)
404 reginitcolors();
b8d68ded 405 if (PL_reg_match_utf8)
a3621e74 406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 407 "UTF-8 target...\n"));
2a782b5b 408 PerlIO_printf(Perl_debug_log,
a0288114 409 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 410 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
411 prog->precomp,
412 PL_colors[1],
413 (strlen(prog->precomp) > 60 ? "..." : ""),
414 PL_colors[0],
415 (int)(len > 60 ? 60 : len),
416 s, PL_colors[1],
417 (len > 60 ? "..." : "")
418 );
419 });
cad2e5aa 420
c344f387
JH
421 /* CHR_DIST() would be more correct here but it makes things slow. */
422 if (prog->minlen > strend - strpos) {
a3621e74 423 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 424 "String too short... [re_intuit_start]\n"));
cad2e5aa 425 goto fail;
2c2d71f5 426 }
a1933d95 427 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 428 PL_regeol = strend;
33b8afdf
JH
429 if (do_utf8) {
430 if (!prog->check_utf8 && prog->check_substr)
431 to_utf8_substr(prog);
432 check = prog->check_utf8;
433 } else {
434 if (!prog->check_substr && prog->check_utf8)
435 to_byte_substr(prog);
436 check = prog->check_substr;
437 }
438 if (check == &PL_sv_undef) {
a3621e74 439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
440 "Non-utf string cannot match utf check string\n"));
441 goto fail;
442 }
2c2d71f5 443 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
444 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
445 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 446 && !multiline ) ); /* Check after \n? */
cad2e5aa 447
7e25d62c
JH
448 if (!ml_anch) {
449 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
450 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 451 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
452 && sv && !SvROK(sv)
453 && (strpos != strbeg)) {
a3621e74 454 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
455 goto fail;
456 }
457 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 458 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 459 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
460 I32 slen;
461
1aa99e6b 462 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
463 if (SvTAIL(check)) {
464 slen = SvCUR(check); /* >= 1 */
cad2e5aa 465
9041c2e3 466 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 467 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 468 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 469 goto fail_finish;
cad2e5aa
JH
470 }
471 /* Now should match s[0..slen-2] */
472 slen--;
3f7c398e 473 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 474 || (slen > 1
3f7c398e 475 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 476 report_neq:
a3621e74 477 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
478 goto fail_finish;
479 }
cad2e5aa 480 }
3f7c398e 481 else if (*SvPVX_const(check) != *s
653099ff 482 || ((slen = SvCUR(check)) > 1
3f7c398e 483 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 484 goto report_neq;
c315bfe8 485 check_at = s;
2c2d71f5 486 goto success_at_start;
7e25d62c 487 }
cad2e5aa 488 }
2c2d71f5 489 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 490 s = strpos;
2c2d71f5 491 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 492 end_shift = prog->minlen - start_shift -
653099ff 493 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 494 if (!ml_anch) {
a3b680e6 495 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 496 - (SvTAIL(check) != 0);
a3b680e6 497 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
498
499 if (end_shift < eshift)
500 end_shift = eshift;
501 }
cad2e5aa 502 }
2c2d71f5 503 else { /* Can match at random position */
cad2e5aa
JH
504 ml_anch = 0;
505 s = strpos;
2c2d71f5
JH
506 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
507 /* Should be nonnegative! */
508 end_shift = prog->minlen - start_shift -
653099ff 509 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
510 }
511
2c2d71f5 512#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 513 if (end_shift < 0)
6bbae5e6 514 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
515#endif
516
2c2d71f5
JH
517 restart:
518 /* Find a possible match in the region s..strend by looking for
519 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 520 if (flags & REXEC_SCREAM) {
cad2e5aa 521 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 522 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 523
2c2d71f5
JH
524 if (PL_screamfirst[BmRARE(check)] >= 0
525 || ( BmRARE(check) == '\n'
526 && (BmPREVIOUS(check) == SvCUR(check) - 1)
527 && SvTAIL(check) ))
9041c2e3 528 s = screaminstr(sv, check,
2c2d71f5 529 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 530 else
2c2d71f5 531 goto fail_finish;
4addbd3b
HS
532 /* we may be pointing at the wrong string */
533 if (s && RX_MATCH_COPIED(prog))
3f7c398e 534 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
535 if (data)
536 *data->scream_olds = s;
537 }
f33976b4 538 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
539 s = fbm_instr((U8*)(s + start_shift),
540 (U8*)(strend - end_shift),
7fba1cd6 541 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 542 else
1aa99e6b
IH
543 s = fbm_instr(HOP3(s, start_shift, strend),
544 HOP3(strend, -end_shift, strbeg),
7fba1cd6 545 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
546
547 /* Update the count-of-usability, remove useless subpatterns,
548 unshift s. */
2c2d71f5 549
a0288114 550 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 551 (s ? "Found" : "Did not find"),
33b8afdf 552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 553 PL_colors[0],
7b0972df 554 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 555 SvPVX_const(check),
2c2d71f5
JH
556 PL_colors[1], (SvTAIL(check) ? "$" : ""),
557 (s ? " at offset " : "...\n") ) );
558
559 if (!s)
560 goto fail_finish;
561
6eb5f6b9
JH
562 check_at = s;
563
2c2d71f5 564 /* Finish the diagnostic message */
a3621e74 565 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
566
567 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
568 Start with the other substr.
569 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 570 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
571 *always* match. Probably should be marked during compile...
572 Probably it is right to do no SCREAM here...
573 */
574
33b8afdf 575 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 576 /* Take into account the "other" substring. */
2c2d71f5
JH
577 /* XXXX May be hopelessly wrong for UTF... */
578 if (!other_last)
6eb5f6b9 579 other_last = strpos;
33b8afdf 580 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
581 do_other_anchored:
582 {
890ce7af
AL
583 char * const last = HOP3c(s, -start_shift, strbeg);
584 char *last1, *last2;
2c2d71f5 585 char *s1 = s;
33b8afdf 586 SV* must;
2c2d71f5 587
2c2d71f5
JH
588 t = s - prog->check_offset_max;
589 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 590 && (!do_utf8
1aa99e6b 591 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 592 && t > strpos)))
30944b6d 593 /* EMPTY */;
2c2d71f5
JH
594 else
595 t = strpos;
1aa99e6b 596 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
597 if (t < other_last) /* These positions already checked */
598 t = other_last;
1aa99e6b 599 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
600 if (last < last1)
601 last1 = last;
602 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
603 /* On end-of-str: see comment below. */
33b8afdf
JH
604 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
605 if (must == &PL_sv_undef) {
606 s = (char*)NULL;
a3621e74 607 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
608 }
609 else
610 s = fbm_instr(
611 (unsigned char*)t,
612 HOP3(HOP3(last1, prog->anchored_offset, strend)
613 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
614 must,
7fba1cd6 615 multiline ? FBMrf_MULTILINE : 0
33b8afdf 616 );
a3621e74 617 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 618 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
619 (s ? "Found" : "Contradicts"),
620 PL_colors[0],
33b8afdf
JH
621 (int)(SvCUR(must)
622 - (SvTAIL(must)!=0)),
3f7c398e 623 SvPVX_const(must),
33b8afdf 624 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
625 if (!s) {
626 if (last1 >= last2) {
a3621e74 627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
628 ", giving up...\n"));
629 goto fail_finish;
630 }
a3621e74 631 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 632 ", trying floating at offset %ld...\n",
1aa99e6b
IH
633 (long)(HOP3c(s1, 1, strend) - i_strpos)));
634 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
635 s = HOP3c(last, 1, strend);
2c2d71f5
JH
636 goto restart;
637 }
638 else {
a3621e74 639 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 640 (long)(s - i_strpos)));
1aa99e6b
IH
641 t = HOP3c(s, -prog->anchored_offset, strbeg);
642 other_last = HOP3c(s, 1, strend);
30944b6d 643 s = s1;
2c2d71f5
JH
644 if (t == strpos)
645 goto try_at_start;
2c2d71f5
JH
646 goto try_at_offset;
647 }
30944b6d 648 }
2c2d71f5
JH
649 }
650 else { /* Take into account the floating substring. */
33b8afdf
JH
651 char *last, *last1;
652 char *s1 = s;
653 SV* must;
654
655 t = HOP3c(s, -start_shift, strbeg);
656 last1 = last =
657 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
658 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
659 last = HOP3c(t, prog->float_max_offset, strend);
660 s = HOP3c(t, prog->float_min_offset, strend);
661 if (s < other_last)
662 s = other_last;
2c2d71f5 663 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
664 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
665 /* fbm_instr() takes into account exact value of end-of-str
666 if the check is SvTAIL(ed). Since false positives are OK,
667 and end-of-str is not later than strend we are OK. */
668 if (must == &PL_sv_undef) {
669 s = (char*)NULL;
a3621e74 670 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
671 }
672 else
2c2d71f5 673 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
674 (unsigned char*)last + SvCUR(must)
675 - (SvTAIL(must)!=0),
7fba1cd6 676 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 677 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
678 (s ? "Found" : "Contradicts"),
679 PL_colors[0],
680 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 681 SvPVX_const(must),
33b8afdf
JH
682 PL_colors[1], (SvTAIL(must) ? "$" : "")));
683 if (!s) {
684 if (last1 == last) {
a3621e74 685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
686 ", giving up...\n"));
687 goto fail_finish;
2c2d71f5 688 }
a3621e74 689 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
690 ", trying anchored starting at offset %ld...\n",
691 (long)(s1 + 1 - i_strpos)));
692 other_last = last;
693 s = HOP3c(t, 1, strend);
694 goto restart;
695 }
696 else {
a3621e74 697 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
698 (long)(s - i_strpos)));
699 other_last = s; /* Fix this later. --Hugo */
700 s = s1;
701 if (t == strpos)
702 goto try_at_start;
703 goto try_at_offset;
704 }
2c2d71f5 705 }
cad2e5aa 706 }
2c2d71f5
JH
707
708 t = s - prog->check_offset_max;
2c2d71f5 709 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 710 && (!do_utf8
1aa99e6b
IH
711 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
712 && t > strpos))) {
2c2d71f5
JH
713 /* Fixed substring is found far enough so that the match
714 cannot start at strpos. */
715 try_at_offset:
cad2e5aa 716 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
717 /* Eventually fbm_*() should handle this, but often
718 anchored_offset is not 0, so this check will not be wasted. */
719 /* XXXX In the code below we prefer to look for "^" even in
720 presence of anchored substrings. And we search even
721 beyond the found float position. These pessimizations
722 are historical artefacts only. */
723 find_anchor:
2c2d71f5 724 while (t < strend - prog->minlen) {
cad2e5aa 725 if (*t == '\n') {
4ee3650e 726 if (t < check_at - prog->check_offset_min) {
33b8afdf 727 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
728 /* Since we moved from the found position,
729 we definitely contradict the found anchored
30944b6d
IZ
730 substr. Due to the above check we do not
731 contradict "check" substr.
732 Thus we can arrive here only if check substr
733 is float. Redo checking for "other"=="fixed".
734 */
9041c2e3 735 strpos = t + 1;
a3621e74 736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 737 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
738 goto do_other_anchored;
739 }
4ee3650e
GS
740 /* We don't contradict the found floating substring. */
741 /* XXXX Why not check for STCLASS? */
cad2e5aa 742 s = t + 1;
a3621e74 743 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 744 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
745 goto set_useful;
746 }
4ee3650e
GS
747 /* Position contradicts check-string */
748 /* XXXX probably better to look for check-string
749 than for "\n", so one should lower the limit for t? */
a3621e74 750 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 751 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 752 other_last = strpos = s = t + 1;
cad2e5aa
JH
753 goto restart;
754 }
755 t++;
756 }
a3621e74 757 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 758 PL_colors[0], PL_colors[1]));
2c2d71f5 759 goto fail_finish;
cad2e5aa 760 }
f5952150 761 else {
a3621e74 762 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 763 PL_colors[0], PL_colors[1]));
f5952150 764 }
cad2e5aa
JH
765 s = t;
766 set_useful:
33b8afdf 767 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
768 }
769 else {
f5952150 770 /* The found string does not prohibit matching at strpos,
2c2d71f5 771 - no optimization of calling REx engine can be performed,
f5952150
GS
772 unless it was an MBOL and we are not after MBOL,
773 or a future STCLASS check will fail this. */
2c2d71f5
JH
774 try_at_start:
775 /* Even in this situation we may use MBOL flag if strpos is offset
776 wrt the start of the string. */
05b4157f 777 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 778 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
779 /* May be due to an implicit anchor of m{.*foo} */
780 && !(prog->reganch & ROPT_IMPLICIT))
781 {
cad2e5aa
JH
782 t = strpos;
783 goto find_anchor;
784 }
a3621e74 785 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 786 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 787 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 788 );
2c2d71f5 789 success_at_start:
30944b6d 790 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
791 && (do_utf8 ? (
792 prog->check_utf8 /* Could be deleted already */
793 && --BmUSEFUL(prog->check_utf8) < 0
794 && (prog->check_utf8 == prog->float_utf8)
795 ) : (
796 prog->check_substr /* Could be deleted already */
797 && --BmUSEFUL(prog->check_substr) < 0
798 && (prog->check_substr == prog->float_substr)
799 )))
66e933ab 800 {
cad2e5aa 801 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 802 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
803 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
804 if (do_utf8 ? prog->check_substr : prog->check_utf8)
805 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
806 prog->check_substr = prog->check_utf8 = NULL; /* disable */
807 prog->float_substr = prog->float_utf8 = NULL; /* clear */
808 check = NULL; /* abort */
cad2e5aa 809 s = strpos;
3cf5c195
IZ
810 /* XXXX This is a remnant of the old implementation. It
811 looks wasteful, since now INTUIT can use many
6eb5f6b9 812 other heuristics. */
cad2e5aa
JH
813 prog->reganch &= ~RE_USE_INTUIT;
814 }
815 else
816 s = strpos;
817 }
818
6eb5f6b9
JH
819 /* Last resort... */
820 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
821 if (prog->regstclass) {
822 /* minlen == 0 is possible if regstclass is \b or \B,
823 and the fixed substr is ''$.
824 Since minlen is already taken into account, s+1 is before strend;
825 accidentally, minlen >= 1 guaranties no false positives at s + 1
826 even for \b or \B. But (minlen? 1 : 0) below assumes that
827 regstclass does not come from lookahead... */
828 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
829 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 830 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 831 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 832 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 833 : 1);
a3b680e6 834 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 835 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 836 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
837 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
838 cl_l, strend)
839 : strend);
6eb5f6b9
JH
840
841 t = s;
06b5626a 842 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
843 if (!s) {
844#ifdef DEBUGGING
cbbf8932 845 const char *what = NULL;
6eb5f6b9
JH
846#endif
847 if (endpos == strend) {
a3621e74 848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
849 "Could not match STCLASS...\n") );
850 goto fail;
851 }
a3621e74 852 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 853 "This position contradicts STCLASS...\n") );
653099ff
GS
854 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
855 goto fail;
6eb5f6b9 856 /* Contradict one of substrings */
33b8afdf
JH
857 if (prog->anchored_substr || prog->anchored_utf8) {
858 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 859 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 860 hop_and_restart:
1aa99e6b 861 s = HOP3c(t, 1, strend);
66e933ab
GS
862 if (s + start_shift + end_shift > strend) {
863 /* XXXX Should be taken into account earlier? */
a3621e74 864 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
865 "Could not match STCLASS...\n") );
866 goto fail;
867 }
5e39e1e5
HS
868 if (!check)
869 goto giveup;
a3621e74 870 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 871 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
872 what, (long)(s + start_shift - i_strpos)) );
873 goto restart;
874 }
66e933ab 875 /* Have both, check_string is floating */
6eb5f6b9
JH
876 if (t + start_shift >= check_at) /* Contradicts floating=check */
877 goto retry_floating_check;
878 /* Recheck anchored substring, but not floating... */
9041c2e3 879 s = check_at;
5e39e1e5
HS
880 if (!check)
881 goto giveup;
a3621e74 882 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 883 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
884 (long)(other_last - i_strpos)) );
885 goto do_other_anchored;
886 }
60e71179
GS
887 /* Another way we could have checked stclass at the
888 current position only: */
889 if (ml_anch) {
890 s = t = t + 1;
5e39e1e5
HS
891 if (!check)
892 goto giveup;
a3621e74 893 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 894 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 895 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 896 goto try_at_offset;
66e933ab 897 }
33b8afdf 898 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 899 goto fail;
6eb5f6b9
JH
900 /* Check is floating subtring. */
901 retry_floating_check:
902 t = check_at - start_shift;
a3621e74 903 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
904 goto hop_and_restart;
905 }
b7953727 906 if (t != s) {
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 908 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
909 (long)(t - i_strpos), (long)(s - i_strpos))
910 );
911 }
912 else {
a3621e74 913 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
914 "Does not contradict STCLASS...\n");
915 );
916 }
6eb5f6b9 917 }
5e39e1e5 918 giveup:
a3621e74 919 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
920 PL_colors[4], (check ? "Guessed" : "Giving up"),
921 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 922 return s;
2c2d71f5
JH
923
924 fail_finish: /* Substring not found */
33b8afdf
JH
925 if (prog->check_substr || prog->check_utf8) /* could be removed already */
926 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 927 fail:
a3621e74 928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 929 PL_colors[4], PL_colors[5]));
bd61b366 930 return NULL;
cad2e5aa 931}
9661b544 932
6eb5f6b9 933/* We know what class REx starts with. Try to find this position... */
3c3eec57 934STATIC char *
097eb12c 935S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, I32 norun)
a687059c 936{
27da23d5 937 dVAR;
1df70142 938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 939 char *m;
d8093b23 940 STRLEN ln;
5dab1207 941 STRLEN lnc;
078c425b 942 register STRLEN uskip;
d8093b23 943 unsigned int c1;
944 unsigned int c2;
6eb5f6b9
JH
945 char *e;
946 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 947 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 948
6eb5f6b9
JH
949 /* We know what class it must start with. */
950 switch (OP(c)) {
6eb5f6b9 951 case ANYOF:
388cc4de 952 if (do_utf8) {
078c425b 953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
388cc4de
HS
958 if (tmp && (norun || regtry(prog, s)))
959 goto got_it;
960 else
961 tmp = doevery;
962 }
963 else
964 tmp = 1;
078c425b 965 s += uskip;
388cc4de
HS
966 }
967 }
968 else {
969 while (s < strend) {
970 STRLEN skip = 1;
971
32fc9b6a 972 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
977 if (tmp && (norun || regtry(prog, s)))
978 goto got_it;
979 else
980 tmp = doevery;
981 }
982 else
983 tmp = 1;
984 s += skip;
985 }
a0d0e21e 986 }
6eb5f6b9 987 break;
f33976b4
DB
988 case CANY:
989 while (s < strend) {
990 if (tmp && (norun || regtry(prog, s)))
991 goto got_it;
992 else
993 tmp = doevery;
994 s++;
995 }
996 break;
6eb5f6b9 997 case EXACTF:
5dab1207
NIS
998 m = STRING(c);
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1001 if (UTF) {
a2a2844f 1002 STRLEN ulen1, ulen2;
5dab1207 1003 U8 *sm = (U8 *) m;
89ebb4a3
JH
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1007
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1010
89ebb4a3 1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1012 0, uniflags);
89ebb4a3 1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1014 0, uniflags);
5dab1207
NIS
1015 lnc = 0;
1016 while (sm < ((U8 *) m + ln)) {
1017 lnc++;
1018 sm += UTF8SKIP(sm);
1019 }
1aa99e6b
IH
1020 }
1021 else {
1022 c1 = *(U8*)m;
1023 c2 = PL_fold[c1];
1024 }
6eb5f6b9
JH
1025 goto do_exactf;
1026 case EXACTFL:
5dab1207
NIS
1027 m = STRING(c);
1028 ln = STR_LEN(c);
1029 lnc = (I32) ln;
d8093b23 1030 c1 = *(U8*)m;
6eb5f6b9
JH
1031 c2 = PL_fold_locale[c1];
1032 do_exactf:
db12adc6 1033 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1034
6eb5f6b9
JH
1035 if (norun && e < s)
1036 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1037
60a8b682
JH
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
7f16dd3d
JH
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
60a8b682 1048
1aa99e6b 1049 if (do_utf8) {
575cac57 1050 UV c, f;
89ebb4a3 1051 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1052 STRLEN len, foldlen;
4ad0818d 1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1054 if (c1 == c2) {
5dab1207
NIS
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1aa99e6b 1057 while (s <= e) {
89ebb4a3 1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1059 uniflags);
80aecb99
JH
1060 if ( c == c1
1061 && (ln == len ||
66423254 1062 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1063 m, (char **)0, ln, (bool)UTF))
55da9344 1064 && (norun || regtry(prog, s)) )
1aa99e6b 1065 goto got_it;
80aecb99 1066 else {
1df70142 1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1068 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1070 if ( f != c
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
66423254
JH
1073 !ibcmp_utf8((char *) foldbuf,
1074 (char **)0, foldlen, do_utf8,
d07ddd77 1075 m,
eb160463 1076 (char **)0, ln, (bool)UTF))
80aecb99
JH
1077 && (norun || regtry(prog, s)) )
1078 goto got_it;
1079 }
1aa99e6b
IH
1080 s += len;
1081 }
09091399
JH
1082 }
1083 else {
1aa99e6b 1084 while (s <= e) {
89ebb4a3 1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1086 uniflags);
80aecb99 1087
60a8b682 1088 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
880bd946
JH
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1098
1099 if ( (c == c1 || c == c2)
1100 && (ln == len ||
66423254 1101 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1102 m, (char **)0, ln, (bool)UTF))
55da9344 1103 && (norun || regtry(prog, s)) )
1aa99e6b 1104 goto got_it;
80aecb99 1105 else {
1df70142 1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1107 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1109 if ( f != c
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
a6872d42 1112 !ibcmp_utf8((char *) foldbuf,
66423254 1113 (char **)0, foldlen, do_utf8,
d07ddd77 1114 m,
eb160463 1115 (char **)0, ln, (bool)UTF))
80aecb99
JH
1116 && (norun || regtry(prog, s)) )
1117 goto got_it;
1118 }
1aa99e6b
IH
1119 s += len;
1120 }
09091399 1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 if (c1 == c2)
1125 while (s <= e) {
1126 if ( *(U8*)s == c1
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
1130 && (norun || regtry(prog, s)) )
1131 goto got_it;
1132 s++;
1133 }
1134 else
1135 while (s <= e) {
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1138 ? ibcmp(s, m, ln)
1139 : ibcmp_locale(s, m, ln)))
1140 && (norun || regtry(prog, s)) )
1141 goto got_it;
1142 s++;
1143 }
b3c9acc1
IZ
1144 }
1145 break;
bbce6d69 1146 case BOUNDL:
3280af22 1147 PL_reg_flags |= RF_tainted;
bbce6d69 1148 /* FALL THROUGH */
a0d0e21e 1149 case BOUND:
ffc61ed2 1150 if (do_utf8) {
12d33761 1151 if (s == PL_bostr)
ffc61ed2
JH
1152 tmp = '\n';
1153 else {
6136c704 1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1156 }
1157 tmp = ((OP(c) == BOUND ?
9041c2e3 1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1159 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1161 if (tmp == !(OP(c) == BOUND ?
3568d838 1162 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1163 isALNUM_LC_utf8((U8*)s)))
1164 {
1165 tmp = !tmp;
1166 if ((norun || regtry(prog, s)))
1167 goto got_it;
1168 }
078c425b 1169 s += uskip;
a687059c 1170 }
a0d0e21e 1171 }
667bb95a 1172 else {
12d33761 1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1176 if (tmp ==
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1178 tmp = !tmp;
1179 if ((norun || regtry(prog, s)))
1180 goto got_it;
1181 }
1182 s++;
a0ed51b3 1183 }
a0ed51b3 1184 }
6eb5f6b9 1185 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1186 goto got_it;
1187 break;
bbce6d69 1188 case NBOUNDL:
3280af22 1189 PL_reg_flags |= RF_tainted;
bbce6d69 1190 /* FALL THROUGH */
a0d0e21e 1191 case NBOUND:
ffc61ed2 1192 if (do_utf8) {
12d33761 1193 if (s == PL_bostr)
ffc61ed2
JH
1194 tmp = '\n';
1195 else {
6136c704 1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1198 }
1199 tmp = ((OP(c) == NBOUND ?
9041c2e3 1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1201 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1203 if (tmp == !(OP(c) == NBOUND ?
3568d838 1204 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1205 isALNUM_LC_utf8((U8*)s)))
1206 tmp = !tmp;
1207 else if ((norun || regtry(prog, s)))
1208 goto got_it;
078c425b 1209 s += uskip;
ffc61ed2 1210 }
a0d0e21e 1211 }
667bb95a 1212 else {
12d33761 1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1217 if (tmp ==
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1219 tmp = !tmp;
1220 else if ((norun || regtry(prog, s)))
1221 goto got_it;
1222 s++;
1223 }
a0ed51b3 1224 }
6eb5f6b9 1225 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1226 goto got_it;
1227 break;
a0d0e21e 1228 case ALNUM:
ffc61ed2 1229 if (do_utf8) {
1a4fad37 1230 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1233 if (tmp && (norun || regtry(prog, s)))
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
bbce6d69 1238 else
ffc61ed2 1239 tmp = 1;
078c425b 1240 s += uskip;
bbce6d69 1241 }
bbce6d69 1242 }
ffc61ed2
JH
1243 else {
1244 while (s < strend) {
1245 if (isALNUM(*s)) {
1246 if (tmp && (norun || regtry(prog, s)))
1247 goto got_it;
1248 else
1249 tmp = doevery;
1250 }
a0ed51b3 1251 else
ffc61ed2
JH
1252 tmp = 1;
1253 s++;
a0ed51b3 1254 }
a0ed51b3
LW
1255 }
1256 break;
bbce6d69 1257 case ALNUML:
3280af22 1258 PL_reg_flags |= RF_tainted;
ffc61ed2 1259 if (do_utf8) {
078c425b 1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1261 if (isALNUM_LC_utf8((U8*)s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a687059c 1267 else
ffc61ed2 1268 tmp = 1;
078c425b 1269 s += uskip;
a0d0e21e 1270 }
a0d0e21e 1271 }
ffc61ed2
JH
1272 else {
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
1275 if (tmp && (norun || regtry(prog, s)))
1276 goto got_it;
1277 else
1278 tmp = doevery;
1279 }
a0ed51b3 1280 else
ffc61ed2
JH
1281 tmp = 1;
1282 s++;
a0ed51b3 1283 }
a0ed51b3
LW
1284 }
1285 break;
a0d0e21e 1286 case NALNUM:
ffc61ed2 1287 if (do_utf8) {
1a4fad37 1288 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1291 if (tmp && (norun || regtry(prog, s)))
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
bbce6d69 1296 else
ffc61ed2 1297 tmp = 1;
078c425b 1298 s += uskip;
bbce6d69 1299 }
bbce6d69 1300 }
ffc61ed2
JH
1301 else {
1302 while (s < strend) {
1303 if (!isALNUM(*s)) {
1304 if (tmp && (norun || regtry(prog, s)))
1305 goto got_it;
1306 else
1307 tmp = doevery;
1308 }
a0ed51b3 1309 else
ffc61ed2
JH
1310 tmp = 1;
1311 s++;
a0ed51b3 1312 }
a0ed51b3
LW
1313 }
1314 break;
bbce6d69 1315 case NALNUML:
3280af22 1316 PL_reg_flags |= RF_tainted;
ffc61ed2 1317 if (do_utf8) {
078c425b 1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1319 if (!isALNUM_LC_utf8((U8*)s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a687059c 1325 else
ffc61ed2 1326 tmp = 1;
078c425b 1327 s += uskip;
a687059c 1328 }
a0d0e21e 1329 }
ffc61ed2
JH
1330 else {
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
1333 if (tmp && (norun || regtry(prog, s)))
1334 goto got_it;
1335 else
1336 tmp = doevery;
1337 }
a0ed51b3 1338 else
ffc61ed2
JH
1339 tmp = 1;
1340 s++;
a0ed51b3 1341 }
a0ed51b3
LW
1342 }
1343 break;
a0d0e21e 1344 case SPACE:
ffc61ed2 1345 if (do_utf8) {
1a4fad37 1346 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1349 if (tmp && (norun || regtry(prog, s)))
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0d0e21e 1354 else
ffc61ed2 1355 tmp = 1;
078c425b 1356 s += uskip;
2304df62 1357 }
a0d0e21e 1358 }
ffc61ed2
JH
1359 else {
1360 while (s < strend) {
1361 if (isSPACE(*s)) {
1362 if (tmp && (norun || regtry(prog, s)))
1363 goto got_it;
1364 else
1365 tmp = doevery;
1366 }
a0ed51b3 1367 else
ffc61ed2
JH
1368 tmp = 1;
1369 s++;
a0ed51b3 1370 }
a0ed51b3
LW
1371 }
1372 break;
bbce6d69 1373 case SPACEL:
3280af22 1374 PL_reg_flags |= RF_tainted;
ffc61ed2 1375 if (do_utf8) {
078c425b 1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1378 if (tmp && (norun || regtry(prog, s)))
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
bbce6d69 1383 else
ffc61ed2 1384 tmp = 1;
078c425b 1385 s += uskip;
bbce6d69 1386 }
bbce6d69 1387 }
ffc61ed2
JH
1388 else {
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
1391 if (tmp && (norun || regtry(prog, s)))
1392 goto got_it;
1393 else
1394 tmp = doevery;
1395 }
a0ed51b3 1396 else
ffc61ed2
JH
1397 tmp = 1;
1398 s++;
a0ed51b3 1399 }
a0ed51b3
LW
1400 }
1401 break;
a0d0e21e 1402 case NSPACE:
ffc61ed2 1403 if (do_utf8) {
1a4fad37 1404 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1407 if (tmp && (norun || regtry(prog, s)))
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0d0e21e 1412 else
ffc61ed2 1413 tmp = 1;
078c425b 1414 s += uskip;
a687059c 1415 }
a0d0e21e 1416 }
ffc61ed2
JH
1417 else {
1418 while (s < strend) {
1419 if (!isSPACE(*s)) {
1420 if (tmp && (norun || regtry(prog, s)))
1421 goto got_it;
1422 else
1423 tmp = doevery;
1424 }
a0ed51b3 1425 else
ffc61ed2
JH
1426 tmp = 1;
1427 s++;
a0ed51b3 1428 }
a0ed51b3
LW
1429 }
1430 break;
bbce6d69 1431 case NSPACEL:
3280af22 1432 PL_reg_flags |= RF_tainted;
ffc61ed2 1433 if (do_utf8) {
078c425b 1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1436 if (tmp && (norun || regtry(prog, s)))
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
bbce6d69 1441 else
ffc61ed2 1442 tmp = 1;
078c425b 1443 s += uskip;
bbce6d69 1444 }
bbce6d69 1445 }
ffc61ed2
JH
1446 else {
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
1449 if (tmp && (norun || regtry(prog, s)))
1450 goto got_it;
1451 else
1452 tmp = doevery;
1453 }
a0ed51b3 1454 else
ffc61ed2
JH
1455 tmp = 1;
1456 s++;
a0ed51b3 1457 }
a0ed51b3
LW
1458 }
1459 break;
a0d0e21e 1460 case DIGIT:
ffc61ed2 1461 if (do_utf8) {
1a4fad37 1462 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1465 if (tmp && (norun || regtry(prog, s)))
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0d0e21e 1470 else
ffc61ed2 1471 tmp = 1;
078c425b 1472 s += uskip;
2b69d0c2 1473 }
a0d0e21e 1474 }
ffc61ed2
JH
1475 else {
1476 while (s < strend) {
1477 if (isDIGIT(*s)) {
1478 if (tmp && (norun || regtry(prog, s)))
1479 goto got_it;
1480 else
1481 tmp = doevery;
1482 }
a0ed51b3 1483 else
ffc61ed2
JH
1484 tmp = 1;
1485 s++;
a0ed51b3 1486 }
a0ed51b3
LW
1487 }
1488 break;
b8c5462f
JH
1489 case DIGITL:
1490 PL_reg_flags |= RF_tainted;
ffc61ed2 1491 if (do_utf8) {
078c425b 1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1493 if (isDIGIT_LC_utf8((U8*)s)) {
1494 if (tmp && (norun || regtry(prog, s)))
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2 1500 tmp = 1;
078c425b 1501 s += uskip;
b8c5462f 1502 }
b8c5462f 1503 }
ffc61ed2
JH
1504 else {
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
1507 if (tmp && (norun || regtry(prog, s)))
1508 goto got_it;
1509 else
1510 tmp = doevery;
1511 }
b8c5462f 1512 else
ffc61ed2
JH
1513 tmp = 1;
1514 s++;
b8c5462f 1515 }
b8c5462f
JH
1516 }
1517 break;
a0d0e21e 1518 case NDIGIT:
ffc61ed2 1519 if (do_utf8) {
1a4fad37 1520 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1523 if (tmp && (norun || regtry(prog, s)))
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0d0e21e 1528 else
ffc61ed2 1529 tmp = 1;
078c425b 1530 s += uskip;
a687059c 1531 }
a0d0e21e 1532 }
ffc61ed2
JH
1533 else {
1534 while (s < strend) {
1535 if (!isDIGIT(*s)) {
1536 if (tmp && (norun || regtry(prog, s)))
1537 goto got_it;
1538 else
1539 tmp = doevery;
1540 }
a0ed51b3 1541 else
ffc61ed2
JH
1542 tmp = 1;
1543 s++;
a0ed51b3 1544 }
a0ed51b3
LW
1545 }
1546 break;
b8c5462f
JH
1547 case NDIGITL:
1548 PL_reg_flags |= RF_tainted;
ffc61ed2 1549 if (do_utf8) {
078c425b 1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1551 if (!isDIGIT_LC_utf8((U8*)s)) {
1552 if (tmp && (norun || regtry(prog, s)))
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
b8c5462f 1557 else
ffc61ed2 1558 tmp = 1;
078c425b 1559 s += uskip;
b8c5462f 1560 }
a0ed51b3 1561 }
ffc61ed2
JH
1562 else {
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
1565 if (tmp && (norun || regtry(prog, s)))
1566 goto got_it;
1567 else
1568 tmp = doevery;
1569 }
cf93c79d 1570 else
ffc61ed2
JH
1571 tmp = 1;
1572 s++;
b8c5462f 1573 }
b8c5462f
JH
1574 }
1575 break;
b3c9acc1 1576 default:
3c3eec57
GS
1577 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1578 break;
d6a28714 1579 }
6eb5f6b9
JH
1580 return 0;
1581 got_it:
1582 return s;
1583}
1584
1585/*
1586 - regexec_flags - match a regexp against a string
1587 */
1588I32
1589Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1590 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1591/* strend: pointer to null at end of string */
1592/* strbeg: real beginning of string */
1593/* minend: end of match must be >=minend after stringarg. */
1594/* data: May be used for some additional optimizations. */
1595/* nosave: For optimizations. */
1596{
97aff369 1597 dVAR;
6eb5f6b9
JH
1598 register char *s;
1599 register regnode *c;
1600 register char *startpos = stringarg;
6eb5f6b9
JH
1601 I32 minlen; /* must match at least this many chars */
1602 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1603 I32 end_shift = 0; /* Same for the end. */ /* CC */
1604 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1605 char *scream_olds = NULL;
6eb5f6b9 1606 SV* oreplsv = GvSV(PL_replgv);
1df70142 1607 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1608 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1609#ifdef DEBUGGING
6136c704
AL
1610 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1611 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1612#endif
a3621e74
YO
1613
1614 GET_RE_DEBUG_FLAGS_DECL;
1615
9d4ba2ae 1616 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1617
1618 /* Be paranoid... */
1619 if (prog == NULL || startpos == NULL) {
1620 Perl_croak(aTHX_ "NULL regexp parameter");
1621 return 0;
1622 }
1623
bac06658
JH
1624#ifdef DEBUGGING
1625 PL_regnarrate = DEBUG_r_TEST;
1626#endif
1627
1628 RX_MATCH_UTF8_set(prog, do_utf8);
1629
6eb5f6b9 1630 minlen = prog->minlen;
61a36c01 1631 if (strend - startpos < minlen) {
a3621e74 1632 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1633 "String too short [regexec_flags]...\n"));
1634 goto phooey;
1aa99e6b 1635 }
6eb5f6b9 1636
6eb5f6b9
JH
1637 /* Check validity of program. */
1638 if (UCHARAT(prog->program) != REG_MAGIC) {
1639 Perl_croak(aTHX_ "corrupted regexp program");
1640 }
1641
1642 PL_reg_flags = 0;
1643 PL_reg_eval_set = 0;
1644 PL_reg_maxiter = 0;
1645
1646 if (prog->reganch & ROPT_UTF8)
1647 PL_reg_flags |= RF_utf8;
1648
1649 /* Mark beginning of line for ^ and lookbehind. */
1650 PL_regbol = startpos;
1651 PL_bostr = strbeg;
1652 PL_reg_sv = sv;
1653
1654 /* Mark end of line for $ (and such) */
1655 PL_regeol = strend;
1656
1657 /* see how far we have to get to not match where we matched before */
1658 PL_regtill = startpos+minend;
1659
6eb5f6b9
JH
1660 /* If there is a "must appear" string, look for it. */
1661 s = startpos;
1662
1663 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1664 MAGIC *mg;
1665
1666 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1667 PL_reg_ganch = startpos;
1668 else if (sv && SvTYPE(sv) >= SVt_PVMG
1669 && SvMAGIC(sv)
14befaf4
DM
1670 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1671 && mg->mg_len >= 0) {
6eb5f6b9
JH
1672 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1673 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1674 if (s > PL_reg_ganch)
6eb5f6b9
JH
1675 goto phooey;
1676 s = PL_reg_ganch;
1677 }
1678 }
1679 else /* pos() not defined */
1680 PL_reg_ganch = strbeg;
1681 }
1682
a0714e2c 1683 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1684 re_scream_pos_data d;
1685
1686 d.scream_olds = &scream_olds;
1687 d.scream_pos = &scream_pos;
1688 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1689 if (!s) {
a3621e74 1690 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1691 goto phooey; /* not present */
3fa9c3d7 1692 }
6eb5f6b9
JH
1693 }
1694
a3621e74 1695 DEBUG_EXECUTE_r({
1df70142
AL
1696 const char * const s0 = UTF
1697 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1698 UNI_DISPLAY_REGEX)
1699 : prog->precomp;
1700 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1701 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1702 UNI_DISPLAY_REGEX) : startpos;
1df70142 1703 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1704 if (!PL_colorset)
1705 reginitcolors();
1706 PerlIO_printf(Perl_debug_log,
a0288114 1707 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1708 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1709 len0, len0, s0,
2a782b5b 1710 PL_colors[1],
9e55ce06 1711 len0 > 60 ? "..." : "",
2a782b5b 1712 PL_colors[0],
9e55ce06
JH
1713 (int)(len1 > 60 ? 60 : len1),
1714 s1, PL_colors[1],
1715 (len1 > 60 ? "..." : "")
2a782b5b
JH
1716 );
1717 });
6eb5f6b9
JH
1718
1719 /* Simplest case: anchored match need be tried only once. */
1720 /* [unless only anchor is BOL and multiline is set] */
1721 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1722 if (s == startpos && regtry(prog, startpos))
1723 goto got_it;
7fba1cd6 1724 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1725 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1726 {
1727 char *end;
1728
1729 if (minlen)
1730 dontbother = minlen - 1;
1aa99e6b 1731 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1732 /* for multiline we only have to try after newlines */
33b8afdf 1733 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1734 if (s == startpos)
1735 goto after_try;
1736 while (1) {
1737 if (regtry(prog, s))
1738 goto got_it;
1739 after_try:
1740 if (s >= end)
1741 goto phooey;
1742 if (prog->reganch & RE_USE_INTUIT) {
1743 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1744 if (!s)
1745 goto phooey;
1746 }
1747 else
1748 s++;
1749 }
1750 } else {
1751 if (s > startpos)
1752 s--;
1753 while (s < end) {
1754 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1755 if (regtry(prog, s))
1756 goto got_it;
1757 }
1758 }
1759 }
1760 }
1761 goto phooey;
1762 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1763 if (regtry(prog, PL_reg_ganch))
1764 goto got_it;
1765 goto phooey;
1766 }
1767
1768 /* Messy cases: unanchored match. */
33b8afdf 1769 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1770 /* we have /x+whatever/ */
1771 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1772 char ch;
bf93d4cc
GS
1773#ifdef DEBUGGING
1774 int did_match = 0;
1775#endif
33b8afdf
JH
1776 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1777 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1778 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1779
1aa99e6b 1780 if (do_utf8) {
6eb5f6b9
JH
1781 while (s < strend) {
1782 if (*s == ch) {
a3621e74 1783 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1784 if (regtry(prog, s)) goto got_it;
1785 s += UTF8SKIP(s);
1786 while (s < strend && *s == ch)
1787 s += UTF8SKIP(s);
1788 }
1789 s += UTF8SKIP(s);
1790 }
1791 }
1792 else {
1793 while (s < strend) {
1794 if (*s == ch) {
a3621e74 1795 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1796 if (regtry(prog, s)) goto got_it;
1797 s++;
1798 while (s < strend && *s == ch)
1799 s++;
1800 }
1801 s++;
1802 }
1803 }
a3621e74 1804 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1805 PerlIO_printf(Perl_debug_log,
b7953727
JH
1806 "Did not find anchored character...\n")
1807 );
6eb5f6b9 1808 }
a0714e2c
SS
1809 else if (prog->anchored_substr != NULL
1810 || prog->anchored_utf8 != NULL
1811 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1812 && prog->float_max_offset < strend - s)) {
1813 SV *must;
1814 I32 back_max;
1815 I32 back_min;
1816 char *last;
6eb5f6b9 1817 char *last1; /* Last position checked before */
bf93d4cc
GS
1818#ifdef DEBUGGING
1819 int did_match = 0;
1820#endif
33b8afdf
JH
1821 if (prog->anchored_substr || prog->anchored_utf8) {
1822 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1823 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1824 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1825 back_max = back_min = prog->anchored_offset;
1826 } else {
1827 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1828 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1829 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1830 back_max = prog->float_max_offset;
1831 back_min = prog->float_min_offset;
1832 }
1833 if (must == &PL_sv_undef)
1834 /* could not downgrade utf8 check substring, so must fail */
1835 goto phooey;
1836
1837 last = HOP3c(strend, /* Cannot start after this */
1838 -(I32)(CHR_SVLEN(must)
1839 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1840
1841 if (s > PL_bostr)
1842 last1 = HOPc(s, -1);
1843 else
1844 last1 = s - 1; /* bogus */
1845
a0288114 1846 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1847 check_substr==must. */
1848 scream_pos = -1;
1849 dontbother = end_shift;
1850 strend = HOPc(strend, -dontbother);
1851 while ( (s <= last) &&
9041c2e3 1852 ((flags & REXEC_SCREAM)
1aa99e6b 1853 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1854 end_shift, &scream_pos, 0))
1aa99e6b 1855 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1856 (unsigned char*)strend, must,
7fba1cd6 1857 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1858 /* we may be pointing at the wrong string */
1859 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1860 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1861 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1862 if (HOPc(s, -back_max) > last1) {
1863 last1 = HOPc(s, -back_min);
1864 s = HOPc(s, -back_max);
1865 }
1866 else {
52657f30 1867 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1868
1869 last1 = HOPc(s, -back_min);
52657f30 1870 s = t;
6eb5f6b9 1871 }
1aa99e6b 1872 if (do_utf8) {
6eb5f6b9
JH
1873 while (s <= last1) {
1874 if (regtry(prog, s))
1875 goto got_it;
1876 s += UTF8SKIP(s);
1877 }
1878 }
1879 else {
1880 while (s <= last1) {
1881 if (regtry(prog, s))
1882 goto got_it;
1883 s++;
1884 }
1885 }
1886 }
a3621e74 1887 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1888 PerlIO_printf(Perl_debug_log,
a0288114 1889 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1890 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1891 ? "anchored" : "floating"),
1892 PL_colors[0],
1893 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1894 SvPVX_const(must),
b7953727
JH
1895 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1896 );
6eb5f6b9
JH
1897 goto phooey;
1898 }
155aba94 1899 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1900 if (minlen) {
1901 I32 op = (U8)OP(prog->regstclass);
66e933ab 1902 /* don't bother with what can't match */
f14c76ed
RGS
1903 if (PL_regkind[op] != EXACT && op != CANY)
1904 strend = HOPc(strend, -(minlen - 1));
1905 }
a3621e74 1906 DEBUG_EXECUTE_r({
ffc61ed2 1907 SV *prop = sv_newmortal();
cfd0369c
NC
1908 const char *s0;
1909 const char *s1;
9e55ce06
JH
1910 int len0;
1911 int len1;
1912
32fc9b6a 1913 regprop(prog, prop, c);
9e55ce06 1914 s0 = UTF ?
3f7c398e 1915 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1916 UNI_DISPLAY_REGEX) :
cfd0369c 1917 SvPVX_const(prop);
9e55ce06
JH
1918 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1919 s1 = UTF ?
c728cb41 1920 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1921 len1 = UTF ? SvCUR(dsv1) : strend - s;
1922 PerlIO_printf(Perl_debug_log,
a0288114 1923 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1924 len0, len0, s0,
1925 len1, len1, s1);
ffc61ed2 1926 });
06b5626a 1927 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1928 goto got_it;
a3621e74 1929 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1930 }
1931 else {
1932 dontbother = 0;
a0714e2c 1933 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1934 /* Trim the end. */
d6a28714 1935 char *last;
33b8afdf
JH
1936 SV* float_real;
1937
1938 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1939 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1940 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1941
1942 if (flags & REXEC_SCREAM) {
33b8afdf 1943 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1944 end_shift, &scream_pos, 1); /* last one */
1945 if (!last)
ffc61ed2 1946 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1947 /* we may be pointing at the wrong string */
1948 else if (RX_MATCH_COPIED(prog))
3f7c398e 1949 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1950 }
d6a28714
JH
1951 else {
1952 STRLEN len;
cfd0369c 1953 const char * const little = SvPV_const(float_real, len);
d6a28714 1954
33b8afdf 1955 if (SvTAIL(float_real)) {
d6a28714
JH
1956 if (memEQ(strend - len + 1, little, len - 1))
1957 last = strend - len + 1;
7fba1cd6 1958 else if (!multiline)
9041c2e3 1959 last = memEQ(strend - len, little, len)
bd61b366 1960 ? strend - len : NULL;
b8c5462f 1961 else
d6a28714
JH
1962 goto find_last;
1963 } else {
1964 find_last:
9041c2e3 1965 if (len)
d6a28714 1966 last = rninstr(s, strend, little, little + len);
b8c5462f 1967 else
a0288114 1968 last = strend; /* matching "$" */
b8c5462f 1969 }
b8c5462f 1970 }
bf93d4cc 1971 if (last == NULL) {
a3621e74 1972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1973 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1974 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1975 goto phooey; /* Should not happen! */
1976 }
d6a28714
JH
1977 dontbother = strend - last + prog->float_min_offset;
1978 }
1979 if (minlen && (dontbother < minlen))
1980 dontbother = minlen - 1;
1981 strend -= dontbother; /* this one's always in bytes! */
1982 /* We don't know much -- general case. */
1aa99e6b 1983 if (do_utf8) {
d6a28714
JH
1984 for (;;) {
1985 if (regtry(prog, s))
1986 goto got_it;
1987 if (s >= strend)
1988 break;
b8c5462f 1989 s += UTF8SKIP(s);
d6a28714
JH
1990 };
1991 }
1992 else {
1993 do {
1994 if (regtry(prog, s))
1995 goto got_it;
1996 } while (s++ < strend);
1997 }
1998 }
1999
2000 /* Failure. */
2001 goto phooey;
2002
2003got_it:
2004 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2005
2006 if (PL_reg_eval_set) {
2007 /* Preserve the current value of $^R */
2008 if (oreplsv != GvSV(PL_replgv))
2009 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2010 restored, the value remains
2011 the same. */
4f639d21 2012 restore_pos(aTHX_ prog);
d6a28714
JH
2013 }
2014
2015 /* make sure $`, $&, $', and $digit will work later */
2016 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2017 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2018 if (flags & REXEC_COPY_STR) {
2019 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2020#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2021 if ((SvIsCOW(sv)
2022 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2023 if (DEBUG_C_TEST) {
2024 PerlIO_printf(Perl_debug_log,
2025 "Copy on write: regexp capture, type %d\n",
2026 (int) SvTYPE(sv));
2027 }
2028 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2029 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2030 assert (SvPOKp(prog->saved_copy));
2031 } else
2032#endif
2033 {
2034 RX_MATCH_COPIED_on(prog);
2035 s = savepvn(strbeg, i);
2036 prog->subbeg = s;
2037 }
d6a28714 2038 prog->sublen = i;
d6a28714
JH
2039 }
2040 else {
2041 prog->subbeg = strbeg;
2042 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2043 }
2044 }
9041c2e3 2045
d6a28714
JH
2046 return 1;
2047
2048phooey:
a3621e74 2049 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2050 PL_colors[4], PL_colors[5]));
d6a28714 2051 if (PL_reg_eval_set)
4f639d21 2052 restore_pos(aTHX_ prog);
d6a28714
JH
2053 return 0;
2054}
2055
2056/*
2057 - regtry - try match at specific point
2058 */
2059STATIC I32 /* 0 failure, 1 success */
2060S_regtry(pTHX_ regexp *prog, char *startpos)
2061{
97aff369 2062 dVAR;
d6a28714
JH
2063 register I32 *sp;
2064 register I32 *ep;
2065 CHECKPOINT lastcp;
a3621e74 2066 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2067
02db2b7b
IZ
2068#ifdef DEBUGGING
2069 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2070#endif
d6a28714
JH
2071 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2072 MAGIC *mg;
2073
2074 PL_reg_eval_set = RS_init;
a3621e74 2075 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2076 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2077 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2078 ));
e8347627 2079 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2080 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2081 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2082 SAVETMPS;
2083 /* Apparently this is not needed, judging by wantarray. */
e8347627 2084 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2085 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2086
2087 if (PL_reg_sv) {
2088 /* Make $_ available to executed code. */
2089 if (PL_reg_sv != DEFSV) {
59f00321 2090 SAVE_DEFSV;
d6a28714 2091 DEFSV = PL_reg_sv;
b8c5462f 2092 }
d6a28714 2093
9041c2e3 2094 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2095 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2096 /* prepare for quick setting of pos */
d300d9fa
NC
2097#ifdef PERL_OLD_COPY_ON_WRITE
2098 if (SvIsCOW(sv))
2099 sv_force_normal_flags(sv, 0);
2100#endif
2101 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
2102 &PL_vtbl_mglob, NULL, 0);
d6a28714 2103 mg->mg_len = -1;
b8c5462f 2104 }
d6a28714
JH
2105 PL_reg_magic = mg;
2106 PL_reg_oldpos = mg->mg_len;
4f639d21 2107 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2108 }
09687e5a 2109 if (!PL_reg_curpm) {
a02a5408 2110 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2111#ifdef USE_ITHREADS
2112 {
2113 SV* repointer = newSViv(0);
577e12cc 2114 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2115 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2116 av_push(PL_regex_padav,repointer);
2117 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2118 PL_regex_pad = AvARRAY(PL_regex_padav);
2119 }
2120#endif
2121 }
aaa362c4 2122 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2123 PL_reg_oldcurpm = PL_curpm;
2124 PL_curpm = PL_reg_curpm;
2125 if (RX_MATCH_COPIED(prog)) {
2126 /* Here is a serious problem: we cannot rewrite subbeg,
2127 since it may be needed if this match fails. Thus
2128 $` inside (?{}) could fail... */
2129 PL_reg_oldsaved = prog->subbeg;
2130 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2131#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2132 PL_nrs = prog->saved_copy;
2133#endif
d6a28714
JH
2134 RX_MATCH_COPIED_off(prog);
2135 }
2136 else
bd61b366 2137 PL_reg_oldsaved = NULL;
d6a28714
JH
2138 prog->subbeg = PL_bostr;
2139 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2140 }
973dddac 2141 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2142 PL_reginput = startpos;
2143 PL_regstartp = prog->startp;
2144 PL_regendp = prog->endp;
2145 PL_reglastparen = &prog->lastparen;
a01268b5 2146 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2147 prog->lastparen = 0;
03994de8 2148 prog->lastcloseparen = 0;
d6a28714 2149 PL_regsize = 0;
a3621e74 2150 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2151 if (PL_reg_start_tmpl <= prog->nparens) {
2152 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2153 if(PL_reg_start_tmp)
2154 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2155 else
a02a5408 2156 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2157 }
2158
2159 /* XXXX What this code is doing here?!!! There should be no need
2160 to do this again and again, PL_reglastparen should take care of
3dd2943c 2161 this! --ilya*/
dafc8851
JH
2162
2163 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2164 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2165 * PL_reglastparen), is not needed at all by the test suite
2166 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2167 * enough, for building DynaLoader, or otherwise this
2168 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2169 * will happen. Meanwhile, this code *is* needed for the
2170 * above-mentioned test suite tests to succeed. The common theme
2171 * on those tests seems to be returning null fields from matches.
2172 * --jhi */
dafc8851 2173#if 1
d6a28714
JH
2174 sp = prog->startp;
2175 ep = prog->endp;
2176 if (prog->nparens) {
097eb12c 2177 register I32 i;
eb160463 2178 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2179 *++sp = -1;
2180 *++ep = -1;
2181 }
2182 }
dafc8851 2183#endif
02db2b7b 2184 REGCP_SET(lastcp);
4f639d21 2185 if (regmatch(prog, prog->program + 1)) {
d6a28714
JH
2186 prog->endp[0] = PL_reginput - PL_bostr;
2187 return 1;
2188 }
02db2b7b 2189 REGCP_UNWIND(lastcp);
d6a28714
JH
2190 return 0;
2191}
2192
02db2b7b
IZ
2193#define RE_UNWIND_BRANCH 1
2194#define RE_UNWIND_BRANCHJ 2
2195
2196union re_unwind_t;
2197
2198typedef struct { /* XX: makes sense to enlarge it... */
2199 I32 type;
2200 I32 prev;
2201 CHECKPOINT lastcp;
2202} re_unwind_generic_t;
2203
2204typedef struct {
2205 I32 type;
2206 I32 prev;
2207 CHECKPOINT lastcp;
2208 I32 lastparen;
2209 regnode *next;
2210 char *locinput;
2211 I32 nextchr;
3a2830be 2212 int minmod;
02db2b7b
IZ
2213#ifdef DEBUGGING
2214 int regindent;
2215#endif
2216} re_unwind_branch_t;
2217
2218typedef union re_unwind_t {
2219 I32 type;
2220 re_unwind_generic_t generic;
2221 re_unwind_branch_t branch;
2222} re_unwind_t;
2223
8ba1375e
MJD
2224#define sayYES goto yes
2225#define sayNO goto no
e0f9d4a8 2226#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2227#define sayYES_FINAL goto yes_final
2228#define sayYES_LOUD goto yes_loud
2229#define sayNO_FINAL goto no_final
2230#define sayNO_SILENT goto do_no
2231#define saySAME(x) if (x) goto yes; else goto no
2232
3ab3c9b4
HS
2233#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2234#define POSCACHE_SEEN 1 /* we know what we're caching */
2235#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2236#define CACHEsayYES STMT_START { \
d8319b27 2237 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
3ab3c9b4
HS
2238 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2239 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2240 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2241 /* cache records failure, but this is success */ \
2242 DEBUG_r( \
2243 PerlIO_printf(Perl_debug_log, \
2244 "%*s (remove success from failure cache)\n", \
2245 REPORT_CODE_OFF+PL_regindent*2, "") \
2246 ); \
d8319b27 2247 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2248 } \
2249 } \
2250 sayYES; \
2251} STMT_END
2252#define CACHEsayNO STMT_START { \
d8319b27 2253 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
3ab3c9b4
HS
2254 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2255 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2256 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2257 /* cache records success, but this is failure */ \
2258 DEBUG_r( \
2259 PerlIO_printf(Perl_debug_log, \
2260 "%*s (remove failure from success cache)\n", \
2261 REPORT_CODE_OFF+PL_regindent*2, "") \
2262 ); \
d8319b27 2263 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2264 } \
2265 } \
2266 sayNO; \
2267} STMT_END
2268
a3621e74
YO
2269/* this is used to determine how far from the left messages like
2270 'failed...' are printed. Currently 29 makes these messages line
2271 up with the opcode they refer to. Earlier perls used 25 which
2272 left these messages outdented making reviewing a debug output
2273 quite difficult.
2274*/
2275#define REPORT_CODE_OFF 29
2276
2277
2278/* Make sure there is a test for this +1 options in re_tests */
2279#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2280
5d9a96ca
DM
2281/* grab a new slab and return the first slot in it */
2282
2283STATIC regmatch_state *
2284S_push_slab(pTHX)
2285{
2286 regmatch_slab *s = PL_regmatch_slab->next;
2287 if (!s) {
2288 Newx(s, 1, regmatch_slab);
2289 s->prev = PL_regmatch_slab;
2290 s->next = NULL;
2291 PL_regmatch_slab->next = s;
2292 }
2293 PL_regmatch_slab = s;
2294 return &s->states[0];
2295}
5b47454d 2296
95b24440
DM
2297/* simulate a recursive call to regmatch */
2298
2299#define REGMATCH(ns, where) \
5d9a96ca
DM
2300 st->scan = scan; \
2301 scan = (ns); \
2302 st->resume_state = resume_##where; \
95b24440
DM
2303 goto start_recurse; \
2304 resume_point_##where:
2305
aa283a38
DM
2306
2307/* push a new regex state. Set newst to point to it */
2308
2309#define PUSH_STATE(newst, resume) \
2310 depth++; \
2311 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2312 st->scan = scan; \
2313 st->next = next; \
2314 st->n = n; \
2315 st->locinput = locinput; \
2316 st->resume_state = resume; \
2317 newst = st+1; \
2318 if (newst > &(PL_regmatch_slab->states[PERL_REGMATCH_SLAB_SLOTS-1])) \
2319 newst = S_push_slab(aTHX); \
2320 PL_regmatch_state = newst; \
2321 newst->cc = 0; \
2322 newst->minmod = 0; \
2323 newst->sw = 0; \
2324 newst->logical = 0; \
2325 newst->unwind = 0; \
2326 locinput = PL_reginput; \
2327 nextchr = UCHARAT(locinput);
2328
2329#define POP_STATE \
2330 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2331 depth--; \
2332 st--; \
2333 if (st < &PL_regmatch_slab->states[0]) { \
2334 PL_regmatch_slab = PL_regmatch_slab->prev; \
2335 st = &PL_regmatch_slab->states[PERL_REGMATCH_SLAB_SLOTS-1]; \
2336 } \
2337 PL_regmatch_state = st; \
2338 scan = st->scan; \
2339 next = st->next; \
2340 n = st->n; \
2341 locinput = st->locinput; \
2342 nextchr = UCHARAT(locinput);
2343
d6a28714
JH
2344/*
2345 - regmatch - main matching routine
2346 *
2347 * Conceptually the strategy is simple: check to see whether the current
2348 * node matches, call self recursively to see whether the rest matches,
2349 * and then act accordingly. In practice we make some effort to avoid
2350 * recursion, in particular by going through "ordinary" nodes (that don't
2351 * need to know whether the rest of the match failed) by a loop instead of
2352 * by recursion.
2353 */
2354/* [lwall] I've hoisted the register declarations to the outer block in order to
2355 * maybe save a little bit of pushing and popping on the stack. It also takes
2356 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2357 *
2358 * This function used to be heavily recursive, but since this had the
2359 * effect of blowing the CPU stack on complex regexes, it has been
2360 * restructured to be iterative, and to save state onto the heap rather
2361 * than the stack. Essentially whereever regmatch() used to be called, it
2362 * pushes the current state, notes where to return, then jumps back into
2363 * the main loop.
2364 *
2365 * Originally the structure of this function used to look something like
2366
2367 S_regmatch() {
2368 int a = 1, b = 2;
2369 ...
2370 while (scan != NULL) {
5d9a96ca 2371 a++; // do stuff with a and b
95b24440
DM
2372 ...
2373 switch (OP(scan)) {
2374 case FOO: {
2375 int local = 3;
2376 ...
2377 if (regmatch(...)) // recurse
2378 goto yes;
2379 }
2380 ...
2381 }
2382 }
2383 yes:
2384 return 1;
2385 }
2386
2387 * Now it looks something like this:
2388
5d9a96ca 2389 typedef struct {
95b24440
DM
2390 int a, b, local;
2391 int resume_state;
5d9a96ca 2392 } regmatch_state;
95b24440
DM
2393
2394 S_regmatch() {
5d9a96ca
DM
2395 regmatch_state *st = new();
2396 int depth=0;
2397 st->a++; // do stuff with a and b
95b24440
DM
2398 ...
2399 while (scan != NULL) {
2400 ...
2401 switch (OP(scan)) {
2402 case FOO: {
5d9a96ca 2403 st->local = 3;
95b24440 2404 ...
5d9a96ca
DM
2405 st->scan = scan;
2406 scan = ...;
2407 st->resume_state = resume_FOO;
2408 goto start_recurse; // recurse
95b24440 2409
5d9a96ca
DM
2410 resume_point_FOO:
2411 if (result)
95b24440
DM
2412 goto yes;
2413 }
2414 ...
2415 }
5d9a96ca
DM
2416 start_recurse:
2417 st = new(); push a new state
2418 st->a = 1; st->b = 2;
2419 depth++;
95b24440 2420 }
5d9a96ca 2421 yes:
95b24440 2422 result = 1;
5d9a96ca
DM
2423 if (depth--) {
2424 st = pop();
95b24440
DM
2425 switch (resume_state) {
2426 case resume_FOO:
2427 goto resume_point_FOO;
2428 ...
2429 }
2430 }
2431 return result
2432 }
2433
2434 * WARNING: this means that any line in this function that contains a
2435 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2436 * regmatch() using gotos instead. Thus the values of any local variables
2437 * not saved in the regmatch_state structure will have been lost when
2438 * execution resumes on the next line .
5d9a96ca
DM
2439 *
2440 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2441 * PL_regmatch_state always points to the currently active state, and
2442 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2443 * The first time regmatch is called, the first slab is allocated, and is
2444 * never freed until interpreter desctruction. When the slab is full,
2445 * a new one is allocated chained to the end. At exit from regmatch, slabs
2446 * allocated since entry are freed.
d6a28714 2447 */
95b24440
DM
2448
2449
d6a28714 2450STATIC I32 /* 0 failure, 1 success */
4f639d21 2451S_regmatch(pTHX_ regexp *rex, regnode *prog)
d6a28714 2452{
27da23d5 2453 dVAR;
95b24440 2454 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2455 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2456
5d9a96ca
DM
2457 regmatch_slab *orig_slab;
2458 regmatch_state *orig_state;
a3621e74 2459
5d9a96ca
DM
2460 /* the current state. This is a cached copy of PL_regmatch_state */
2461 register regmatch_state *st;
95b24440 2462
5d9a96ca
DM
2463 /* cache heavy used fields of st in registers */
2464 register regnode *scan;
2465 register regnode *next;
2466 register I32 n = 0; /* initialize to shut up compiler warning */
2467 register char *locinput = PL_reginput;
95b24440 2468
5d9a96ca
DM
2469 /* these variables are NOT saved during a recusive RFEGMATCH: */
2470 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2471 bool result; /* return value of S_regmatch */
2472 regnode *inner; /* Next node in internal branch. */
2473 int depth = 0; /* depth of recursion */
aa283a38
DM
2474 regmatch_state *newst; /* when pushing a state, this is the new one */
2475 regmatch_state *cur_eval = NULL; /* most recent (??{}) state */
95b24440
DM
2476
2477#ifdef DEBUGGING
ab74612d 2478 SV *re_debug_flags = NULL;
a3621e74 2479 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2480 PL_regindent++;
2481#endif
2482
5d9a96ca
DM
2483 /* on first ever call to regmatch, allocate first slab */
2484 if (!PL_regmatch_slab) {
2485 Newx(PL_regmatch_slab, 1, regmatch_slab);
2486 PL_regmatch_slab->prev = NULL;
2487 PL_regmatch_slab->next = NULL;
2488 PL_regmatch_state = &PL_regmatch_slab->states[0] - 1;
2489 }
2490
2491 /* remember current high-water mark for exit */
2492 /* XXX this should be done with SAVE* instead */
2493 orig_slab = PL_regmatch_slab;
2494 orig_state = PL_regmatch_state;
2495
2496 /* grab next free state slot */
2497 st = ++PL_regmatch_state;
2498 if (st > &(PL_regmatch_slab->states[PERL_REGMATCH_SLAB_SLOTS-1]))
2499 st = PL_regmatch_state = S_push_slab(aTHX);
2500
2501 st->minmod = 0;
2502 st->sw = 0;
2503 st->logical = 0;
2504 st->unwind = 0;
2505 st->cc = NULL;
d6a28714
JH
2506 /* Note that nextchr is a byte even in UTF */
2507 nextchr = UCHARAT(locinput);
2508 scan = prog;
2509 while (scan != NULL) {
8ba1375e 2510
a3621e74 2511 DEBUG_EXECUTE_r( {
6136c704 2512 SV * const prop = sv_newmortal();
1df70142
AL
2513 const int docolor = *PL_colors[0];
2514 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2515 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2516 /* The part of the string before starttry has one color
2517 (pref0_len chars), between starttry and current
2518 position another one (pref_len - pref0_len chars),
2519 after the current position the third one.
2520 We assume that pref0_len <= pref_len, otherwise we
2521 decrease pref0_len. */
9041c2e3 2522 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2523 ? (5 + taill) - l : locinput - PL_bostr;
2524 int pref0_len;
d6a28714 2525
df1ffd02 2526 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2527 pref_len++;
2528 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2529 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2530 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2531 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2532 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2533 l--;
d6a28714
JH
2534 if (pref0_len < 0)
2535 pref0_len = 0;
2536 if (pref0_len > pref_len)
2537 pref0_len = pref_len;
32fc9b6a 2538 regprop(rex, prop, scan);
2a782b5b 2539 {
1df70142 2540 const char * const s0 =
f14c76ed 2541 do_utf8 && OP(scan) != CANY ?
95b24440 2542 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2543 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2544 locinput - pref_len;
1df70142
AL
2545 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2546 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2547 pv_uni_display(PERL_DEBUG_PAD(1),
2548 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2549 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2550 locinput - pref_len + pref0_len;
1df70142
AL
2551 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2552 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2553 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2554 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2555 locinput;
1df70142 2556 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2557 PerlIO_printf(Perl_debug_log,
2558 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2559 (IV)(locinput - PL_bostr),
2560 PL_colors[4],
2561 len0, s0,
2562 PL_colors[5],
2563 PL_colors[2],
2564 len1, s1,
2565 PL_colors[3],
2566 (docolor ? "" : "> <"),
2567 PL_colors[0],
2568 len2, s2,
2569 PL_colors[1],
2570 15 - l - pref_len + 1,
2571 "",
4f639d21 2572 (IV)(scan - rex->program), PL_regindent*2, "",
3f7c398e 2573 SvPVX_const(prop));
2a782b5b
JH
2574 }
2575 });
d6a28714
JH
2576
2577 next = scan + NEXT_OFF(scan);
2578 if (next == scan)
2579 next = NULL;
2580
2581 switch (OP(scan)) {
2582 case BOL:
7fba1cd6 2583 if (locinput == PL_bostr)
d6a28714
JH
2584 {
2585 /* regtill = regbol; */
b8c5462f
JH
2586 break;
2587 }
d6a28714
JH
2588 sayNO;
2589 case MBOL:
12d33761
HS
2590 if (locinput == PL_bostr ||
2591 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2592 {
b8c5462f
JH
2593 break;
2594 }
d6a28714
JH
2595 sayNO;
2596 case SBOL:
c2a73568 2597 if (locinput == PL_bostr)
b8c5462f 2598 break;
d6a28714
JH
2599 sayNO;
2600 case GPOS:
2601 if (locinput == PL_reg_ganch)
2602 break;
2603 sayNO;
2604 case EOL:
d6a28714
JH
2605 goto seol;
2606 case MEOL:
d6a28714 2607 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2608 sayNO;
b8c5462f 2609 break;
d6a28714
JH
2610 case SEOL:
2611 seol:
2612 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2613 sayNO;
d6a28714 2614 if (PL_regeol - locinput > 1)
b8c5462f 2615 sayNO;
b8c5462f 2616 break;
d6a28714
JH
2617 case EOS:
2618 if (PL_regeol != locinput)
b8c5462f 2619 sayNO;
d6a28714 2620 break;
ffc61ed2 2621 case SANY:
d6a28714 2622 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2623 sayNO;
f33976b4
DB
2624 if (do_utf8) {
2625 locinput += PL_utf8skip[nextchr];
2626 if (locinput > PL_regeol)
2627 sayNO;
2628 nextchr = UCHARAT(locinput);
2629 }
2630 else
2631 nextchr = UCHARAT(++locinput);
2632 break;
2633 case CANY:
2634 if (!nextchr && locinput >= PL_regeol)
2635 sayNO;
b8c5462f 2636 nextchr = UCHARAT(++locinput);
a0d0e21e 2637 break;
ffc61ed2 2638 case REG_ANY:
1aa99e6b
IH
2639 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2640 sayNO;
2641 if (do_utf8) {
b8c5462f 2642 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2643 if (locinput > PL_regeol)
2644 sayNO;
a0ed51b3 2645 nextchr = UCHARAT(locinput);
a0ed51b3 2646 }
1aa99e6b
IH
2647 else
2648 nextchr = UCHARAT(++locinput);
a0ed51b3 2649 break;
a3621e74
YO
2650
2651
2652
2653 /*
2654 traverse the TRIE keeping track of all accepting states
2655 we transition through until we get to a failing node.
2656
a3621e74
YO
2657
2658 */
5b47454d 2659 case TRIE:
a3621e74
YO
2660 case TRIEF:
2661 case TRIEFL:
2662 {
a3621e74
YO
2663 U8 *uc = ( U8* )locinput;
2664 U32 state = 1;
2665 U16 charid = 0;
2666 U32 base = 0;
2667 UV uvc = 0;
2668 STRLEN len = 0;
2669 STRLEN foldlen = 0;
a3621e74
YO
2670 U8 *uscan = (U8*)NULL;
2671 STRLEN bufflen=0;
95b24440 2672 SV *sv_accept_buff = NULL;
5b47454d
DM
2673 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2674 trie_type = do_utf8 ?
2675 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2676 : trie_plain;
2677
7087a21c
NC
2678 /* what trie are we using right now */
2679 reg_trie_data *trie
32fc9b6a 2680 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
d8319b27 2681 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2682 result = 0;
a3621e74
YO
2683
2684 while ( state && uc <= (U8*)PL_regeol ) {
2685
5b47454d 2686 if (trie->states[ state ].wordnum) {
d8319b27 2687 if (!st->u.trie.accepted ) {
5b47454d
DM
2688 ENTER;
2689 SAVETMPS;
2690 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2691 sv_accept_buff=newSV(bufflen *
2692 sizeof(reg_trie_accepted) - 1);
2693 SvCUR_set(sv_accept_buff,
2694 sizeof(reg_trie_accepted));
2695 SvPOK_on(sv_accept_buff);
2696 sv_2mortal(sv_accept_buff);
d8319b27 2697 st->u.trie.accept_buff =
5b47454d
DM
2698 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2699 }
2700 else {
d8319b27 2701 if (st->u.trie.accepted >= bufflen) {
5b47454d 2702 bufflen *= 2;
d8319b27 2703 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2704 SvGROW(sv_accept_buff,
2705 bufflen * sizeof(reg_trie_accepted));
2706 }
2707 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2708 + sizeof(reg_trie_accepted));
2709 }
d8319b27
DM
2710 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2711 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2712 ++st->u.trie.accepted;
5b47454d 2713 }
a3621e74
YO
2714
2715 base = trie->states[ state ].trans.base;
2716
2717 DEBUG_TRIE_EXECUTE_r(
2718 PerlIO_printf( Perl_debug_log,
e4584336 2719 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2720 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27 2721 (UV)state, (UV)base, (UV)st->u.trie.accepted );
a3621e74
YO
2722 );
2723
2724 if ( base ) {
5b47454d
DM
2725 switch (trie_type) {
2726 case trie_uft8_fold:
a3621e74
YO
2727 if ( foldlen>0 ) {
2728 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2729 foldlen -= len;
2730 uscan += len;
2731 len=0;
2732 } else {
1df70142 2733 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2734 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2735 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2736 foldlen -= UNISKIP( uvc );
2737 uscan = foldbuf + UNISKIP( uvc );
2738 }
5b47454d
DM
2739 break;
2740 case trie_utf8:
2741 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2742 &len, uniflags );
2743 break;
2744 case trie_plain:
e4584336 2745 uvc = (UV)*uc;
a3621e74
YO
2746 len = 1;
2747 }
2748
5b47454d
DM
2749 if (uvc < 256) {
2750 charid = trie->charmap[ uvc ];
2751 }
2752 else {
2753 charid = 0;
2754 if (trie->widecharmap) {
2755 SV** svpp = (SV**)NULL;
2756 svpp = hv_fetch(trie->widecharmap,
2757 (char*)&uvc, sizeof(UV), 0);
2758 if (svpp)
2759 charid = (U16)SvIV(*svpp);
2760 }
2761 }
a3621e74 2762
5b47454d
DM
2763 if (charid &&
2764 (base + charid > trie->uniquecharcount )
2765 && (base + charid - 1 - trie->uniquecharcount
2766 < trie->lasttrans)
2767 && trie->trans[base + charid - 1 -
2768 trie->uniquecharcount].check == state)
2769 {
2770 state = trie->trans[base + charid - 1 -
2771 trie->uniquecharcount ].next;
2772 }
2773 else {
2774 state = 0;
2775 }
2776 uc += len;
2777
2778 }
2779 else {
a3621e74
YO
2780 state = 0;
2781 }
2782 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2783 PerlIO_printf( Perl_debug_log,
2784 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2785 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2786 );
2787 }
d8319b27 2788 if (!st->u.trie.accepted )
a3621e74 2789 sayNO;
a3621e74
YO
2790
2791 /*
2792 There was at least one accepting state that we
2793 transitioned through. Presumably the number of accepting
2794 states is going to be low, typically one or two. So we
2795 simply scan through to find the one with lowest wordnum.
2796 Once we find it, we swap the last state into its place
2797 and decrement the size. We then try to match the rest of
2798 the pattern at the point where the word ends, if we
2799 succeed then we end the loop, otherwise the loop
2800 eventually terminates once all of the accepting states
2801 have been tried.
2802 */
a3621e74 2803
d8319b27 2804 if ( st->u.trie.accepted == 1 ) {
a3621e74 2805 DEBUG_EXECUTE_r({
097eb12c 2806 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
a3621e74
YO
2807 PerlIO_printf( Perl_debug_log,
2808 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2809 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2810 st->u.trie.accept_buff[ 0 ].wordnum,
cfd0369c 2811 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2812 PL_colors[5] );
2813 });
d8319b27 2814 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
2815 /* in this case we free tmps/leave before we call regmatch
2816 as we wont be using accept_buff again. */
2817 FREETMPS;
2818 LEAVE;
95b24440
DM
2819 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2820 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2821 } else {
2822 DEBUG_EXECUTE_r(
e4584336 2823 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 2824 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
2825 PL_colors[5] );
2826 );
d8319b27 2827 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
2828 U32 best = 0;
2829 U32 cur;
d8319b27 2830 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
2831 DEBUG_TRIE_EXECUTE_r(
2832 PerlIO_printf( Perl_debug_log,
2833 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2834 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
2835 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2836 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 2837 );
a3621e74 2838
d8319b27
DM
2839 if (st->u.trie.accept_buff[cur].wordnum <
2840 st->u.trie.accept_buff[best].wordnum)
e822a8b4 2841 best = cur;
a3621e74
YO
2842 }
2843 DEBUG_EXECUTE_r({
87830502 2844 reg_trie_data * const trie = (reg_trie_data*)
32fc9b6a 2845 rex->data->data[ARG(scan)];
d8319b27 2846 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2847 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2848 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2849 st->u.trie.accept_buff[best].wordnum,
cfd0369c 2850 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2851 PL_colors[5] );
2852 });
d8319b27
DM
2853 if ( best<st->u.trie.accepted ) {
2854 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2855 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2856 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2857 best = st->u.trie.accepted;
a3621e74 2858 }
d8319b27 2859 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
2860
2861 /*
2862 as far as I can tell we only need the SAVETMPS/FREETMPS
2863 for re's with EVAL in them but I'm leaving them in for
2864 all until I can be sure.
2865 */
2866 SAVETMPS;
95b24440
DM
2867 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2868 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2869 FREETMPS;
2870 }
2871 FREETMPS;
2872 LEAVE;
2873 }
2874
95b24440 2875 if (result) {
a3621e74
YO
2876 sayYES;
2877 } else {
2878 sayNO;
2879 }
2880 }
2881 /* unreached codepoint */
95b24440
DM
2882 case EXACT: {
2883 char *s = STRING(scan);
5d9a96ca 2884 st->ln = STR_LEN(scan);
eb160463 2885 if (do_utf8 != UTF) {
bc517b45 2886 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2887 char *l = locinput;
5d9a96ca 2888 const char *e = s + st->ln;
a72c7584 2889
5ff6fc6d
JH
2890 if (do_utf8) {
2891 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2892 while (s < e) {
a3b680e6 2893 STRLEN ulen;
1aa99e6b 2894 if (l >= PL_regeol)
5ff6fc6d
JH
2895 sayNO;
2896 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2897 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2898 uniflags))
5ff6fc6d 2899 sayNO;
bc517b45 2900 l += ulen;
5ff6fc6d 2901 s ++;
1aa99e6b 2902 }
5ff6fc6d
JH
2903 }
2904 else {
2905 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2906 while (s < e) {
a3b680e6 2907 STRLEN ulen;
1aa99e6b
IH
2908 if (l >= PL_regeol)
2909 sayNO;
5ff6fc6d 2910 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2911 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2912 uniflags))
1aa99e6b 2913 sayNO;
bc517b45 2914 s += ulen;
a72c7584 2915 l ++;
1aa99e6b 2916 }
5ff6fc6d 2917 }
1aa99e6b
IH
2918 locinput = l;
2919 nextchr = UCHARAT(locinput);
2920 break;
2921 }
bc517b45 2922 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2923 /* Inline the first character, for speed. */
2924 if (UCHARAT(s) != nextchr)
2925 sayNO;
5d9a96ca 2926 if (PL_regeol - locinput < st->ln)
d6a28714 2927 sayNO;
5d9a96ca 2928 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2929 sayNO;
5d9a96ca 2930 locinput += st->ln;
d6a28714
JH
2931 nextchr = UCHARAT(locinput);
2932 break;
95b24440 2933 }
d6a28714 2934 case EXACTFL:
b8c5462f
JH
2935 PL_reg_flags |= RF_tainted;
2936 /* FALL THROUGH */
95b24440
DM
2937 case EXACTF: {
2938 char *s = STRING(scan);
5d9a96ca 2939 st->ln = STR_LEN(scan);
d6a28714 2940
d07ddd77
JH
2941 if (do_utf8 || UTF) {
2942 /* Either target or the pattern are utf8. */
d6a28714 2943 char *l = locinput;
d07ddd77 2944 char *e = PL_regeol;
bc517b45 2945
5d9a96ca 2946 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2947 l, &e, 0, do_utf8)) {
5486206c
JH
2948 /* One more case for the sharp s:
2949 * pack("U0U*", 0xDF) =~ /ss/i,
2950 * the 0xC3 0x9F are the UTF-8
2951 * byte sequence for the U+00DF. */
2952 if (!(do_utf8 &&
2953 toLOWER(s[0]) == 's' &&
5d9a96ca 2954 st->ln >= 2 &&
5486206c
JH
2955 toLOWER(s[1]) == 's' &&
2956 (U8)l[0] == 0xC3 &&
2957 e - l >= 2 &&
2958 (U8)l[1] == 0x9F))
2959 sayNO;
2960 }
d07ddd77
JH
2961 locinput = e;
2962 nextchr = UCHARAT(locinput);
2963 break;
a0ed51b3 2964 }
d6a28714 2965
bc517b45
JH
2966 /* Neither the target and the pattern are utf8. */
2967
d6a28714
JH
2968 /* Inline the first character, for speed. */
2969 if (UCHARAT(s) != nextchr &&
2970 UCHARAT(s) != ((OP(scan) == EXACTF)
2971 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2972 sayNO;
5d9a96ca 2973 if (PL_regeol - locinput < st->ln)
b8c5462f 2974 sayNO;
5d9a96ca
DM
2975 if (st->ln > 1 && (OP(scan) == EXACTF
2976 ? ibcmp(s, locinput, st->ln)
2977 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2978 sayNO;
5d9a96ca 2979 locinput += st->ln;
d6a28714 2980 nextchr = UCHARAT(locinput);
a0d0e21e 2981 break;
95b24440 2982 }
d6a28714 2983 case ANYOF:
ffc61ed2 2984 if (do_utf8) {
9e55ce06
JH
2985 STRLEN inclasslen = PL_regeol - locinput;
2986
32fc9b6a 2987 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2988 sayNO_ANYOF;
ffc61ed2
JH
2989 if (locinput >= PL_regeol)
2990 sayNO;
0f0076b4 2991 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2992 nextchr = UCHARAT(locinput);
e0f9d4a8 2993 break;
ffc61ed2
JH
2994 }
2995 else {
2996 if (nextchr < 0)
2997 nextchr = UCHARAT(locinput);
32fc9b6a 2998 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 2999 sayNO_ANYOF;
ffc61ed2
JH
3000 if (!nextchr && locinput >= PL_regeol)
3001 sayNO;
3002 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3003 break;
3004 }
3005 no_anyof:
3006 /* If we might have the case of the German sharp s
3007 * in a casefolding Unicode character class. */
3008
ebc501f0
JH
3009 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3010 locinput += SHARP_S_SKIP;
e0f9d4a8 3011 nextchr = UCHARAT(locinput);
ffc61ed2 3012 }
e0f9d4a8
JH
3013 else
3014 sayNO;
b8c5462f 3015 break;
d6a28714 3016 case ALNUML:
b8c5462f
JH
3017 PL_reg_flags |= RF_tainted;
3018 /* FALL THROUGH */
d6a28714 3019 case ALNUM:
b8c5462f 3020 if (!nextchr)
4633a7c4 3021 sayNO;
ffc61ed2 3022 if (do_utf8) {
1a4fad37 3023 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3024 if (!(OP(scan) == ALNUM
3568d838 3025 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3026 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3027 {
3028 sayNO;
a0ed51b3 3029 }
b8c5462f 3030 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3031 nextchr = UCHARAT(locinput);
3032 break;
3033 }
ffc61ed2 3034 if (!(OP(scan) == ALNUM
d6a28714 3035 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3036 sayNO;
b8c5462f 3037 nextchr = UCHARAT(++locinput);
a0d0e21e 3038 break;
d6a28714 3039 case NALNUML:
b8c5462f
JH
3040 PL_reg_flags |= RF_tainted;
3041 /* FALL THROUGH */
d6a28714
JH
3042 case NALNUM:
3043 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3044 sayNO;
ffc61ed2 3045 if (do_utf8) {
1a4fad37 3046 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3047 if (OP(scan) == NALNUM
3568d838 3048 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3049 : isALNUM_LC_utf8((U8*)locinput))
3050 {
b8c5462f 3051 sayNO;
d6a28714 3052 }
b8c5462f
JH
3053 locinput += PL_utf8skip[nextchr];
3054 nextchr = UCHARAT(locinput);
3055 break;
3056 }
ffc61ed2 3057 if (OP(scan) == NALNUM
d6a28714 3058 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3059 sayNO;
76e3520e 3060 nextchr = UCHARAT(++locinput);
a0d0e21e 3061 break;
d6a28714
JH
3062 case BOUNDL:
3063 case NBOUNDL:
3280af22 3064 PL_reg_flags |= RF_tainted;
bbce6d69 3065 /* FALL THROUGH */
d6a28714
JH
3066 case BOUND:
3067 case NBOUND:
3068 /* was last char in word? */
ffc61ed2 3069 if (do_utf8) {
12d33761 3070 if (locinput == PL_bostr)
5d9a96ca 3071 st->ln = '\n';
ffc61ed2 3072 else {
a3b680e6 3073 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3074
4ad0818d 3075 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3076 }
3077 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3078 st->ln = isALNUM_uni(st->ln);
1a4fad37 3079 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3080 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3081 }
3082 else {
5d9a96ca 3083 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3084 n = isALNUM_LC_utf8((U8*)locinput);
3085 }
a0ed51b3 3086 }
d6a28714 3087 else {
5d9a96ca 3088 st->ln = (locinput != PL_bostr) ?
12d33761 3089 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3090 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3091 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3092 n = isALNUM(nextchr);
3093 }
3094 else {
5d9a96ca 3095 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3096 n = isALNUM_LC(nextchr);
3097 }
d6a28714 3098 }
5d9a96ca 3099 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3100 OP(scan) == BOUNDL))
3101 sayNO;
a0ed51b3 3102 break;
d6a28714 3103 case SPACEL:
3280af22 3104 PL_reg_flags |= RF_tainted;
bbce6d69 3105 /* FALL THROUGH */
d6a28714 3106 case SPACE:
9442cb0e 3107 if (!nextchr)
4633a7c4 3108 sayNO;
1aa99e6b 3109 if (do_utf8) {
fd400ab9 3110 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3111 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3112 if (!(OP(scan) == SPACE
3568d838 3113 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3114 : isSPACE_LC_utf8((U8*)locinput)))
3115 {
3116 sayNO;
3117 }
3118 locinput += PL_utf8skip[nextchr];
3119 nextchr = UCHARAT(locinput);
3120 break;
d6a28714 3121 }
ffc61ed2
JH
3122 if (!(OP(scan) == SPACE
3123 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3124 sayNO;
3125 nextchr = UCHARAT(++locinput);
3126 }
3127 else {
3128 if (!(OP(scan) == SPACE
3129 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3130 sayNO;
3131 nextchr = UCHARAT(++locinput);
a0ed51b3 3132 }
a0ed51b3 3133 break;
d6a28714 3134 case NSPACEL:
3280af22 3135 PL_reg_flags |= RF_tainted;
bbce6d69 3136 /* FALL THROUGH */
d6a28714 3137 case NSPACE:
9442cb0e 3138 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3139 sayNO;
1aa99e6b 3140 if (do_utf8) {
1a4fad37 3141 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3142 if (OP(scan) == NSPACE
3568d838 3143 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3144 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3145 {
3146 sayNO;
3147 }
3148 locinput += PL_utf8skip[nextchr];
3149 nextchr = UCHARAT(locinput);
3150 break;
a0ed51b3 3151 }
ffc61ed2 3152 if (OP(scan) == NSPACE
d6a28714 3153 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3154 sayNO;
76e3520e 3155 nextchr = UCHARAT(++locinput);
a0d0e21e 3156 break;
d6a28714 3157 case DIGITL:
a0ed51b3
LW
3158 PL_reg_flags |= RF_tainted;
3159 /* FALL THROUGH */
d6a28714 3160 case DIGIT:
9442cb0e 3161 if (!nextchr)
a0ed51b3 3162 sayNO;
1aa99e6b 3163 if (do_utf8) {
1a4fad37 3164 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3165 if (!(OP(scan) == DIGIT
3568d838 3166 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3167 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3168 {
a0ed51b3 3169 sayNO;
dfe13c55 3170 }
6f06b55f 3171 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3172 nextchr = UCHARAT(locinput);
3173 break;
3174 }
ffc61ed2 3175 if (!(OP(scan) == DIGIT
9442cb0e 3176 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3177 sayNO;
3178 nextchr = UCHARAT(++locinput);
3179 break;
d6a28714 3180 case NDIGITL:
b8c5462f
JH
3181 PL_reg_flags |= RF_tainted;
3182 /* FALL THROUGH */
d6a28714 3183 case NDIGIT:
9442cb0e 3184 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3185 sayNO;
1aa99e6b 3186 if (do_utf8) {
1a4fad37 3187 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3188 if (OP(scan) == NDIGIT
3568d838 3189 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3190 : isDIGIT_LC_utf8((U8*)locinput))
3191 {
a0ed51b3 3192 sayNO;
9442cb0e 3193 }
6f06b55f 3194 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3195 nextchr = UCHARAT(locinput);
3196 break;
3197 }
ffc61ed2 3198 if (OP(scan) == NDIGIT
9442cb0e 3199 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3200 sayNO;
3201 nextchr = UCHARAT(++locinput);
3202 break;
3203 case CLUMP:
b7c83a7e 3204 if (locinput >= PL_regeol)
a0ed51b3 3205 sayNO;
b7c83a7e 3206 if (do_utf8) {
1a4fad37 3207 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3208 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3209 sayNO;
3210 locinput += PL_utf8skip[nextchr];
3211 while (locinput < PL_regeol &&
3212 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3213 locinput += UTF8SKIP(locinput);
3214 if (locinput > PL_regeol)
3215 sayNO;
eb08e2da
JH
3216 }
3217 else
3218 locinput++;
a0ed51b3
LW
3219 nextchr = UCHARAT(locinput);
3220 break;
c8756f30 3221 case REFFL:
3280af22 3222 PL_reg_flags |= RF_tainted;
c8756f30 3223 /* FALL THROUGH */
c277df42 3224 case REF:
95b24440
DM
3225 case REFF: {
3226 char *s;
c277df42 3227 n = ARG(scan); /* which paren pair */
5d9a96ca 3228 st->ln = PL_regstartp[n];
2c2d71f5 3229 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3230 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3231 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3232 if (st->ln == PL_regendp[n])
a0d0e21e 3233 break;
a0ed51b3 3234
5d9a96ca 3235 s = PL_bostr + st->ln;
1aa99e6b 3236 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3237 char *l = locinput;
a3b680e6 3238 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3239 /*
3240 * Note that we can't do the "other character" lookup trick as
3241 * in the 8-bit case (no pun intended) because in Unicode we
3242 * have to map both upper and title case to lower case.
3243 */
3244 if (OP(scan) == REFF) {
3245 while (s < e) {
a3b680e6
AL
3246 STRLEN ulen1, ulen2;
3247 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3248 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3249
a0ed51b3
LW
3250 if (l >= PL_regeol)
3251 sayNO;
a2a2844f
JH
3252 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3253 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3254 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3255 sayNO;
a2a2844f
JH
3256 s += ulen1;
3257 l += ulen2;
a0ed51b3
LW
3258 }
3259 }
3260 locinput = l;
3261 nextchr = UCHARAT(locinput);
3262 break;
3263 }
3264
a0d0e21e 3265 /* Inline the first character, for speed. */
76e3520e 3266 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3267 (OP(scan) == REF ||
3268 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3269 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3270 sayNO;
5d9a96ca
DM
3271 st->ln = PL_regendp[n] - st->ln;
3272 if (locinput + st->ln > PL_regeol)
4633a7c4 3273 sayNO;
5d9a96ca
DM
3274 if (st->ln > 1 && (OP(scan) == REF
3275 ? memNE(s, locinput, st->ln)
c8756f30 3276 : (OP(scan) == REFF
5d9a96ca
DM
3277 ? ibcmp(s, locinput, st->ln)
3278 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3279 sayNO;
5d9a96ca 3280 locinput += st->ln;
76e3520e 3281 nextchr = UCHARAT(locinput);
a0d0e21e 3282 break;
95b24440 3283 }
a0d0e21e
LW
3284
3285 case NOTHING:
c277df42 3286 case TAIL:
a0d0e21e
LW
3287 break;
3288 case BACK:
3289 break;
c277df42
IZ
3290 case EVAL:
3291 {
c277df42 3292 SV *ret;
8e5e9ebe 3293 {
4aabdb9b
DM
3294 /* execute the code in the {...} */
3295 dSP;
6136c704 3296 SV ** const before = SP;
4aabdb9b
DM
3297 OP_4tree * const oop = PL_op;
3298 COP * const ocurcop = PL_curcop;
3299 PAD *old_comppad;
4aabdb9b
DM
3300
3301 n = ARG(scan);
32fc9b6a 3302 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3303 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3304 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3305 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3306
8e5e9ebe
RGS
3307 CALLRUNOPS(aTHX); /* Scalar context. */
3308 SPAGAIN;
3309 if (SP == before)
075aa684 3310 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3311 else {
3312 ret = POPs;
3313 PUTBACK;
3314 }
4aabdb9b
DM
3315
3316 PL_op = oop;
3317 PAD_RESTORE_LOCAL(old_comppad);
3318 PL_curcop = ocurcop;
3319 if (!st->logical) {
3320 /* /(?{...})/ */
3321 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3322 break;
3323 }
8e5e9ebe 3324 }
4aabdb9b
DM
3325 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3326 regexp *re;
4aabdb9b 3327 {
4f639d21
DM
3328 /* extract RE object from returned value; compiling if
3329 * necessary */
3330
6136c704 3331 MAGIC *mg = NULL;
4aabdb9b 3332 SV *sv;
faf82a0b
AE
3333 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3334 mg = mg_find(sv, PERL_MAGIC_qr);
3335 else if (SvSMAGICAL(ret)) {
3336 if (SvGMAGICAL(ret))
3337 sv_unmagic(ret, PERL_MAGIC_qr);
3338 else
3339 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3340 }
faf82a0b 3341
0f5d15d6
IZ
3342 if (mg) {
3343 re = (regexp *)mg->mg_obj;
df0003d4 3344 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3345 }
3346 else {
3347 STRLEN len;
6136c704 3348 const char * const t = SvPV_const(ret, len);
0f5d15d6 3349 PMOP pm;
a3b680e6 3350 const I32 osize = PL_regsize;
0f5d15d6 3351
5fcd1c1b 3352 Zero(&pm, 1, PMOP);
4aabdb9b 3353 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3354 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3355 if (!(SvFLAGS(ret)
faf82a0b
AE
3356 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3357 | SVs_GMG)))
14befaf4
DM
3358 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3359 PERL_MAGIC_qr,0,0);
0f5d15d6 3360 PL_regsize = osize;
0f5d15d6 3361 }
4aabdb9b 3362 }
aa283a38
DM
3363
3364 /* run the pattern returned from (??{...}) */
3365
4aabdb9b
DM
3366 DEBUG_EXECUTE_r(
3367 PerlIO_printf(Perl_debug_log,
3368 "Entering embedded \"%s%.60s%s%s\"\n",
3369 PL_colors[0],
3370 re->precomp,
3371 PL_colors[1],
3372 (strlen(re->precomp) > 60 ? "..." : ""))
3373 );
2c2d71f5 3374
4aabdb9b
DM
3375 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3376 REGCP_SET(st->u.eval.lastcp);
4aabdb9b
DM
3377 *PL_reglastparen = 0;
3378 *PL_reglastcloseparen = 0;
4aabdb9b 3379 PL_reginput = locinput;
4aabdb9b
DM
3380
3381 /* XXXX This is too dramatic a measure... */
3382 PL_reg_maxiter = 0;
3383
5d9a96ca 3384 st->logical = 0;
aa283a38
DM
3385 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3386 ((re->reganch & ROPT_UTF8) != 0);
3387 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3388 st->u.eval.prev_rex = rex;
aa283a38 3389 rex = re;
aa283a38
DM
3390
3391 st->u.eval.prev_eval = cur_eval;
3392 st->u.eval.prev_slab = PL_regmatch_slab;
3393 st->u.eval.depth = depth;
3394 cur_eval = st;
3395 PUSH_STATE(newst, resume_EVAL);
3396 st = newst;
3397
3398 /* now continue from first node in postoned RE */
3399 next = re->program + 1;
3400 break;
4aabdb9b 3401 /* NOTREACHED */
a0ed51b3 3402 }
4aabdb9b
DM
3403 /* /(?(?{...})X|Y)/ */
3404 st->sw = SvTRUE(ret);
3405 st->logical = 0;
c277df42
IZ
3406 break;
3407 }
a0d0e21e 3408 case OPEN:
c277df42 3409 n = ARG(scan); /* which paren pair */
3280af22
NIS
3410 PL_reg_start_tmp[n] = locinput;
3411 if (n > PL_regsize)
3412 PL_regsize = n;
a0d0e21e
LW
3413 break;
3414 case CLOSE:
c277df42 3415 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3416 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3417 PL_regendp[n] = locinput - PL_bostr;
eb160463 3418 if (n > (I32)*PL_reglastparen)
3280af22 3419 *PL_reglastparen = n;
a01268b5 3420 *PL_reglastcloseparen = n;
a0d0e21e 3421 break;
c277df42
IZ
3422 case GROUPP:
3423 n = ARG(scan); /* which paren pair */
5d9a96ca 3424 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3425 break;
3426 case IFTHEN:
2c2d71f5 3427 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3428 if (st->sw)
c277df42
IZ
3429 next = NEXTOPER(NEXTOPER(scan));
3430 else {
3431 next = scan + ARG(scan);
3432 if (OP(next) == IFTHEN) /* Fake one. */
3433 next = NEXTOPER(NEXTOPER(next));
3434 }
3435 break;
3436 case LOGICAL:
5d9a96ca 3437 st->logical = scan->flags;
c277df42 3438 break;
2ab05381 3439/*******************************************************************
a0374537
DM
3440 cc points to the regmatch_state associated with the most recent CURLYX.
3441 This struct contains info about the innermost (...)* loop (an
3442 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3443
3444 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3445
95b24440 3446 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3447
a0374537 3448 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3449 with the starting point at WHILEM node;
2ab05381
IZ
3450