This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
eliminate PL_regprecomp, PL_regprogram, PL_regnpar and PL_regdata
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
52657f30
AL
119#define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
120 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
121 : (U8*)(pos + off)))
122#define HOPBACKc(pos, off) ((char*) \
123 ((PL_reg_match_utf8) \
124 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
efb30f32
HS
125 : (pos - off >= PL_bostr) \
126 ? (U8*)(pos - off) \
52657f30 127 : (U8*)NULL) \
efb30f32 128)
efb30f32 129
1aa99e6b 130#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c 131#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 132#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 133
1a4fad37
AL
134#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
135 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
136#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
137#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
138#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
139#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 140
5f80c4cf 141/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
142#define JUMPABLE(rn) ( \
143 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
144 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
145 OP(rn) == PLUS || OP(rn) == MINMOD || \
146 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
147)
148
cca55fe3
JP
149#define HAS_TEXT(rn) ( \
150 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
151)
e2d8ce26 152
a84d97b6
HS
153/*
154 Search for mandatory following text node; for lookahead, the text must
155 follow but for lookbehind (rn->flags != 0) we skip to the next step.
156*/
cca55fe3 157#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 158 while (JUMPABLE(rn)) \
a84d97b6 159 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 160 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
161 else if (OP(rn) == PLUS) \
162 rn = NEXTOPER(rn); \
a84d97b6
HS
163 else if (OP(rn) == IFMATCH) \
164 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 165 else rn += NEXT_OFF(rn); \
5f80c4cf 166} STMT_END
74750237 167
acfe0abc 168static void restore_pos(pTHX_ void *arg);
51371543 169
76e3520e 170STATIC CHECKPOINT
cea2e8a9 171S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 172{
97aff369 173 dVAR;
a3b680e6 174 const int retval = PL_savestack_ix;
b1ce53c5 175#define REGCP_PAREN_ELEMS 4
a3b680e6 176 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
177 int p;
178
e49a9654
IH
179 if (paren_elems_to_push < 0)
180 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
181
a01268b5 182#define REGCP_OTHER_ELEMS 6
4b3c1a47 183 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 184 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 185/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
186 SSPUSHINT(PL_regendp[p]);
187 SSPUSHINT(PL_regstartp[p]);
3280af22 188 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
189 SSPUSHINT(p);
190 }
b1ce53c5 191/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
192 SSPUSHINT(PL_regsize);
193 SSPUSHINT(*PL_reglastparen);
a01268b5 194 SSPUSHINT(*PL_reglastcloseparen);
3280af22 195 SSPUSHPTR(PL_reginput);
41123dfd
JH
196#define REGCP_FRAME_ELEMS 2
197/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
198 * are needed for the regexp context stack bookkeeping. */
199 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 200 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 201
a0d0e21e
LW
202 return retval;
203}
204
c277df42 205/* These are needed since we do not localize EVAL nodes: */
a3621e74 206# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 207 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 208 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 209
a3621e74 210# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 211 PerlIO_printf(Perl_debug_log, \
faccc32b 212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 213 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 214
76e3520e 215STATIC char *
4f639d21 216S_regcppop(pTHX_ regexp *rex)
a0d0e21e 217{
97aff369 218 dVAR;
b1ce53c5 219 I32 i;
a0d0e21e
LW
220 U32 paren = 0;
221 char *input;
b1ce53c5 222
a3621e74
YO
223 GET_RE_DEBUG_FLAGS_DECL;
224
b1ce53c5 225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 226 i = SSPOPINT;
b1ce53c5
JH
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 229 input = (char *) SSPOPPTR;
a01268b5 230 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
b1ce53c5
JH
233
234 /* Now restore the parentheses context. */
41123dfd
JH
235 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
236 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 237 I32 tmps;
a0d0e21e 238 paren = (U32)SSPOPINT;
3280af22 239 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
240 PL_regstartp[paren] = SSPOPINT;
241 tmps = SSPOPINT;
3280af22
NIS
242 if (paren <= *PL_reglastparen)
243 PL_regendp[paren] = tmps;
a3621e74 244 DEBUG_EXECUTE_r(
c3464db5 245 PerlIO_printf(Perl_debug_log,
b900a521 246 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 247 (UV)paren, (IV)PL_regstartp[paren],
b900a521 248 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 249 (IV)PL_regendp[paren],
3280af22 250 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 251 );
a0d0e21e 252 }
a3621e74 253 DEBUG_EXECUTE_r(
4f639d21 254 if ((I32)(*PL_reglastparen + 1) <= rex->nparens) {
c3464db5 255 PerlIO_printf(Perl_debug_log,
faccc32b 256 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 257 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
258 }
259 );
daf18116 260#if 1
dafc8851
JH
261 /* It would seem that the similar code in regtry()
262 * already takes care of this, and in fact it is in
263 * a better location to since this code can #if 0-ed out
264 * but the code in regtry() is needed or otherwise tests
265 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
266 * (as of patchlevel 7877) will fail. Then again,
267 * this code seems to be necessary or otherwise
268 * building DynaLoader will fail:
269 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
270 * --jhi */
4f639d21 271 for (paren = *PL_reglastparen + 1; (I32)paren <= rex->nparens; paren++) {
eb160463 272 if ((I32)paren > PL_regsize)
cf93c79d
IZ
273 PL_regstartp[paren] = -1;
274 PL_regendp[paren] = -1;
a0d0e21e 275 }
dafc8851 276#endif
a0d0e21e
LW
277 return input;
278}
279
02db2b7b 280#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 281
95b24440 282#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
283 if (paren) { \
284 if (n) { \
285 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
286 PL_regendp[paren] = input - PL_bostr; \
287 } \
288 else \
289 PL_regendp[paren] = -1; \
290 } \
95b24440
DM
291 REGMATCH(next, where); \
292 if (result) \
29d1e993
HS
293 sayYES; \
294 if (paren && n) \
295 PL_regendp[paren] = -1; \
296}
297
298
a687059c 299/*
e50aee73 300 * pregexec and friends
a687059c
LW
301 */
302
303/*
c277df42 304 - pregexec - match a regexp against a string
a687059c 305 */
c277df42 306I32
864dbfa3 307Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 308 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
309/* strend: pointer to null at end of string */
310/* strbeg: real beginning of string */
311/* minend: end of match must be >=minend after stringarg. */
312/* nosave: For optimizations. */
313{
314 return
9041c2e3 315 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
316 nosave ? 0 : REXEC_COPY_STR);
317}
0f5d15d6 318
22e551b9 319
9041c2e3 320/*
cad2e5aa
JH
321 * Need to implement the following flags for reg_anch:
322 *
323 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
324 * USE_INTUIT_ML
325 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
326 * INTUIT_AUTORITATIVE_ML
327 * INTUIT_ONCE_NOML - Intuit can match in one location only.
328 * INTUIT_ONCE_ML
329 *
330 * Another flag for this function: SECOND_TIME (so that float substrs
331 * with giant delta may be not rechecked).
332 */
333
334/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
335
3f7c398e 336/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
337 Otherwise, only SvCUR(sv) is used to get strbeg. */
338
339/* XXXX We assume that strpos is strbeg unless sv. */
340
6eb5f6b9
JH
341/* XXXX Some places assume that there is a fixed substring.
342 An update may be needed if optimizer marks as "INTUITable"
343 RExen without fixed substrings. Similarly, it is assumed that
344 lengths of all the strings are no more than minlen, thus they
345 cannot come from lookahead.
346 (Or minlen should take into account lookahead.) */
347
2c2d71f5
JH
348/* A failure to find a constant substring means that there is no need to make
349 an expensive call to REx engine, thus we celebrate a failure. Similarly,
350 finding a substring too deep into the string means that less calls to
30944b6d
IZ
351 regtry() should be needed.
352
353 REx compiler's optimizer found 4 possible hints:
354 a) Anchored substring;
355 b) Fixed substring;
356 c) Whether we are anchored (beginning-of-line or \G);
357 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 358 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
359 string which does not contradict any of them.
360 */
2c2d71f5 361
6eb5f6b9
JH
362/* Most of decisions we do here should have been done at compile time.
363 The nodes of the REx which we used for the search should have been
364 deleted from the finite automaton. */
365
cad2e5aa
JH
366char *
367Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
368 char *strend, U32 flags, re_scream_pos_data *data)
369{
97aff369 370 dVAR;
b7953727 371 register I32 start_shift = 0;
cad2e5aa 372 /* Should be nonnegative! */
b7953727 373 register I32 end_shift = 0;
2c2d71f5
JH
374 register char *s;
375 register SV *check;
a1933d95 376 char *strbeg;
cad2e5aa 377 char *t;
a3b680e6 378 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 379 I32 ml_anch;
bd61b366
SS
380 register char *other_last = NULL; /* other substr checked before this */
381 char *check_at = NULL; /* check substr found at this pos */
1df70142 382 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 383#ifdef DEBUGGING
890ce7af
AL
384 const char * const i_strpos = strpos;
385 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 386#endif
a3621e74
YO
387
388 GET_RE_DEBUG_FLAGS_DECL;
389
a30b2f1f 390 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 391
b8d68ded 392 if (prog->reganch & ROPT_UTF8) {
a3621e74 393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
394 "UTF-8 regex...\n"));
395 PL_reg_flags |= RF_utf8;
396 }
397
a3621e74 398 DEBUG_EXECUTE_r({
1df70142 399 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
400 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
401 strpos;
1df70142 402 const int len = PL_reg_match_utf8 ?
b8d68ded 403 strlen(s) : strend - strpos;
2a782b5b
JH
404 if (!PL_colorset)
405 reginitcolors();
b8d68ded 406 if (PL_reg_match_utf8)
a3621e74 407 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 408 "UTF-8 target...\n"));
2a782b5b 409 PerlIO_printf(Perl_debug_log,
a0288114 410 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 411 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
412 prog->precomp,
413 PL_colors[1],
414 (strlen(prog->precomp) > 60 ? "..." : ""),
415 PL_colors[0],
416 (int)(len > 60 ? 60 : len),
417 s, PL_colors[1],
418 (len > 60 ? "..." : "")
419 );
420 });
cad2e5aa 421
c344f387
JH
422 /* CHR_DIST() would be more correct here but it makes things slow. */
423 if (prog->minlen > strend - strpos) {
a3621e74 424 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 425 "String too short... [re_intuit_start]\n"));
cad2e5aa 426 goto fail;
2c2d71f5 427 }
a1933d95 428 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 429 PL_regeol = strend;
33b8afdf
JH
430 if (do_utf8) {
431 if (!prog->check_utf8 && prog->check_substr)
432 to_utf8_substr(prog);
433 check = prog->check_utf8;
434 } else {
435 if (!prog->check_substr && prog->check_utf8)
436 to_byte_substr(prog);
437 check = prog->check_substr;
438 }
439 if (check == &PL_sv_undef) {
a3621e74 440 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
441 "Non-utf string cannot match utf check string\n"));
442 goto fail;
443 }
2c2d71f5 444 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
445 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
446 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 447 && !multiline ) ); /* Check after \n? */
cad2e5aa 448
7e25d62c
JH
449 if (!ml_anch) {
450 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
451 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 452 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
453 && sv && !SvROK(sv)
454 && (strpos != strbeg)) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
456 goto fail;
457 }
458 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 459 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 460 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
461 I32 slen;
462
1aa99e6b 463 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
464 if (SvTAIL(check)) {
465 slen = SvCUR(check); /* >= 1 */
cad2e5aa 466
9041c2e3 467 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 468 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 469 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 470 goto fail_finish;
cad2e5aa
JH
471 }
472 /* Now should match s[0..slen-2] */
473 slen--;
3f7c398e 474 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 475 || (slen > 1
3f7c398e 476 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 477 report_neq:
a3621e74 478 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
479 goto fail_finish;
480 }
cad2e5aa 481 }
3f7c398e 482 else if (*SvPVX_const(check) != *s
653099ff 483 || ((slen = SvCUR(check)) > 1
3f7c398e 484 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 485 goto report_neq;
c315bfe8 486 check_at = s;
2c2d71f5 487 goto success_at_start;
7e25d62c 488 }
cad2e5aa 489 }
2c2d71f5 490 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 491 s = strpos;
2c2d71f5 492 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 493 end_shift = prog->minlen - start_shift -
653099ff 494 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 495 if (!ml_anch) {
a3b680e6 496 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 497 - (SvTAIL(check) != 0);
a3b680e6 498 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
499
500 if (end_shift < eshift)
501 end_shift = eshift;
502 }
cad2e5aa 503 }
2c2d71f5 504 else { /* Can match at random position */
cad2e5aa
JH
505 ml_anch = 0;
506 s = strpos;
2c2d71f5
JH
507 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
508 /* Should be nonnegative! */
509 end_shift = prog->minlen - start_shift -
653099ff 510 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
511 }
512
2c2d71f5 513#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 514 if (end_shift < 0)
6bbae5e6 515 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
516#endif
517
2c2d71f5
JH
518 restart:
519 /* Find a possible match in the region s..strend by looking for
520 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 521 if (flags & REXEC_SCREAM) {
cad2e5aa 522 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 523 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 524
2c2d71f5
JH
525 if (PL_screamfirst[BmRARE(check)] >= 0
526 || ( BmRARE(check) == '\n'
527 && (BmPREVIOUS(check) == SvCUR(check) - 1)
528 && SvTAIL(check) ))
9041c2e3 529 s = screaminstr(sv, check,
2c2d71f5 530 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 531 else
2c2d71f5 532 goto fail_finish;
4addbd3b
HS
533 /* we may be pointing at the wrong string */
534 if (s && RX_MATCH_COPIED(prog))
3f7c398e 535 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
536 if (data)
537 *data->scream_olds = s;
538 }
f33976b4 539 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
540 s = fbm_instr((U8*)(s + start_shift),
541 (U8*)(strend - end_shift),
7fba1cd6 542 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 543 else
1aa99e6b
IH
544 s = fbm_instr(HOP3(s, start_shift, strend),
545 HOP3(strend, -end_shift, strbeg),
7fba1cd6 546 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
547
548 /* Update the count-of-usability, remove useless subpatterns,
549 unshift s. */
2c2d71f5 550
a0288114 551 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 552 (s ? "Found" : "Did not find"),
33b8afdf 553 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 554 PL_colors[0],
7b0972df 555 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 556 SvPVX_const(check),
2c2d71f5
JH
557 PL_colors[1], (SvTAIL(check) ? "$" : ""),
558 (s ? " at offset " : "...\n") ) );
559
560 if (!s)
561 goto fail_finish;
562
6eb5f6b9
JH
563 check_at = s;
564
2c2d71f5 565 /* Finish the diagnostic message */
a3621e74 566 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
567
568 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
569 Start with the other substr.
570 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 571 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
572 *always* match. Probably should be marked during compile...
573 Probably it is right to do no SCREAM here...
574 */
575
33b8afdf 576 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 577 /* Take into account the "other" substring. */
2c2d71f5
JH
578 /* XXXX May be hopelessly wrong for UTF... */
579 if (!other_last)
6eb5f6b9 580 other_last = strpos;
33b8afdf 581 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
582 do_other_anchored:
583 {
890ce7af
AL
584 char * const last = HOP3c(s, -start_shift, strbeg);
585 char *last1, *last2;
2c2d71f5 586 char *s1 = s;
33b8afdf 587 SV* must;
2c2d71f5 588
2c2d71f5
JH
589 t = s - prog->check_offset_max;
590 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 591 && (!do_utf8
1aa99e6b 592 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 593 && t > strpos)))
30944b6d 594 /* EMPTY */;
2c2d71f5
JH
595 else
596 t = strpos;
1aa99e6b 597 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
598 if (t < other_last) /* These positions already checked */
599 t = other_last;
1aa99e6b 600 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
601 if (last < last1)
602 last1 = last;
603 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
604 /* On end-of-str: see comment below. */
33b8afdf
JH
605 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
606 if (must == &PL_sv_undef) {
607 s = (char*)NULL;
a3621e74 608 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
609 }
610 else
611 s = fbm_instr(
612 (unsigned char*)t,
613 HOP3(HOP3(last1, prog->anchored_offset, strend)
614 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
615 must,
7fba1cd6 616 multiline ? FBMrf_MULTILINE : 0
33b8afdf 617 );
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 619 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
620 (s ? "Found" : "Contradicts"),
621 PL_colors[0],
33b8afdf
JH
622 (int)(SvCUR(must)
623 - (SvTAIL(must)!=0)),
3f7c398e 624 SvPVX_const(must),
33b8afdf 625 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
626 if (!s) {
627 if (last1 >= last2) {
a3621e74 628 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
629 ", giving up...\n"));
630 goto fail_finish;
631 }
a3621e74 632 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 633 ", trying floating at offset %ld...\n",
1aa99e6b
IH
634 (long)(HOP3c(s1, 1, strend) - i_strpos)));
635 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
636 s = HOP3c(last, 1, strend);
2c2d71f5
JH
637 goto restart;
638 }
639 else {
a3621e74 640 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 641 (long)(s - i_strpos)));
1aa99e6b
IH
642 t = HOP3c(s, -prog->anchored_offset, strbeg);
643 other_last = HOP3c(s, 1, strend);
30944b6d 644 s = s1;
2c2d71f5
JH
645 if (t == strpos)
646 goto try_at_start;
2c2d71f5
JH
647 goto try_at_offset;
648 }
30944b6d 649 }
2c2d71f5
JH
650 }
651 else { /* Take into account the floating substring. */
33b8afdf
JH
652 char *last, *last1;
653 char *s1 = s;
654 SV* must;
655
656 t = HOP3c(s, -start_shift, strbeg);
657 last1 = last =
658 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
659 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
660 last = HOP3c(t, prog->float_max_offset, strend);
661 s = HOP3c(t, prog->float_min_offset, strend);
662 if (s < other_last)
663 s = other_last;
2c2d71f5 664 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
665 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
666 /* fbm_instr() takes into account exact value of end-of-str
667 if the check is SvTAIL(ed). Since false positives are OK,
668 and end-of-str is not later than strend we are OK. */
669 if (must == &PL_sv_undef) {
670 s = (char*)NULL;
a3621e74 671 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
672 }
673 else
2c2d71f5 674 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
675 (unsigned char*)last + SvCUR(must)
676 - (SvTAIL(must)!=0),
7fba1cd6 677 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 678 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
679 (s ? "Found" : "Contradicts"),
680 PL_colors[0],
681 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 682 SvPVX_const(must),
33b8afdf
JH
683 PL_colors[1], (SvTAIL(must) ? "$" : "")));
684 if (!s) {
685 if (last1 == last) {
a3621e74 686 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
687 ", giving up...\n"));
688 goto fail_finish;
2c2d71f5 689 }
a3621e74 690 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
691 ", trying anchored starting at offset %ld...\n",
692 (long)(s1 + 1 - i_strpos)));
693 other_last = last;
694 s = HOP3c(t, 1, strend);
695 goto restart;
696 }
697 else {
a3621e74 698 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
699 (long)(s - i_strpos)));
700 other_last = s; /* Fix this later. --Hugo */
701 s = s1;
702 if (t == strpos)
703 goto try_at_start;
704 goto try_at_offset;
705 }
2c2d71f5 706 }
cad2e5aa 707 }
2c2d71f5
JH
708
709 t = s - prog->check_offset_max;
2c2d71f5 710 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 711 && (!do_utf8
1aa99e6b
IH
712 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
713 && t > strpos))) {
2c2d71f5
JH
714 /* Fixed substring is found far enough so that the match
715 cannot start at strpos. */
716 try_at_offset:
cad2e5aa 717 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
718 /* Eventually fbm_*() should handle this, but often
719 anchored_offset is not 0, so this check will not be wasted. */
720 /* XXXX In the code below we prefer to look for "^" even in
721 presence of anchored substrings. And we search even
722 beyond the found float position. These pessimizations
723 are historical artefacts only. */
724 find_anchor:
2c2d71f5 725 while (t < strend - prog->minlen) {
cad2e5aa 726 if (*t == '\n') {
4ee3650e 727 if (t < check_at - prog->check_offset_min) {
33b8afdf 728 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
729 /* Since we moved from the found position,
730 we definitely contradict the found anchored
30944b6d
IZ
731 substr. Due to the above check we do not
732 contradict "check" substr.
733 Thus we can arrive here only if check substr
734 is float. Redo checking for "other"=="fixed".
735 */
9041c2e3 736 strpos = t + 1;
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
739 goto do_other_anchored;
740 }
4ee3650e
GS
741 /* We don't contradict the found floating substring. */
742 /* XXXX Why not check for STCLASS? */
cad2e5aa 743 s = t + 1;
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 745 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
746 goto set_useful;
747 }
4ee3650e
GS
748 /* Position contradicts check-string */
749 /* XXXX probably better to look for check-string
750 than for "\n", so one should lower the limit for t? */
a3621e74 751 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 752 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 753 other_last = strpos = s = t + 1;
cad2e5aa
JH
754 goto restart;
755 }
756 t++;
757 }
a3621e74 758 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 759 PL_colors[0], PL_colors[1]));
2c2d71f5 760 goto fail_finish;
cad2e5aa 761 }
f5952150 762 else {
a3621e74 763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 764 PL_colors[0], PL_colors[1]));
f5952150 765 }
cad2e5aa
JH
766 s = t;
767 set_useful:
33b8afdf 768 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
769 }
770 else {
f5952150 771 /* The found string does not prohibit matching at strpos,
2c2d71f5 772 - no optimization of calling REx engine can be performed,
f5952150
GS
773 unless it was an MBOL and we are not after MBOL,
774 or a future STCLASS check will fail this. */
2c2d71f5
JH
775 try_at_start:
776 /* Even in this situation we may use MBOL flag if strpos is offset
777 wrt the start of the string. */
05b4157f 778 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 779 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
780 /* May be due to an implicit anchor of m{.*foo} */
781 && !(prog->reganch & ROPT_IMPLICIT))
782 {
cad2e5aa
JH
783 t = strpos;
784 goto find_anchor;
785 }
a3621e74 786 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 787 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 788 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 789 );
2c2d71f5 790 success_at_start:
30944b6d 791 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
792 && (do_utf8 ? (
793 prog->check_utf8 /* Could be deleted already */
794 && --BmUSEFUL(prog->check_utf8) < 0
795 && (prog->check_utf8 == prog->float_utf8)
796 ) : (
797 prog->check_substr /* Could be deleted already */
798 && --BmUSEFUL(prog->check_substr) < 0
799 && (prog->check_substr == prog->float_substr)
800 )))
66e933ab 801 {
cad2e5aa 802 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
804 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
805 if (do_utf8 ? prog->check_substr : prog->check_utf8)
806 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
807 prog->check_substr = prog->check_utf8 = NULL; /* disable */
808 prog->float_substr = prog->float_utf8 = NULL; /* clear */
809 check = NULL; /* abort */
cad2e5aa 810 s = strpos;
3cf5c195
IZ
811 /* XXXX This is a remnant of the old implementation. It
812 looks wasteful, since now INTUIT can use many
6eb5f6b9 813 other heuristics. */
cad2e5aa
JH
814 prog->reganch &= ~RE_USE_INTUIT;
815 }
816 else
817 s = strpos;
818 }
819
6eb5f6b9
JH
820 /* Last resort... */
821 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
822 if (prog->regstclass) {
823 /* minlen == 0 is possible if regstclass is \b or \B,
824 and the fixed substr is ''$.
825 Since minlen is already taken into account, s+1 is before strend;
826 accidentally, minlen >= 1 guaranties no false positives at s + 1
827 even for \b or \B. But (minlen? 1 : 0) below assumes that
828 regstclass does not come from lookahead... */
829 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
830 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 831 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 832 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 833 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 834 : 1);
a3b680e6 835 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 836 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 837 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
838 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
839 cl_l, strend)
840 : strend);
6eb5f6b9
JH
841
842 t = s;
4f639d21 843 PL_reg_re = prog;
06b5626a 844 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
845 if (!s) {
846#ifdef DEBUGGING
cbbf8932 847 const char *what = NULL;
6eb5f6b9
JH
848#endif
849 if (endpos == strend) {
a3621e74 850 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
851 "Could not match STCLASS...\n") );
852 goto fail;
853 }
a3621e74 854 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 855 "This position contradicts STCLASS...\n") );
653099ff
GS
856 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
857 goto fail;
6eb5f6b9 858 /* Contradict one of substrings */
33b8afdf
JH
859 if (prog->anchored_substr || prog->anchored_utf8) {
860 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 861 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 862 hop_and_restart:
1aa99e6b 863 s = HOP3c(t, 1, strend);
66e933ab
GS
864 if (s + start_shift + end_shift > strend) {
865 /* XXXX Should be taken into account earlier? */
a3621e74 866 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
867 "Could not match STCLASS...\n") );
868 goto fail;
869 }
5e39e1e5
HS
870 if (!check)
871 goto giveup;
a3621e74 872 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 873 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
874 what, (long)(s + start_shift - i_strpos)) );
875 goto restart;
876 }
66e933ab 877 /* Have both, check_string is floating */
6eb5f6b9
JH
878 if (t + start_shift >= check_at) /* Contradicts floating=check */
879 goto retry_floating_check;
880 /* Recheck anchored substring, but not floating... */
9041c2e3 881 s = check_at;
5e39e1e5
HS
882 if (!check)
883 goto giveup;
a3621e74 884 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 885 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
886 (long)(other_last - i_strpos)) );
887 goto do_other_anchored;
888 }
60e71179
GS
889 /* Another way we could have checked stclass at the
890 current position only: */
891 if (ml_anch) {
892 s = t = t + 1;
5e39e1e5
HS
893 if (!check)
894 goto giveup;
a3621e74 895 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 896 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 897 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 898 goto try_at_offset;
66e933ab 899 }
33b8afdf 900 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 901 goto fail;
6eb5f6b9
JH
902 /* Check is floating subtring. */
903 retry_floating_check:
904 t = check_at - start_shift;
a3621e74 905 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
906 goto hop_and_restart;
907 }
b7953727 908 if (t != s) {
a3621e74 909 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 910 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
911 (long)(t - i_strpos), (long)(s - i_strpos))
912 );
913 }
914 else {
a3621e74 915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
916 "Does not contradict STCLASS...\n");
917 );
918 }
6eb5f6b9 919 }
5e39e1e5 920 giveup:
a3621e74 921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
922 PL_colors[4], (check ? "Guessed" : "Giving up"),
923 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 924 return s;
2c2d71f5
JH
925
926 fail_finish: /* Substring not found */
33b8afdf
JH
927 if (prog->check_substr || prog->check_utf8) /* could be removed already */
928 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 929 fail:
a3621e74 930 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 931 PL_colors[4], PL_colors[5]));
bd61b366 932 return NULL;
cad2e5aa 933}
9661b544 934
6eb5f6b9 935/* We know what class REx starts with. Try to find this position... */
3c3eec57 936STATIC char *
a3b680e6 937S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 938{
27da23d5 939 dVAR;
1df70142 940 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 941 char *m;
d8093b23 942 STRLEN ln;
5dab1207 943 STRLEN lnc;
078c425b 944 register STRLEN uskip;
d8093b23
G
945 unsigned int c1;
946 unsigned int c2;
6eb5f6b9
JH
947 char *e;
948 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 949 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 950
6eb5f6b9
JH
951 /* We know what class it must start with. */
952 switch (OP(c)) {
6eb5f6b9 953 case ANYOF:
388cc4de 954 if (do_utf8) {
078c425b 955 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
956 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
957 !UTF8_IS_INVARIANT((U8)s[0]) ?
958 reginclass(c, (U8*)s, 0, do_utf8) :
959 REGINCLASS(c, (U8*)s)) {
960 if (tmp && (norun || regtry(prog, s)))
961 goto got_it;
962 else
963 tmp = doevery;
964 }
965 else
966 tmp = 1;
078c425b 967 s += uskip;
388cc4de
HS
968 }
969 }
970 else {
971 while (s < strend) {
972 STRLEN skip = 1;
973
974 if (REGINCLASS(c, (U8*)s) ||
975 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
976 /* The assignment of 2 is intentional:
977 * for the folded sharp s, the skip is 2. */
978 (skip = SHARP_S_SKIP))) {
979 if (tmp && (norun || regtry(prog, s)))
980 goto got_it;
981 else
982 tmp = doevery;
983 }
984 else
985 tmp = 1;
986 s += skip;
987 }
a0d0e21e 988 }
6eb5f6b9 989 break;
f33976b4
DB
990 case CANY:
991 while (s < strend) {
992 if (tmp && (norun || regtry(prog, s)))
993 goto got_it;
994 else
995 tmp = doevery;
996 s++;
997 }
998 break;
6eb5f6b9 999 case EXACTF:
5dab1207
NIS
1000 m = STRING(c);
1001 ln = STR_LEN(c); /* length to match in octets/bytes */
1002 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1003 if (UTF) {
a2a2844f 1004 STRLEN ulen1, ulen2;
5dab1207 1005 U8 *sm = (U8 *) m;
89ebb4a3
JH
1006 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1007 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1008 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1009
1010 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1011 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1012
89ebb4a3 1013 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1014 0, uniflags);
89ebb4a3 1015 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1016 0, uniflags);
5dab1207
NIS
1017 lnc = 0;
1018 while (sm < ((U8 *) m + ln)) {
1019 lnc++;
1020 sm += UTF8SKIP(sm);
1021 }
1aa99e6b
IH
1022 }
1023 else {
1024 c1 = *(U8*)m;
1025 c2 = PL_fold[c1];
1026 }
6eb5f6b9
JH
1027 goto do_exactf;
1028 case EXACTFL:
5dab1207
NIS
1029 m = STRING(c);
1030 ln = STR_LEN(c);
1031 lnc = (I32) ln;
d8093b23 1032 c1 = *(U8*)m;
6eb5f6b9
JH
1033 c2 = PL_fold_locale[c1];
1034 do_exactf:
db12adc6 1035 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1036
6eb5f6b9
JH
1037 if (norun && e < s)
1038 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1039
60a8b682
JH
1040 /* The idea in the EXACTF* cases is to first find the
1041 * first character of the EXACTF* node and then, if
1042 * necessary, case-insensitively compare the full
1043 * text of the node. The c1 and c2 are the first
1044 * characters (though in Unicode it gets a bit
1045 * more complicated because there are more cases
7f16dd3d
JH
1046 * than just upper and lower: one needs to use
1047 * the so-called folding case for case-insensitive
1048 * matching (called "loose matching" in Unicode).
1049 * ibcmp_utf8() will do just that. */
60a8b682 1050
1aa99e6b 1051 if (do_utf8) {
575cac57 1052 UV c, f;
89ebb4a3 1053 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1054 STRLEN len, foldlen;
4ad0818d 1055 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1056 if (c1 == c2) {
5dab1207
NIS
1057 /* Upper and lower of 1st char are equal -
1058 * probably not a "letter". */
1aa99e6b 1059 while (s <= e) {
89ebb4a3 1060 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1061 uniflags);
80aecb99
JH
1062 if ( c == c1
1063 && (ln == len ||
66423254 1064 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1065 m, (char **)0, ln, (bool)UTF))
55da9344 1066 && (norun || regtry(prog, s)) )
1aa99e6b 1067 goto got_it;
80aecb99 1068 else {
1df70142 1069 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1070 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1071 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1072 if ( f != c
1073 && (f == c1 || f == c2)
1074 && (ln == foldlen ||
66423254
JH
1075 !ibcmp_utf8((char *) foldbuf,
1076 (char **)0, foldlen, do_utf8,
d07ddd77 1077 m,
eb160463 1078 (char **)0, ln, (bool)UTF))
80aecb99
JH
1079 && (norun || regtry(prog, s)) )
1080 goto got_it;
1081 }
1aa99e6b
IH
1082 s += len;
1083 }
09091399
JH
1084 }
1085 else {
1aa99e6b 1086 while (s <= e) {
89ebb4a3 1087 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1088 uniflags);
80aecb99 1089
60a8b682 1090 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1091 * Note that not all the possible combinations
1092 * are handled here: some of them are handled
1093 * by the standard folding rules, and some of
1094 * them (the character class or ANYOF cases)
1095 * are handled during compiletime in
1096 * regexec.c:S_regclass(). */
880bd946
JH
1097 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1098 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1099 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1100
1101 if ( (c == c1 || c == c2)
1102 && (ln == len ||
66423254 1103 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1104 m, (char **)0, ln, (bool)UTF))
55da9344 1105 && (norun || regtry(prog, s)) )
1aa99e6b 1106 goto got_it;
80aecb99 1107 else {
1df70142 1108 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1109 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1110 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1111 if ( f != c
1112 && (f == c1 || f == c2)
1113 && (ln == foldlen ||
a6872d42 1114 !ibcmp_utf8((char *) foldbuf,
66423254 1115 (char **)0, foldlen, do_utf8,
d07ddd77 1116 m,
eb160463 1117 (char **)0, ln, (bool)UTF))
80aecb99
JH
1118 && (norun || regtry(prog, s)) )
1119 goto got_it;
1120 }
1aa99e6b
IH
1121 s += len;
1122 }
09091399 1123 }
1aa99e6b
IH
1124 }
1125 else {
1126 if (c1 == c2)
1127 while (s <= e) {
1128 if ( *(U8*)s == c1
1129 && (ln == 1 || !(OP(c) == EXACTF
1130 ? ibcmp(s, m, ln)
1131 : ibcmp_locale(s, m, ln)))
1132 && (norun || regtry(prog, s)) )
1133 goto got_it;
1134 s++;
1135 }
1136 else
1137 while (s <= e) {
1138 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1139 && (ln == 1 || !(OP(c) == EXACTF
1140 ? ibcmp(s, m, ln)
1141 : ibcmp_locale(s, m, ln)))
1142 && (norun || regtry(prog, s)) )
1143 goto got_it;
1144 s++;
1145 }
b3c9acc1
IZ
1146 }
1147 break;
bbce6d69 1148 case BOUNDL:
3280af22 1149 PL_reg_flags |= RF_tainted;
bbce6d69 1150 /* FALL THROUGH */
a0d0e21e 1151 case BOUND:
ffc61ed2 1152 if (do_utf8) {
12d33761 1153 if (s == PL_bostr)
ffc61ed2
JH
1154 tmp = '\n';
1155 else {
6136c704 1156 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1157 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1158 }
1159 tmp = ((OP(c) == BOUND ?
9041c2e3 1160 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1161 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1162 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1163 if (tmp == !(OP(c) == BOUND ?
3568d838 1164 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1165 isALNUM_LC_utf8((U8*)s)))
1166 {
1167 tmp = !tmp;
1168 if ((norun || regtry(prog, s)))
1169 goto got_it;
1170 }
078c425b 1171 s += uskip;
a687059c 1172 }
a0d0e21e 1173 }
667bb95a 1174 else {
12d33761 1175 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1176 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1177 while (s < strend) {
1178 if (tmp ==
1179 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1180 tmp = !tmp;
1181 if ((norun || regtry(prog, s)))
1182 goto got_it;
1183 }
1184 s++;
a0ed51b3 1185 }
a0ed51b3 1186 }
6eb5f6b9 1187 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1188 goto got_it;
1189 break;
bbce6d69 1190 case NBOUNDL:
3280af22 1191 PL_reg_flags |= RF_tainted;
bbce6d69 1192 /* FALL THROUGH */
a0d0e21e 1193 case NBOUND:
ffc61ed2 1194 if (do_utf8) {
12d33761 1195 if (s == PL_bostr)
ffc61ed2
JH
1196 tmp = '\n';
1197 else {
6136c704 1198 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1199 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1200 }
1201 tmp = ((OP(c) == NBOUND ?
9041c2e3 1202 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1203 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1204 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1205 if (tmp == !(OP(c) == NBOUND ?
3568d838 1206 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1207 isALNUM_LC_utf8((U8*)s)))
1208 tmp = !tmp;
1209 else if ((norun || regtry(prog, s)))
1210 goto got_it;
078c425b 1211 s += uskip;
ffc61ed2 1212 }
a0d0e21e 1213 }
667bb95a 1214 else {
12d33761 1215 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1216 tmp = ((OP(c) == NBOUND ?
1217 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1218 while (s < strend) {
1219 if (tmp ==
1220 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1221 tmp = !tmp;
1222 else if ((norun || regtry(prog, s)))
1223 goto got_it;
1224 s++;
1225 }
a0ed51b3 1226 }
6eb5f6b9 1227 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1228 goto got_it;
1229 break;
a0d0e21e 1230 case ALNUM:
ffc61ed2 1231 if (do_utf8) {
1a4fad37 1232 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1233 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1234 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1235 if (tmp && (norun || regtry(prog, s)))
1236 goto got_it;
1237 else
1238 tmp = doevery;
1239 }
bbce6d69 1240 else
ffc61ed2 1241 tmp = 1;
078c425b 1242 s += uskip;
bbce6d69 1243 }
bbce6d69 1244 }
ffc61ed2
JH
1245 else {
1246 while (s < strend) {
1247 if (isALNUM(*s)) {
1248 if (tmp && (norun || regtry(prog, s)))
1249 goto got_it;
1250 else
1251 tmp = doevery;
1252 }
a0ed51b3 1253 else
ffc61ed2
JH
1254 tmp = 1;
1255 s++;
a0ed51b3 1256 }
a0ed51b3
LW
1257 }
1258 break;
bbce6d69 1259 case ALNUML:
3280af22 1260 PL_reg_flags |= RF_tainted;
ffc61ed2 1261 if (do_utf8) {
078c425b 1262 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1263 if (isALNUM_LC_utf8((U8*)s)) {
1264 if (tmp && (norun || regtry(prog, s)))
1265 goto got_it;
1266 else
1267 tmp = doevery;
1268 }
a687059c 1269 else
ffc61ed2 1270 tmp = 1;
078c425b 1271 s += uskip;
a0d0e21e 1272 }
a0d0e21e 1273 }
ffc61ed2
JH
1274 else {
1275 while (s < strend) {
1276 if (isALNUM_LC(*s)) {
1277 if (tmp && (norun || regtry(prog, s)))
1278 goto got_it;
1279 else
1280 tmp = doevery;
1281 }
a0ed51b3 1282 else
ffc61ed2
JH
1283 tmp = 1;
1284 s++;
a0ed51b3 1285 }
a0ed51b3
LW
1286 }
1287 break;
a0d0e21e 1288 case NALNUM:
ffc61ed2 1289 if (do_utf8) {
1a4fad37 1290 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1291 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1292 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1293 if (tmp && (norun || regtry(prog, s)))
1294 goto got_it;
1295 else
1296 tmp = doevery;
1297 }
bbce6d69 1298 else
ffc61ed2 1299 tmp = 1;
078c425b 1300 s += uskip;
bbce6d69 1301 }
bbce6d69 1302 }
ffc61ed2
JH
1303 else {
1304 while (s < strend) {
1305 if (!isALNUM(*s)) {
1306 if (tmp && (norun || regtry(prog, s)))
1307 goto got_it;
1308 else
1309 tmp = doevery;
1310 }
a0ed51b3 1311 else
ffc61ed2
JH
1312 tmp = 1;
1313 s++;
a0ed51b3 1314 }
a0ed51b3
LW
1315 }
1316 break;
bbce6d69 1317 case NALNUML:
3280af22 1318 PL_reg_flags |= RF_tainted;
ffc61ed2 1319 if (do_utf8) {
078c425b 1320 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1321 if (!isALNUM_LC_utf8((U8*)s)) {
1322 if (tmp && (norun || regtry(prog, s)))
1323 goto got_it;
1324 else
1325 tmp = doevery;
1326 }
a687059c 1327 else
ffc61ed2 1328 tmp = 1;
078c425b 1329 s += uskip;
a687059c 1330 }
a0d0e21e 1331 }
ffc61ed2
JH
1332 else {
1333 while (s < strend) {
1334 if (!isALNUM_LC(*s)) {
1335 if (tmp && (norun || regtry(prog, s)))
1336 goto got_it;
1337 else
1338 tmp = doevery;
1339 }
a0ed51b3 1340 else
ffc61ed2
JH
1341 tmp = 1;
1342 s++;
a0ed51b3 1343 }
a0ed51b3
LW
1344 }
1345 break;
a0d0e21e 1346 case SPACE:
ffc61ed2 1347 if (do_utf8) {
1a4fad37 1348 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1349 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1350 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1351 if (tmp && (norun || regtry(prog, s)))
1352 goto got_it;
1353 else
1354 tmp = doevery;
1355 }
a0d0e21e 1356 else
ffc61ed2 1357 tmp = 1;
078c425b 1358 s += uskip;
2304df62 1359 }
a0d0e21e 1360 }
ffc61ed2
JH
1361 else {
1362 while (s < strend) {
1363 if (isSPACE(*s)) {
1364 if (tmp && (norun || regtry(prog, s)))
1365 goto got_it;
1366 else
1367 tmp = doevery;
1368 }
a0ed51b3 1369 else
ffc61ed2
JH
1370 tmp = 1;
1371 s++;
a0ed51b3 1372 }
a0ed51b3
LW
1373 }
1374 break;
bbce6d69 1375 case SPACEL:
3280af22 1376 PL_reg_flags |= RF_tainted;
ffc61ed2 1377 if (do_utf8) {
078c425b 1378 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1379 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1380 if (tmp && (norun || regtry(prog, s)))
1381 goto got_it;
1382 else
1383 tmp = doevery;
1384 }
bbce6d69 1385 else
ffc61ed2 1386 tmp = 1;
078c425b 1387 s += uskip;
bbce6d69 1388 }
bbce6d69 1389 }
ffc61ed2
JH
1390 else {
1391 while (s < strend) {
1392 if (isSPACE_LC(*s)) {
1393 if (tmp && (norun || regtry(prog, s)))
1394 goto got_it;
1395 else
1396 tmp = doevery;
1397 }
a0ed51b3 1398 else
ffc61ed2
JH
1399 tmp = 1;
1400 s++;
a0ed51b3 1401 }
a0ed51b3
LW
1402 }
1403 break;
a0d0e21e 1404 case NSPACE:
ffc61ed2 1405 if (do_utf8) {
1a4fad37 1406 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1407 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1408 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1409 if (tmp && (norun || regtry(prog, s)))
1410 goto got_it;
1411 else
1412 tmp = doevery;
1413 }
a0d0e21e 1414 else
ffc61ed2 1415 tmp = 1;
078c425b 1416 s += uskip;
a687059c 1417 }
a0d0e21e 1418 }
ffc61ed2
JH
1419 else {
1420 while (s < strend) {
1421 if (!isSPACE(*s)) {
1422 if (tmp && (norun || regtry(prog, s)))
1423 goto got_it;
1424 else
1425 tmp = doevery;
1426 }
a0ed51b3 1427 else
ffc61ed2
JH
1428 tmp = 1;
1429 s++;
a0ed51b3 1430 }
a0ed51b3
LW
1431 }
1432 break;
bbce6d69 1433 case NSPACEL:
3280af22 1434 PL_reg_flags |= RF_tainted;
ffc61ed2 1435 if (do_utf8) {
078c425b 1436 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1437 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1438 if (tmp && (norun || regtry(prog, s)))
1439 goto got_it;
1440 else
1441 tmp = doevery;
1442 }
bbce6d69 1443 else
ffc61ed2 1444 tmp = 1;
078c425b 1445 s += uskip;
bbce6d69 1446 }
bbce6d69 1447 }
ffc61ed2
JH
1448 else {
1449 while (s < strend) {
1450 if (!isSPACE_LC(*s)) {
1451 if (tmp && (norun || regtry(prog, s)))
1452 goto got_it;
1453 else
1454 tmp = doevery;
1455 }
a0ed51b3 1456 else
ffc61ed2
JH
1457 tmp = 1;
1458 s++;
a0ed51b3 1459 }
a0ed51b3
LW
1460 }
1461 break;
a0d0e21e 1462 case DIGIT:
ffc61ed2 1463 if (do_utf8) {
1a4fad37 1464 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1465 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1466 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1467 if (tmp && (norun || regtry(prog, s)))
1468 goto got_it;
1469 else
1470 tmp = doevery;
1471 }
a0d0e21e 1472 else
ffc61ed2 1473 tmp = 1;
078c425b 1474 s += uskip;
2b69d0c2 1475 }
a0d0e21e 1476 }
ffc61ed2
JH
1477 else {
1478 while (s < strend) {
1479 if (isDIGIT(*s)) {
1480 if (tmp && (norun || regtry(prog, s)))
1481 goto got_it;
1482 else
1483 tmp = doevery;
1484 }
a0ed51b3 1485 else
ffc61ed2
JH
1486 tmp = 1;
1487 s++;
a0ed51b3 1488 }
a0ed51b3
LW
1489 }
1490 break;
b8c5462f
JH
1491 case DIGITL:
1492 PL_reg_flags |= RF_tainted;
ffc61ed2 1493 if (do_utf8) {
078c425b 1494 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1495 if (isDIGIT_LC_utf8((U8*)s)) {
1496 if (tmp && (norun || regtry(prog, s)))
1497 goto got_it;
1498 else
1499 tmp = doevery;
1500 }
b8c5462f 1501 else
ffc61ed2 1502 tmp = 1;
078c425b 1503 s += uskip;
b8c5462f 1504 }
b8c5462f 1505 }
ffc61ed2
JH
1506 else {
1507 while (s < strend) {
1508 if (isDIGIT_LC(*s)) {
1509 if (tmp && (norun || regtry(prog, s)))
1510 goto got_it;
1511 else
1512 tmp = doevery;
1513 }
b8c5462f 1514 else
ffc61ed2
JH
1515 tmp = 1;
1516 s++;
b8c5462f 1517 }
b8c5462f
JH
1518 }
1519 break;
a0d0e21e 1520 case NDIGIT:
ffc61ed2 1521 if (do_utf8) {
1a4fad37 1522 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1523 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1524 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1525 if (tmp && (norun || regtry(prog, s)))
1526 goto got_it;
1527 else
1528 tmp = doevery;
1529 }
a0d0e21e 1530 else
ffc61ed2 1531 tmp = 1;
078c425b 1532 s += uskip;
a687059c 1533 }
a0d0e21e 1534 }
ffc61ed2
JH
1535 else {
1536 while (s < strend) {
1537 if (!isDIGIT(*s)) {
1538 if (tmp && (norun || regtry(prog, s)))
1539 goto got_it;
1540 else
1541 tmp = doevery;
1542 }
a0ed51b3 1543 else
ffc61ed2
JH
1544 tmp = 1;
1545 s++;
a0ed51b3 1546 }
a0ed51b3
LW
1547 }
1548 break;
b8c5462f
JH
1549 case NDIGITL:
1550 PL_reg_flags |= RF_tainted;
ffc61ed2 1551 if (do_utf8) {
078c425b 1552 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1553 if (!isDIGIT_LC_utf8((U8*)s)) {
1554 if (tmp && (norun || regtry(prog, s)))
1555 goto got_it;
1556 else
1557 tmp = doevery;
1558 }
b8c5462f 1559 else
ffc61ed2 1560 tmp = 1;
078c425b 1561 s += uskip;
b8c5462f 1562 }
a0ed51b3 1563 }
ffc61ed2
JH
1564 else {
1565 while (s < strend) {
1566 if (!isDIGIT_LC(*s)) {
1567 if (tmp && (norun || regtry(prog, s)))
1568 goto got_it;
1569 else
1570 tmp = doevery;
1571 }
cf93c79d 1572 else
ffc61ed2
JH
1573 tmp = 1;
1574 s++;
b8c5462f 1575 }
b8c5462f
JH
1576 }
1577 break;
b3c9acc1 1578 default:
3c3eec57
GS
1579 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1580 break;
d6a28714 1581 }
6eb5f6b9
JH
1582 return 0;
1583 got_it:
1584 return s;
1585}
1586
1587/*
1588 - regexec_flags - match a regexp against a string
1589 */
1590I32
1591Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1592 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1593/* strend: pointer to null at end of string */
1594/* strbeg: real beginning of string */
1595/* minend: end of match must be >=minend after stringarg. */
1596/* data: May be used for some additional optimizations. */
1597/* nosave: For optimizations. */
1598{
97aff369 1599 dVAR;
6eb5f6b9
JH
1600 register char *s;
1601 register regnode *c;
1602 register char *startpos = stringarg;
6eb5f6b9
JH
1603 I32 minlen; /* must match at least this many chars */
1604 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1605 I32 end_shift = 0; /* Same for the end. */ /* CC */
1606 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1607 char *scream_olds = NULL;
6eb5f6b9 1608 SV* oreplsv = GvSV(PL_replgv);
1df70142 1609 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1610 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1611#ifdef DEBUGGING
6136c704
AL
1612 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1613 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1614#endif
a3621e74
YO
1615
1616 GET_RE_DEBUG_FLAGS_DECL;
1617
9d4ba2ae 1618 PERL_UNUSED_ARG(data);
a30b2f1f 1619 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9 1620
4f639d21 1621 PL_reg_re = prog;
6eb5f6b9 1622#ifdef DEBUGGING
aea4f609 1623 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1624#endif
1625
1626 /* Be paranoid... */
1627 if (prog == NULL || startpos == NULL) {
1628 Perl_croak(aTHX_ "NULL regexp parameter");
1629 return 0;
1630 }
1631
1632 minlen = prog->minlen;
61a36c01 1633 if (strend - startpos < minlen) {
a3621e74 1634 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1635 "String too short [regexec_flags]...\n"));
1636 goto phooey;
1aa99e6b 1637 }
6eb5f6b9 1638
6eb5f6b9
JH
1639 /* Check validity of program. */
1640 if (UCHARAT(prog->program) != REG_MAGIC) {
1641 Perl_croak(aTHX_ "corrupted regexp program");
1642 }
1643
1644 PL_reg_flags = 0;
1645 PL_reg_eval_set = 0;
1646 PL_reg_maxiter = 0;
1647
1648 if (prog->reganch & ROPT_UTF8)
1649 PL_reg_flags |= RF_utf8;
1650
1651 /* Mark beginning of line for ^ and lookbehind. */
1652 PL_regbol = startpos;
1653 PL_bostr = strbeg;
1654 PL_reg_sv = sv;
1655
1656 /* Mark end of line for $ (and such) */
1657 PL_regeol = strend;
1658
1659 /* see how far we have to get to not match where we matched before */
1660 PL_regtill = startpos+minend;
1661
1662 /* We start without call_cc context. */
1663 PL_reg_call_cc = 0;
1664
1665 /* If there is a "must appear" string, look for it. */
1666 s = startpos;
1667
1668 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1669 MAGIC *mg;
1670
1671 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1672 PL_reg_ganch = startpos;
1673 else if (sv && SvTYPE(sv) >= SVt_PVMG
1674 && SvMAGIC(sv)
14befaf4
DM
1675 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1676 && mg->mg_len >= 0) {
6eb5f6b9
JH
1677 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1678 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1679 if (s > PL_reg_ganch)
6eb5f6b9
JH
1680 goto phooey;
1681 s = PL_reg_ganch;
1682 }
1683 }
1684 else /* pos() not defined */
1685 PL_reg_ganch = strbeg;
1686 }
1687
a0714e2c 1688 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1689 re_scream_pos_data d;
1690
1691 d.scream_olds = &scream_olds;
1692 d.scream_pos = &scream_pos;
1693 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1694 if (!s) {
a3621e74 1695 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1696 goto phooey; /* not present */
3fa9c3d7 1697 }
6eb5f6b9
JH
1698 }
1699
a3621e74 1700 DEBUG_EXECUTE_r({
1df70142
AL
1701 const char * const s0 = UTF
1702 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1703 UNI_DISPLAY_REGEX)
1704 : prog->precomp;
1705 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1706 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1707 UNI_DISPLAY_REGEX) : startpos;
1df70142 1708 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1709 if (!PL_colorset)
1710 reginitcolors();
1711 PerlIO_printf(Perl_debug_log,
a0288114 1712 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1713 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1714 len0, len0, s0,
2a782b5b 1715 PL_colors[1],
9e55ce06 1716 len0 > 60 ? "..." : "",
2a782b5b 1717 PL_colors[0],
9e55ce06
JH
1718 (int)(len1 > 60 ? 60 : len1),
1719 s1, PL_colors[1],
1720 (len1 > 60 ? "..." : "")
2a782b5b
JH
1721 );
1722 });
6eb5f6b9
JH
1723
1724 /* Simplest case: anchored match need be tried only once. */
1725 /* [unless only anchor is BOL and multiline is set] */
1726 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1727 if (s == startpos && regtry(prog, startpos))
1728 goto got_it;
7fba1cd6 1729 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1730 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1731 {
1732 char *end;
1733
1734 if (minlen)
1735 dontbother = minlen - 1;
1aa99e6b 1736 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1737 /* for multiline we only have to try after newlines */
33b8afdf 1738 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1739 if (s == startpos)
1740 goto after_try;
1741 while (1) {
1742 if (regtry(prog, s))
1743 goto got_it;
1744 after_try:
1745 if (s >= end)
1746 goto phooey;
1747 if (prog->reganch & RE_USE_INTUIT) {
1748 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1749 if (!s)
1750 goto phooey;
1751 }
1752 else
1753 s++;
1754 }
1755 } else {
1756 if (s > startpos)
1757 s--;
1758 while (s < end) {
1759 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1760 if (regtry(prog, s))
1761 goto got_it;
1762 }
1763 }
1764 }
1765 }
1766 goto phooey;
1767 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1768 if (regtry(prog, PL_reg_ganch))
1769 goto got_it;
1770 goto phooey;
1771 }
1772
1773 /* Messy cases: unanchored match. */
33b8afdf 1774 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1775 /* we have /x+whatever/ */
1776 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1777 char ch;
bf93d4cc
GS
1778#ifdef DEBUGGING
1779 int did_match = 0;
1780#endif
33b8afdf
JH
1781 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1782 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1783 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1784
1aa99e6b 1785 if (do_utf8) {
6eb5f6b9
JH
1786 while (s < strend) {
1787 if (*s == ch) {
a3621e74 1788 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1789 if (regtry(prog, s)) goto got_it;
1790 s += UTF8SKIP(s);
1791 while (s < strend && *s == ch)
1792 s += UTF8SKIP(s);
1793 }
1794 s += UTF8SKIP(s);
1795 }
1796 }
1797 else {
1798 while (s < strend) {
1799 if (*s == ch) {
a3621e74 1800 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1801 if (regtry(prog, s)) goto got_it;
1802 s++;
1803 while (s < strend && *s == ch)
1804 s++;
1805 }
1806 s++;
1807 }
1808 }
a3621e74 1809 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1810 PerlIO_printf(Perl_debug_log,
b7953727
JH
1811 "Did not find anchored character...\n")
1812 );
6eb5f6b9 1813 }
a0714e2c
SS
1814 else if (prog->anchored_substr != NULL
1815 || prog->anchored_utf8 != NULL
1816 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1817 && prog->float_max_offset < strend - s)) {
1818 SV *must;
1819 I32 back_max;
1820 I32 back_min;
1821 char *last;
6eb5f6b9 1822 char *last1; /* Last position checked before */
bf93d4cc
GS
1823#ifdef DEBUGGING
1824 int did_match = 0;
1825#endif
33b8afdf
JH
1826 if (prog->anchored_substr || prog->anchored_utf8) {
1827 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1828 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1829 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1830 back_max = back_min = prog->anchored_offset;
1831 } else {
1832 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1833 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1834 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1835 back_max = prog->float_max_offset;
1836 back_min = prog->float_min_offset;
1837 }
1838 if (must == &PL_sv_undef)
1839 /* could not downgrade utf8 check substring, so must fail */
1840 goto phooey;
1841
1842 last = HOP3c(strend, /* Cannot start after this */
1843 -(I32)(CHR_SVLEN(must)
1844 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1845
1846 if (s > PL_bostr)
1847 last1 = HOPc(s, -1);
1848 else
1849 last1 = s - 1; /* bogus */
1850
a0288114 1851 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1852 check_substr==must. */
1853 scream_pos = -1;
1854 dontbother = end_shift;
1855 strend = HOPc(strend, -dontbother);
1856 while ( (s <= last) &&
9041c2e3 1857 ((flags & REXEC_SCREAM)
1aa99e6b 1858 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1859 end_shift, &scream_pos, 0))
1aa99e6b 1860 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1861 (unsigned char*)strend, must,
7fba1cd6 1862 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1863 /* we may be pointing at the wrong string */
1864 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1865 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1866 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1867 if (HOPc(s, -back_max) > last1) {
1868 last1 = HOPc(s, -back_min);
1869 s = HOPc(s, -back_max);
1870 }
1871 else {
52657f30 1872 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1873
1874 last1 = HOPc(s, -back_min);
52657f30 1875 s = t;
6eb5f6b9 1876 }
1aa99e6b 1877 if (do_utf8) {
6eb5f6b9
JH
1878 while (s <= last1) {
1879 if (regtry(prog, s))
1880 goto got_it;
1881 s += UTF8SKIP(s);
1882 }
1883 }
1884 else {
1885 while (s <= last1) {
1886 if (regtry(prog, s))
1887 goto got_it;
1888 s++;
1889 }
1890 }
1891 }
a3621e74 1892 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1893 PerlIO_printf(Perl_debug_log,
a0288114 1894 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1895 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1896 ? "anchored" : "floating"),
1897 PL_colors[0],
1898 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1899 SvPVX_const(must),
b7953727
JH
1900 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1901 );
6eb5f6b9
JH
1902 goto phooey;
1903 }
155aba94 1904 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1905 if (minlen) {
1906 I32 op = (U8)OP(prog->regstclass);
66e933ab 1907 /* don't bother with what can't match */
f14c76ed
RGS
1908 if (PL_regkind[op] != EXACT && op != CANY)
1909 strend = HOPc(strend, -(minlen - 1));
1910 }
a3621e74 1911 DEBUG_EXECUTE_r({
ffc61ed2 1912 SV *prop = sv_newmortal();
cfd0369c
NC
1913 const char *s0;
1914 const char *s1;
9e55ce06
JH
1915 int len0;
1916 int len1;
1917
ffc61ed2 1918 regprop(prop, c);
9e55ce06 1919 s0 = UTF ?
3f7c398e 1920 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1921 UNI_DISPLAY_REGEX) :
cfd0369c 1922 SvPVX_const(prop);
9e55ce06
JH
1923 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1924 s1 = UTF ?
c728cb41 1925 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1926 len1 = UTF ? SvCUR(dsv1) : strend - s;
1927 PerlIO_printf(Perl_debug_log,
a0288114 1928 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1929 len0, len0, s0,
1930 len1, len1, s1);
ffc61ed2 1931 });
06b5626a 1932 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1933 goto got_it;
a3621e74 1934 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1935 }
1936 else {
1937 dontbother = 0;
a0714e2c 1938 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1939 /* Trim the end. */
d6a28714 1940 char *last;
33b8afdf
JH
1941 SV* float_real;
1942
1943 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1944 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1945 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1946
1947 if (flags & REXEC_SCREAM) {
33b8afdf 1948 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1949 end_shift, &scream_pos, 1); /* last one */
1950 if (!last)
ffc61ed2 1951 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1952 /* we may be pointing at the wrong string */
1953 else if (RX_MATCH_COPIED(prog))
3f7c398e 1954 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1955 }
d6a28714
JH
1956 else {
1957 STRLEN len;
cfd0369c 1958 const char * const little = SvPV_const(float_real, len);
d6a28714 1959
33b8afdf 1960 if (SvTAIL(float_real)) {
d6a28714
JH
1961 if (memEQ(strend - len + 1, little, len - 1))
1962 last = strend - len + 1;
7fba1cd6 1963 else if (!multiline)
9041c2e3 1964 last = memEQ(strend - len, little, len)
bd61b366 1965 ? strend - len : NULL;
b8c5462f 1966 else
d6a28714
JH
1967 goto find_last;
1968 } else {
1969 find_last:
9041c2e3 1970 if (len)
d6a28714 1971 last = rninstr(s, strend, little, little + len);
b8c5462f 1972 else
a0288114 1973 last = strend; /* matching "$" */
b8c5462f 1974 }
b8c5462f 1975 }
bf93d4cc 1976 if (last == NULL) {
a3621e74 1977 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1978 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1979 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1980 goto phooey; /* Should not happen! */
1981 }
d6a28714
JH
1982 dontbother = strend - last + prog->float_min_offset;
1983 }
1984 if (minlen && (dontbother < minlen))
1985 dontbother = minlen - 1;
1986 strend -= dontbother; /* this one's always in bytes! */
1987 /* We don't know much -- general case. */
1aa99e6b 1988 if (do_utf8) {
d6a28714
JH
1989 for (;;) {
1990 if (regtry(prog, s))
1991 goto got_it;
1992 if (s >= strend)
1993 break;
b8c5462f 1994 s += UTF8SKIP(s);
d6a28714
JH
1995 };
1996 }
1997 else {
1998 do {
1999 if (regtry(prog, s))
2000 goto got_it;
2001 } while (s++ < strend);
2002 }
2003 }
2004
2005 /* Failure. */
2006 goto phooey;
2007
2008got_it:
2009 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2010
2011 if (PL_reg_eval_set) {
2012 /* Preserve the current value of $^R */
2013 if (oreplsv != GvSV(PL_replgv))
2014 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2015 restored, the value remains
2016 the same. */
4f639d21 2017 restore_pos(aTHX_ prog);
d6a28714
JH
2018 }
2019
2020 /* make sure $`, $&, $', and $digit will work later */
2021 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2022 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2023 if (flags & REXEC_COPY_STR) {
2024 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2025#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2026 if ((SvIsCOW(sv)
2027 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2028 if (DEBUG_C_TEST) {
2029 PerlIO_printf(Perl_debug_log,
2030 "Copy on write: regexp capture, type %d\n",
2031 (int) SvTYPE(sv));
2032 }
2033 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2034 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2035 assert (SvPOKp(prog->saved_copy));
2036 } else
2037#endif
2038 {
2039 RX_MATCH_COPIED_on(prog);
2040 s = savepvn(strbeg, i);
2041 prog->subbeg = s;
2042 }
d6a28714 2043 prog->sublen = i;
d6a28714
JH
2044 }
2045 else {
2046 prog->subbeg = strbeg;
2047 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2048 }
2049 }
9041c2e3 2050
d6a28714
JH
2051 return 1;
2052
2053phooey:
a3621e74 2054 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2055 PL_colors[4], PL_colors[5]));
d6a28714 2056 if (PL_reg_eval_set)
4f639d21 2057 restore_pos(aTHX_ prog);
d6a28714
JH
2058 return 0;
2059}
2060
2061/*
2062 - regtry - try match at specific point
2063 */
2064STATIC I32 /* 0 failure, 1 success */
2065S_regtry(pTHX_ regexp *prog, char *startpos)
2066{
97aff369 2067 dVAR;
d6a28714
JH
2068 register I32 i;
2069 register I32 *sp;
2070 register I32 *ep;
2071 CHECKPOINT lastcp;
a3621e74 2072 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2073
02db2b7b
IZ
2074#ifdef DEBUGGING
2075 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2076#endif
d6a28714
JH
2077 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2078 MAGIC *mg;
2079
2080 PL_reg_eval_set = RS_init;
a3621e74 2081 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2082 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2083 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2084 ));
e8347627 2085 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2086 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2087 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2088 SAVETMPS;
2089 /* Apparently this is not needed, judging by wantarray. */
e8347627 2090 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2091 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2092
2093 if (PL_reg_sv) {
2094 /* Make $_ available to executed code. */
2095 if (PL_reg_sv != DEFSV) {
59f00321 2096 SAVE_DEFSV;
d6a28714 2097 DEFSV = PL_reg_sv;
b8c5462f 2098 }
d6a28714 2099
9041c2e3 2100 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2101 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2102 /* prepare for quick setting of pos */
d300d9fa
NC
2103#ifdef PERL_OLD_COPY_ON_WRITE
2104 if (SvIsCOW(sv))
2105 sv_force_normal_flags(sv, 0);
2106#endif
2107 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
2108 &PL_vtbl_mglob, NULL, 0);
d6a28714 2109 mg->mg_len = -1;
b8c5462f 2110 }
d6a28714
JH
2111 PL_reg_magic = mg;
2112 PL_reg_oldpos = mg->mg_len;
4f639d21 2113 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2114 }
09687e5a 2115 if (!PL_reg_curpm) {
a02a5408 2116 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2117#ifdef USE_ITHREADS
2118 {
2119 SV* repointer = newSViv(0);
577e12cc 2120 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2121 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2122 av_push(PL_regex_padav,repointer);
2123 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2124 PL_regex_pad = AvARRAY(PL_regex_padav);
2125 }
2126#endif
2127 }
aaa362c4 2128 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2129 PL_reg_oldcurpm = PL_curpm;
2130 PL_curpm = PL_reg_curpm;
2131 if (RX_MATCH_COPIED(prog)) {
2132 /* Here is a serious problem: we cannot rewrite subbeg,
2133 since it may be needed if this match fails. Thus
2134 $` inside (?{}) could fail... */
2135 PL_reg_oldsaved = prog->subbeg;
2136 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2137#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2138 PL_nrs = prog->saved_copy;
2139#endif
d6a28714
JH
2140 RX_MATCH_COPIED_off(prog);
2141 }
2142 else
bd61b366 2143 PL_reg_oldsaved = NULL;
d6a28714
JH
2144 prog->subbeg = PL_bostr;
2145 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2146 }
973dddac 2147 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2148 PL_reginput = startpos;
2149 PL_regstartp = prog->startp;
2150 PL_regendp = prog->endp;
2151 PL_reglastparen = &prog->lastparen;
a01268b5 2152 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2153 prog->lastparen = 0;
03994de8 2154 prog->lastcloseparen = 0;
d6a28714 2155 PL_regsize = 0;
a3621e74 2156 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2157 if (PL_reg_start_tmpl <= prog->nparens) {
2158 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2159 if(PL_reg_start_tmp)
2160 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2161 else
a02a5408 2162 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2163 }
2164
2165 /* XXXX What this code is doing here?!!! There should be no need
2166 to do this again and again, PL_reglastparen should take care of
3dd2943c 2167 this! --ilya*/
dafc8851
JH
2168
2169 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2170 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2171 * PL_reglastparen), is not needed at all by the test suite
2172 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2173 * enough, for building DynaLoader, or otherwise this
2174 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2175 * will happen. Meanwhile, this code *is* needed for the
2176 * above-mentioned test suite tests to succeed. The common theme
2177 * on those tests seems to be returning null fields from matches.
2178 * --jhi */
dafc8851 2179#if 1
d6a28714
JH
2180 sp = prog->startp;
2181 ep = prog->endp;
2182 if (prog->nparens) {
eb160463 2183 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2184 *++sp = -1;
2185 *++ep = -1;
2186 }
2187 }
dafc8851 2188#endif
02db2b7b 2189 REGCP_SET(lastcp);
4f639d21 2190 if (regmatch(prog, prog->program + 1)) {
d6a28714
JH
2191 prog->endp[0] = PL_reginput - PL_bostr;
2192 return 1;
2193 }
02db2b7b 2194 REGCP_UNWIND(lastcp);
d6a28714
JH
2195 return 0;
2196}
2197
02db2b7b
IZ
2198#define RE_UNWIND_BRANCH 1
2199#define RE_UNWIND_BRANCHJ 2
2200
2201union re_unwind_t;
2202
2203typedef struct { /* XX: makes sense to enlarge it... */
2204 I32 type;
2205 I32 prev;
2206 CHECKPOINT lastcp;
2207} re_unwind_generic_t;
2208
2209typedef struct {
2210 I32 type;
2211 I32 prev;
2212 CHECKPOINT lastcp;
2213 I32 lastparen;
2214 regnode *next;
2215 char *locinput;
2216 I32 nextchr;
2217#ifdef DEBUGGING
2218 int regindent;
2219#endif
2220} re_unwind_branch_t;
2221
2222typedef union re_unwind_t {
2223 I32 type;
2224 re_unwind_generic_t generic;
2225 re_unwind_branch_t branch;
2226} re_unwind_t;
2227
8ba1375e
MJD
2228#define sayYES goto yes
2229#define sayNO goto no
e0f9d4a8 2230#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2231#define sayYES_FINAL goto yes_final
2232#define sayYES_LOUD goto yes_loud
2233#define sayNO_FINAL goto no_final
2234#define sayNO_SILENT goto do_no
2235#define saySAME(x) if (x) goto yes; else goto no
2236
3ab3c9b4
HS
2237#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2238#define POSCACHE_SEEN 1 /* we know what we're caching */
2239#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2240#define CACHEsayYES STMT_START { \
d8319b27 2241 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
3ab3c9b4
HS
2242 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2243 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2244 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2245 /* cache records failure, but this is success */ \
2246 DEBUG_r( \
2247 PerlIO_printf(Perl_debug_log, \
2248 "%*s (remove success from failure cache)\n", \
2249 REPORT_CODE_OFF+PL_regindent*2, "") \
2250 ); \
d8319b27 2251 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2252 } \
2253 } \
2254 sayYES; \
2255} STMT_END
2256#define CACHEsayNO STMT_START { \
d8319b27 2257 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
3ab3c9b4
HS
2258 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2259 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2260 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2261 /* cache records success, but this is failure */ \
2262 DEBUG_r( \
2263 PerlIO_printf(Perl_debug_log, \
2264 "%*s (remove failure from success cache)\n", \
2265 REPORT_CODE_OFF+PL_regindent*2, "") \
2266 ); \
d8319b27 2267 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2268 } \
2269 } \
2270 sayNO; \
2271} STMT_END
2272
a3621e74
YO
2273/* this is used to determine how far from the left messages like
2274 'failed...' are printed. Currently 29 makes these messages line
2275 up with the opcode they refer to. Earlier perls used 25 which
2276 left these messages outdented making reviewing a debug output
2277 quite difficult.
2278*/
2279#define REPORT_CODE_OFF 29
2280
2281
2282/* Make sure there is a test for this +1 options in re_tests */
2283#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2284
5d9a96ca
DM
2285/* grab a new slab and return the first slot in it */
2286
2287STATIC regmatch_state *
2288S_push_slab(pTHX)
2289{
2290 regmatch_slab *s = PL_regmatch_slab->next;
2291 if (!s) {
2292 Newx(s, 1, regmatch_slab);
2293 s->prev = PL_regmatch_slab;
2294 s->next = NULL;
2295 PL_regmatch_slab->next = s;
2296 }
2297 PL_regmatch_slab = s;
2298 return &s->states[0];
2299}
5b47454d 2300
95b24440
DM
2301/* simulate a recursive call to regmatch */
2302
2303#define REGMATCH(ns, where) \
5d9a96ca
DM
2304 st->scan = scan; \
2305 scan = (ns); \
2306 st->resume_state = resume_##where; \
95b24440
DM
2307 goto start_recurse; \
2308 resume_point_##where:
2309
d6a28714
JH
2310/*
2311 - regmatch - main matching routine
2312 *
2313 * Conceptually the strategy is simple: check to see whether the current
2314 * node matches, call self recursively to see whether the rest matches,
2315 * and then act accordingly. In practice we make some effort to avoid
2316 * recursion, in particular by going through "ordinary" nodes (that don't
2317 * need to know whether the rest of the match failed) by a loop instead of
2318 * by recursion.
2319 */
2320/* [lwall] I've hoisted the register declarations to the outer block in order to
2321 * maybe save a little bit of pushing and popping on the stack. It also takes
2322 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2323 *
2324 * This function used to be heavily recursive, but since this had the
2325 * effect of blowing the CPU stack on complex regexes, it has been
2326 * restructured to be iterative, and to save state onto the heap rather
2327 * than the stack. Essentially whereever regmatch() used to be called, it
2328 * pushes the current state, notes where to return, then jumps back into
2329 * the main loop.
2330 *
2331 * Originally the structure of this function used to look something like
2332
2333 S_regmatch() {
2334 int a = 1, b = 2;
2335 ...
2336 while (scan != NULL) {
5d9a96ca 2337 a++; // do stuff with a and b
95b24440
DM
2338 ...
2339 switch (OP(scan)) {
2340 case FOO: {
2341 int local = 3;
2342 ...
2343 if (regmatch(...)) // recurse
2344 goto yes;
2345 }
2346 ...
2347 }
2348 }
2349 yes:
2350 return 1;
2351 }
2352
2353 * Now it looks something like this:
2354
5d9a96ca 2355 typedef struct {
95b24440
DM
2356 int a, b, local;
2357 int resume_state;
5d9a96ca 2358 } regmatch_state;
95b24440
DM
2359
2360 S_regmatch() {
5d9a96ca
DM
2361 regmatch_state *st = new();
2362 int depth=0;
2363 st->a++; // do stuff with a and b
95b24440
DM
2364 ...
2365 while (scan != NULL) {
2366 ...
2367 switch (OP(scan)) {
2368 case FOO: {
5d9a96ca 2369 st->local = 3;
95b24440 2370 ...
5d9a96ca
DM
2371 st->scan = scan;
2372 scan = ...;
2373 st->resume_state = resume_FOO;
2374 goto start_recurse; // recurse
95b24440 2375
5d9a96ca
DM
2376 resume_point_FOO:
2377 if (result)
95b24440
DM
2378 goto yes;
2379 }
2380 ...
2381 }
5d9a96ca
DM
2382 start_recurse:
2383 st = new(); push a new state
2384 st->a = 1; st->b = 2;
2385 depth++;
95b24440 2386 }
5d9a96ca 2387 yes:
95b24440 2388 result = 1;
5d9a96ca
DM
2389 if (depth--) {
2390 st = pop();
95b24440
DM
2391 switch (resume_state) {
2392 case resume_FOO:
2393 goto resume_point_FOO;
2394 ...
2395 }
2396 }
2397 return result
2398 }
2399
2400 * WARNING: this means that any line in this function that contains a
2401 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2402 * regmatch() using gotos instead. Thus the values of any local variables
2403 * not saved in the regmatch_state structure will have been lost when
2404 * execution resumes on the next line .
5d9a96ca
DM
2405 *
2406 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2407 * PL_regmatch_state always points to the currently active state, and
2408 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2409 * The first time regmatch is called, the first slab is allocated, and is
2410 * never freed until interpreter desctruction. When the slab is full,
2411 * a new one is allocated chained to the end. At exit from regmatch, slabs
2412 * allocated since entry are freed.
d6a28714 2413 */
95b24440
DM
2414
2415
d6a28714 2416STATIC I32 /* 0 failure, 1 success */
4f639d21 2417S_regmatch(pTHX_ regexp *rex, regnode *prog)
d6a28714 2418{
27da23d5 2419 dVAR;
95b24440 2420 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2421 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2422
5d9a96ca
DM
2423 regmatch_slab *orig_slab;
2424 regmatch_state *orig_state;
a3621e74 2425
5d9a96ca
DM
2426 /* the current state. This is a cached copy of PL_regmatch_state */
2427 register regmatch_state *st;
95b24440 2428
5d9a96ca
DM
2429 /* cache heavy used fields of st in registers */
2430 register regnode *scan;
2431 register regnode *next;
2432 register I32 n = 0; /* initialize to shut up compiler warning */
2433 register char *locinput = PL_reginput;
95b24440 2434
5d9a96ca
DM
2435 /* these variables are NOT saved during a recusive RFEGMATCH: */
2436 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2437 bool result; /* return value of S_regmatch */
2438 regnode *inner; /* Next node in internal branch. */
2439 int depth = 0; /* depth of recursion */
95b24440
DM
2440
2441#ifdef DEBUGGING
ab74612d 2442 SV *re_debug_flags = NULL;
a3621e74 2443 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2444 PL_regindent++;
2445#endif
2446
5d9a96ca
DM
2447 /* on first ever call to regmatch, allocate first slab */
2448 if (!PL_regmatch_slab) {
2449 Newx(PL_regmatch_slab, 1, regmatch_slab);
2450 PL_regmatch_slab->prev = NULL;
2451 PL_regmatch_slab->next = NULL;
2452 PL_regmatch_state = &PL_regmatch_slab->states[0] - 1;
2453 }
2454
2455 /* remember current high-water mark for exit */
2456 /* XXX this should be done with SAVE* instead */
2457 orig_slab = PL_regmatch_slab;
2458 orig_state = PL_regmatch_state;
2459
2460 /* grab next free state slot */
2461 st = ++PL_regmatch_state;
2462 if (st > &(PL_regmatch_slab->states[PERL_REGMATCH_SLAB_SLOTS-1]))
2463 st = PL_regmatch_state = S_push_slab(aTHX);
2464
2465 st->minmod = 0;
2466 st->sw = 0;
2467 st->logical = 0;
2468 st->unwind = 0;
2469 st->cc = NULL;
d6a28714
JH
2470 /* Note that nextchr is a byte even in UTF */
2471 nextchr = UCHARAT(locinput);
2472 scan = prog;
2473 while (scan != NULL) {
8ba1375e 2474
a3621e74 2475 DEBUG_EXECUTE_r( {
6136c704 2476 SV * const prop = sv_newmortal();
1df70142
AL
2477 const int docolor = *PL_colors[0];
2478 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2479 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2480 /* The part of the string before starttry has one color
2481 (pref0_len chars), between starttry and current
2482 position another one (pref_len - pref0_len chars),
2483 after the current position the third one.
2484 We assume that pref0_len <= pref_len, otherwise we
2485 decrease pref0_len. */
9041c2e3 2486 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2487 ? (5 + taill) - l : locinput - PL_bostr;
2488 int pref0_len;
d6a28714 2489
df1ffd02 2490 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2491 pref_len++;
2492 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2493 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2494 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2495 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2496 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2497 l--;
d6a28714
JH
2498 if (pref0_len < 0)
2499 pref0_len = 0;
2500 if (pref0_len > pref_len)
2501 pref0_len = pref_len;
2502 regprop(prop, scan);
2a782b5b 2503 {
1df70142 2504 const char * const s0 =
f14c76ed 2505 do_utf8 && OP(scan) != CANY ?
95b24440 2506 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2507 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2508 locinput - pref_len;
1df70142
AL
2509 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2510 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2511 pv_uni_display(PERL_DEBUG_PAD(1),
2512 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2513 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2514 locinput - pref_len + pref0_len;
1df70142
AL
2515 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2516 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2517 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2518 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2519 locinput;
1df70142 2520 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2521 PerlIO_printf(Perl_debug_log,
2522 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2523 (IV)(locinput - PL_bostr),
2524 PL_colors[4],
2525 len0, s0,
2526 PL_colors[5],
2527 PL_colors[2],
2528 len1, s1,
2529 PL_colors[3],
2530 (docolor ? "" : "> <"),
2531 PL_colors[0],
2532 len2, s2,
2533 PL_colors[1],
2534 15 - l - pref_len + 1,
2535 "",
4f639d21 2536 (IV)(scan - rex->program), PL_regindent*2, "",
3f7c398e 2537 SvPVX_const(prop));
2a782b5b
JH
2538 }
2539 });
d6a28714
JH
2540
2541 next = scan + NEXT_OFF(scan);
2542 if (next == scan)
2543 next = NULL;
2544
2545 switch (OP(scan)) {
2546 case BOL:
7fba1cd6 2547 if (locinput == PL_bostr)
d6a28714
JH
2548 {
2549 /* regtill = regbol; */
b8c5462f
JH
2550 break;
2551 }
d6a28714
JH
2552 sayNO;
2553 case MBOL:
12d33761
HS
2554 if (locinput == PL_bostr ||
2555 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2556 {
b8c5462f
JH
2557 break;
2558 }
d6a28714
JH
2559 sayNO;
2560 case SBOL:
c2a73568 2561 if (locinput == PL_bostr)
b8c5462f 2562 break;
d6a28714
JH
2563 sayNO;
2564 case GPOS:
2565 if (locinput == PL_reg_ganch)
2566 break;
2567 sayNO;
2568 case EOL:
d6a28714
JH
2569 goto seol;
2570 case MEOL:
d6a28714 2571 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2572 sayNO;
b8c5462f 2573 break;
d6a28714
JH
2574 case SEOL:
2575 seol:
2576 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2577 sayNO;
d6a28714 2578 if (PL_regeol - locinput > 1)
b8c5462f 2579 sayNO;
b8c5462f 2580 break;
d6a28714
JH
2581 case EOS:
2582 if (PL_regeol != locinput)
b8c5462f 2583 sayNO;
d6a28714 2584 break;
ffc61ed2 2585 case SANY:
d6a28714 2586 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2587 sayNO;
f33976b4
DB
2588 if (do_utf8) {
2589 locinput += PL_utf8skip[nextchr];
2590 if (locinput > PL_regeol)
2591 sayNO;
2592 nextchr = UCHARAT(locinput);
2593 }
2594 else
2595 nextchr = UCHARAT(++locinput);
2596 break;
2597 case CANY:
2598 if (!nextchr && locinput >= PL_regeol)
2599 sayNO;
b8c5462f 2600 nextchr = UCHARAT(++locinput);
a0d0e21e 2601 break;
ffc61ed2 2602 case REG_ANY:
1aa99e6b
IH
2603 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2604 sayNO;
2605 if (do_utf8) {
b8c5462f 2606 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2607 if (locinput > PL_regeol)
2608 sayNO;
a0ed51b3 2609 nextchr = UCHARAT(locinput);
a0ed51b3 2610 }
1aa99e6b
IH
2611 else
2612 nextchr = UCHARAT(++locinput);
a0ed51b3 2613 break;
a3621e74
YO
2614
2615
2616
2617 /*
2618 traverse the TRIE keeping track of all accepting states
2619 we transition through until we get to a failing node.
2620
a3621e74
YO
2621
2622 */
5b47454d 2623 case TRIE:
a3621e74
YO
2624 case TRIEF:
2625 case TRIEFL:
2626 {
a3621e74
YO
2627 U8 *uc = ( U8* )locinput;
2628 U32 state = 1;
2629 U16 charid = 0;
2630 U32 base = 0;
2631 UV uvc = 0;
2632 STRLEN len = 0;
2633 STRLEN foldlen = 0;
a3621e74
YO
2634 U8 *uscan = (U8*)NULL;
2635 STRLEN bufflen=0;
95b24440 2636 SV *sv_accept_buff = NULL;
5b47454d
DM
2637 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2638 trie_type = do_utf8 ?
2639 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2640 : trie_plain;
2641
7087a21c
NC
2642 /* what trie are we using right now */
2643 reg_trie_data *trie
4f639d21 2644 = (reg_trie_data*)PL_reg_re->data->data[ ARG( scan ) ];
d8319b27 2645 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2646 result = 0;
a3621e74
YO
2647
2648 while ( state && uc <= (U8*)PL_regeol ) {
2649
5b47454d 2650 if (trie->states[ state ].wordnum) {
d8319b27 2651 if (!st->u.trie.accepted ) {
5b47454d
DM
2652 ENTER;
2653 SAVETMPS;
2654 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2655 sv_accept_buff=newSV(bufflen *
2656 sizeof(reg_trie_accepted) - 1);
2657 SvCUR_set(sv_accept_buff,
2658 sizeof(reg_trie_accepted));
2659 SvPOK_on(sv_accept_buff);
2660 sv_2mortal(sv_accept_buff);
d8319b27 2661 st->u.trie.accept_buff =
5b47454d
DM
2662 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2663 }
2664 else {
d8319b27 2665 if (st->u.trie.accepted >= bufflen) {
5b47454d 2666 bufflen *= 2;
d8319b27 2667 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2668 SvGROW(sv_accept_buff,
2669 bufflen * sizeof(reg_trie_accepted));
2670 }
2671 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2672 + sizeof(reg_trie_accepted));
2673 }
d8319b27
DM
2674 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2675 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2676 ++st->u.trie.accepted;
5b47454d 2677 }
a3621e74
YO
2678
2679 base = trie->states[ state ].trans.base;
2680
2681 DEBUG_TRIE_EXECUTE_r(
2682 PerlIO_printf( Perl_debug_log,
e4584336 2683 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2684 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27 2685 (UV)state, (UV)base, (UV)st->u.trie.accepted );
a3621e74
YO
2686 );
2687
2688 if ( base ) {
5b47454d
DM
2689 switch (trie_type) {
2690 case trie_uft8_fold:
a3621e74
YO
2691 if ( foldlen>0 ) {
2692 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2693 foldlen -= len;
2694 uscan += len;
2695 len=0;
2696 } else {
1df70142 2697 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2698 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2699 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2700 foldlen -= UNISKIP( uvc );
2701 uscan = foldbuf + UNISKIP( uvc );
2702 }
5b47454d
DM
2703 break;
2704 case trie_utf8:
2705 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2706 &len, uniflags );
2707 break;
2708 case trie_plain:
e4584336 2709 uvc = (UV)*uc;
a3621e74
YO
2710 len = 1;
2711 }
2712
5b47454d
DM
2713 if (uvc < 256) {
2714 charid = trie->charmap[ uvc ];
2715 }
2716 else {
2717 charid = 0;
2718 if (trie->widecharmap) {
2719 SV** svpp = (SV**)NULL;
2720 svpp = hv_fetch(trie->widecharmap,
2721 (char*)&uvc, sizeof(UV), 0);
2722 if (svpp)
2723 charid = (U16)SvIV(*svpp);
2724 }
2725 }
a3621e74 2726
5b47454d
DM
2727 if (charid &&
2728 (base + charid > trie->uniquecharcount )
2729 && (base + charid - 1 - trie->uniquecharcount
2730 < trie->lasttrans)
2731 && trie->trans[base + charid - 1 -
2732 trie->uniquecharcount].check == state)
2733 {
2734 state = trie->trans[base + charid - 1 -
2735 trie->uniquecharcount ].next;
2736 }
2737 else {
2738 state = 0;
2739 }
2740 uc += len;
2741
2742 }
2743 else {
a3621e74
YO
2744 state = 0;
2745 }
2746 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2747 PerlIO_printf( Perl_debug_log,
2748 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2749 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2750 );
2751 }
d8319b27 2752 if (!st->u.trie.accepted )
a3621e74 2753 sayNO;
a3621e74
YO
2754
2755 /*
2756 There was at least one accepting state that we
2757 transitioned through. Presumably the number of accepting
2758 states is going to be low, typically one or two. So we
2759 simply scan through to find the one with lowest wordnum.
2760 Once we find it, we swap the last state into its place
2761 and decrement the size. We then try to match the rest of
2762 the pattern at the point where the word ends, if we
2763 succeed then we end the loop, otherwise the loop
2764 eventually terminates once all of the accepting states
2765 have been tried.
2766 */
a3621e74 2767
d8319b27 2768 if ( st->u.trie.accepted == 1 ) {
a3621e74 2769 DEBUG_EXECUTE_r({
d8319b27 2770 SV **tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
a3621e74
YO
2771 PerlIO_printf( Perl_debug_log,
2772 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2773 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2774 st->u.trie.accept_buff[ 0 ].wordnum,
cfd0369c 2775 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2776 PL_colors[5] );
2777 });
d8319b27 2778 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
2779 /* in this case we free tmps/leave before we call regmatch
2780 as we wont be using accept_buff again. */
2781 FREETMPS;
2782 LEAVE;
95b24440
DM
2783 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2784 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2785 } else {
2786 DEBUG_EXECUTE_r(
e4584336 2787 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 2788 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
2789 PL_colors[5] );
2790 );
d8319b27 2791 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
2792 U32 best = 0;
2793 U32 cur;
d8319b27 2794 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
2795 DEBUG_TRIE_EXECUTE_r(
2796 PerlIO_printf( Perl_debug_log,
2797 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2798 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
2799 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2800 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 2801 );
a3621e74 2802
d8319b27
DM
2803 if (st->u.trie.accept_buff[cur].wordnum <
2804 st->u.trie.accept_buff[best].wordnum)
e822a8b4 2805 best = cur;
a3621e74
YO
2806 }
2807 DEBUG_EXECUTE_r({
d8319b27 2808 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2809 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2810 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2811 st->u.trie.accept_buff[best].wordnum,
cfd0369c 2812 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2813 PL_colors[5] );
2814 });
d8319b27
DM
2815 if ( best<st->u.trie.accepted ) {
2816 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2817 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2818 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2819 best = st->u.trie.accepted;
a3621e74 2820 }
d8319b27 2821 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
2822
2823 /*
2824 as far as I can tell we only need the SAVETMPS/FREETMPS
2825 for re's with EVAL in them but I'm leaving them in for
2826 all until I can be sure.
2827 */
2828 SAVETMPS;
95b24440
DM
2829 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2830 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2831 FREETMPS;
2832 }
2833 FREETMPS;
2834 LEAVE;
2835 }
2836
95b24440 2837 if (result) {
a3621e74
YO
2838 sayYES;
2839 } else {
2840 sayNO;
2841 }
2842 }
2843 /* unreached codepoint */
95b24440
DM
2844 case EXACT: {
2845 char *s = STRING(scan);
5d9a96ca 2846 st->ln = STR_LEN(scan);
eb160463 2847 if (do_utf8 != UTF) {
bc517b45 2848 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2849 char *l = locinput;
5d9a96ca 2850 const char *e = s + st->ln;
a72c7584 2851
5ff6fc6d
JH
2852 if (do_utf8) {
2853 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2854 while (s < e) {
a3b680e6 2855 STRLEN ulen;
1aa99e6b 2856 if (l >= PL_regeol)
5ff6fc6d
JH
2857 sayNO;
2858 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2859 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2860 uniflags))
5ff6fc6d 2861 sayNO;
bc517b45 2862 l += ulen;
5ff6fc6d 2863 s ++;
1aa99e6b 2864 }
5ff6fc6d
JH
2865 }
2866 else {
2867 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2868 while (s < e) {
a3b680e6 2869 STRLEN ulen;
1aa99e6b
IH
2870 if (l >= PL_regeol)
2871 sayNO;
5ff6fc6d 2872 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2873 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2874 uniflags))
1aa99e6b 2875 sayNO;
bc517b45 2876 s += ulen;
a72c7584 2877 l ++;
1aa99e6b 2878 }
5ff6fc6d 2879 }
1aa99e6b
IH
2880 locinput = l;
2881 nextchr = UCHARAT(locinput);
2882 break;
2883 }
bc517b45 2884 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2885 /* Inline the first character, for speed. */
2886 if (UCHARAT(s) != nextchr)
2887 sayNO;
5d9a96ca 2888 if (PL_regeol - locinput < st->ln)
d6a28714 2889 sayNO;
5d9a96ca 2890 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2891 sayNO;
5d9a96ca 2892 locinput += st->ln;
d6a28714
JH
2893 nextchr = UCHARAT(locinput);
2894 break;
95b24440 2895 }
d6a28714 2896 case EXACTFL:
b8c5462f
JH
2897 PL_reg_flags |= RF_tainted;
2898 /* FALL THROUGH */
95b24440
DM
2899 case EXACTF: {
2900 char *s = STRING(scan);
5d9a96ca 2901 st->ln = STR_LEN(scan);
d6a28714 2902
d07ddd77
JH
2903 if (do_utf8 || UTF) {
2904 /* Either target or the pattern are utf8. */
d6a28714 2905 char *l = locinput;
d07ddd77 2906 char *e = PL_regeol;
bc517b45 2907
5d9a96ca 2908 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2909 l, &e, 0, do_utf8)) {
5486206c
JH
2910 /* One more case for the sharp s:
2911 * pack("U0U*", 0xDF) =~ /ss/i,
2912 * the 0xC3 0x9F are the UTF-8
2913 * byte sequence for the U+00DF. */
2914 if (!(do_utf8 &&
2915 toLOWER(s[0]) == 's' &&
5d9a96ca 2916 st->ln >= 2 &&
5486206c
JH
2917 toLOWER(s[1]) == 's' &&
2918 (U8)l[0] == 0xC3 &&
2919 e - l >= 2 &&
2920 (U8)l[1] == 0x9F))
2921 sayNO;
2922 }
d07ddd77
JH
2923 locinput = e;
2924 nextchr = UCHARAT(locinput);
2925 break;
a0ed51b3 2926 }
d6a28714 2927
bc517b45
JH
2928 /* Neither the target and the pattern are utf8. */
2929
d6a28714
JH
2930 /* Inline the first character, for speed. */
2931 if (UCHARAT(s) != nextchr &&
2932 UCHARAT(s) != ((OP(scan) == EXACTF)
2933 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2934 sayNO;
5d9a96ca 2935 if (PL_regeol - locinput < st->ln)
b8c5462f 2936 sayNO;
5d9a96ca
DM
2937 if (st->ln > 1 && (OP(scan) == EXACTF
2938 ? ibcmp(s, locinput, st->ln)
2939 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2940 sayNO;
5d9a96ca 2941 locinput += st->ln;
d6a28714 2942 nextchr = UCHARAT(locinput);
a0d0e21e 2943 break;
95b24440 2944 }
d6a28714 2945 case ANYOF:
ffc61ed2 2946 if (do_utf8) {
9e55ce06
JH
2947 STRLEN inclasslen = PL_regeol - locinput;
2948
ba7b4546 2949 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2950 sayNO_ANYOF;
ffc61ed2
JH
2951 if (locinput >= PL_regeol)
2952 sayNO;
0f0076b4 2953 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2954 nextchr = UCHARAT(locinput);
e0f9d4a8 2955 break;
ffc61ed2
JH
2956 }
2957 else {
2958 if (nextchr < 0)
2959 nextchr = UCHARAT(locinput);
7d3e948e 2960 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2961 sayNO_ANYOF;
ffc61ed2
JH
2962 if (!nextchr && locinput >= PL_regeol)
2963 sayNO;
2964 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2965 break;
2966 }
2967 no_anyof:
2968 /* If we might have the case of the German sharp s
2969 * in a casefolding Unicode character class. */
2970
ebc501f0
JH
2971 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2972 locinput += SHARP_S_SKIP;
e0f9d4a8 2973 nextchr = UCHARAT(locinput);
ffc61ed2 2974 }
e0f9d4a8
JH
2975 else
2976 sayNO;
b8c5462f 2977 break;
d6a28714 2978 case ALNUML:
b8c5462f
JH
2979 PL_reg_flags |= RF_tainted;
2980 /* FALL THROUGH */
d6a28714 2981 case ALNUM:
b8c5462f 2982 if (!nextchr)
4633a7c4 2983 sayNO;
ffc61ed2 2984 if (do_utf8) {
1a4fad37 2985 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2986 if (!(OP(scan) == ALNUM
3568d838 2987 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2988 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2989 {
2990 sayNO;
a0ed51b3 2991 }
b8c5462f 2992 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2993 nextchr = UCHARAT(locinput);
2994 break;
2995 }
ffc61ed2 2996 if (!(OP(scan) == ALNUM
d6a28714 2997 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2998 sayNO;
b8c5462f 2999 nextchr = UCHARAT(++locinput);
a0d0e21e 3000 break;
d6a28714 3001 case NALNUML:
b8c5462f
JH
3002 PL_reg_flags |= RF_tainted;
3003 /* FALL THROUGH */
d6a28714
JH
3004 case NALNUM:
3005 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3006 sayNO;
ffc61ed2 3007 if (do_utf8) {
1a4fad37 3008 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3009 if (OP(scan) == NALNUM
3568d838 3010 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3011 : isALNUM_LC_utf8((U8*)locinput))
3012 {
b8c5462f 3013 sayNO;
d6a28714 3014 }
b8c5462f
JH
3015 locinput += PL_utf8skip[nextchr];
3016 nextchr = UCHARAT(locinput);
3017 break;
3018 }
ffc61ed2 3019 if (OP(scan) == NALNUM
d6a28714 3020 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3021 sayNO;
76e3520e 3022 nextchr = UCHARAT(++locinput);
a0d0e21e 3023 break;
d6a28714
JH
3024 case BOUNDL:
3025 case NBOUNDL:
3280af22 3026 PL_reg_flags |= RF_tainted;
bbce6d69 3027 /* FALL THROUGH */
d6a28714
JH
3028 case BOUND:
3029 case NBOUND:
3030 /* was last char in word? */
ffc61ed2 3031 if (do_utf8) {
12d33761 3032 if (locinput == PL_bostr)
5d9a96ca 3033 st->ln = '\n';
ffc61ed2 3034 else {
a3b680e6 3035 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3036
4ad0818d 3037 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3038 }
3039 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3040 st->ln = isALNUM_uni(st->ln);
1a4fad37 3041 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3042 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3043 }
3044 else {
5d9a96ca 3045 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3046 n = isALNUM_LC_utf8((U8*)locinput);
3047 }
a0ed51b3 3048 }
d6a28714 3049 else {
5d9a96ca 3050 st->ln = (locinput != PL_bostr) ?
12d33761 3051 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3052 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3053 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3054 n = isALNUM(nextchr);
3055 }
3056 else {
5d9a96ca 3057 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3058 n = isALNUM_LC(nextchr);
3059 }
d6a28714 3060 }
5d9a96ca 3061 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3062 OP(scan) == BOUNDL))
3063 sayNO;
a0ed51b3 3064 break;
d6a28714 3065 case SPACEL:
3280af22 3066 PL_reg_flags |= RF_tainted;
bbce6d69 3067 /* FALL THROUGH */
d6a28714 3068 case SPACE:
9442cb0e 3069 if (!nextchr)
4633a7c4 3070 sayNO;
1aa99e6b 3071 if (do_utf8) {
fd400ab9 3072 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3073 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3074 if (!(OP(scan) == SPACE
3568d838 3075 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3076 : isSPACE_LC_utf8((U8*)locinput)))
3077 {
3078 sayNO;
3079 }
3080 locinput += PL_utf8skip[nextchr];
3081 nextchr = UCHARAT(locinput);
3082 break;
d6a28714 3083 }
ffc61ed2
JH
3084 if (!(OP(scan) == SPACE
3085 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3086 sayNO;
3087 nextchr = UCHARAT(++locinput);
3088 }
3089 else {
3090 if (!(OP(scan) == SPACE
3091 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3092 sayNO;
3093 nextchr = UCHARAT(++locinput);
a0ed51b3 3094 }
a0ed51b3 3095 break;
d6a28714 3096 case NSPACEL:
3280af22 3097 PL_reg_flags |= RF_tainted;
bbce6d69 3098 /* FALL THROUGH */
d6a28714 3099 case NSPACE:
9442cb0e 3100 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3101 sayNO;
1aa99e6b 3102 if (do_utf8) {
1a4fad37 3103 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3104 if (OP(scan) == NSPACE
3568d838 3105 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3106 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3107 {
3108 sayNO;
3109 }
3110 locinput += PL_utf8skip[nextchr];
3111 nextchr = UCHARAT(locinput);
3112 break;
a0ed51b3 3113 }
ffc61ed2 3114 if (OP(scan) == NSPACE
d6a28714 3115 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3116 sayNO;
76e3520e 3117 nextchr = UCHARAT(++locinput);
a0d0e21e 3118 break;
d6a28714 3119 case DIGITL:
a0ed51b3
LW
3120 PL_reg_flags |= RF_tainted;
3121 /* FALL THROUGH */
d6a28714 3122 case DIGIT:
9442cb0e 3123 if (!nextchr)
a0ed51b3 3124 sayNO;
1aa99e6b 3125 if (do_utf8) {
1a4fad37 3126 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3127 if (!(OP(scan) == DIGIT
3568d838 3128 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3129 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3130 {
a0ed51b3 3131 sayNO;
dfe13c55 3132 }
6f06b55f 3133 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3134 nextchr = UCHARAT(locinput);
3135 break;
3136 }
ffc61ed2 3137 if (!(OP(scan) == DIGIT
9442cb0e 3138 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3139 sayNO;
3140 nextchr = UCHARAT(++locinput);
3141 break;
d6a28714 3142 case NDIGITL:
b8c5462f
JH
3143 PL_reg_flags |= RF_tainted;
3144 /* FALL THROUGH */
d6a28714 3145 case NDIGIT:
9442cb0e 3146 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3147 sayNO;
1aa99e6b 3148 if (do_utf8) {
1a4fad37 3149 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3150 if (OP(scan) == NDIGIT
3568d838 3151 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3152 : isDIGIT_LC_utf8((U8*)locinput))
3153 {
a0ed51b3 3154 sayNO;
9442cb0e 3155 }
6f06b55f 3156 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3157 nextchr = UCHARAT(locinput);
3158 break;
3159 }
ffc61ed2 3160 if (OP(scan) == NDIGIT
9442cb0e 3161 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3162 sayNO;
3163 nextchr = UCHARAT(++locinput);
3164 break;
3165 case CLUMP:
b7c83a7e 3166 if (locinput >= PL_regeol)
a0ed51b3 3167 sayNO;
b7c83a7e 3168 if (do_utf8) {
1a4fad37 3169 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3170 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3171 sayNO;
3172 locinput += PL_utf8skip[nextchr];
3173 while (locinput < PL_regeol &&
3174 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3175 locinput += UTF8SKIP(locinput);
3176 if (locinput > PL_regeol)
3177 sayNO;
eb08e2da
JH
3178 }
3179 else
3180 locinput++;
a0ed51b3
LW
3181 nextchr = UCHARAT(locinput);
3182 break;
c8756f30 3183 case REFFL:
3280af22 3184 PL_reg_flags |= RF_tainted;
c8756f30 3185 /* FALL THROUGH */
c277df42 3186 case REF:
95b24440
DM
3187 case REFF: {
3188 char *s;
c277df42 3189 n = ARG(scan); /* which paren pair */
5d9a96ca 3190 st->ln = PL_regstartp[n];
2c2d71f5 3191 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3192 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3193 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3194 if (st->ln == PL_regendp[n])
a0d0e21e 3195 break;
a0ed51b3 3196
5d9a96ca 3197 s = PL_bostr + st->ln;
1aa99e6b 3198 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3199 char *l = locinput;
a3b680e6 3200 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3201 /*
3202 * Note that we can't do the "other character" lookup trick as
3203 * in the 8-bit case (no pun intended) because in Unicode we
3204 * have to map both upper and title case to lower case.
3205 */
3206 if (OP(scan) == REFF) {
3207 while (s < e) {
a3b680e6
AL
3208 STRLEN ulen1, ulen2;
3209 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3210 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3211
a0ed51b3
LW
3212 if (l >= PL_regeol)
3213 sayNO;
a2a2844f
JH
3214 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3215 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3216 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3217 sayNO;
a2a2844f
JH
3218 s += ulen1;
3219 l += ulen2;
a0ed51b3
LW
3220 }
3221 }
3222 locinput = l;
3223 nextchr = UCHARAT(locinput);
3224 break;
3225 }
3226
a0d0e21e 3227 /* Inline the first character, for speed. */
76e3520e 3228 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3229 (OP(scan) == REF ||
3230 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3231 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3232 sayNO;
5d9a96ca
DM
3233 st->ln = PL_regendp[n] - st->ln;
3234 if (locinput + st->ln > PL_regeol)
4633a7c4 3235 sayNO;
5d9a96ca
DM
3236 if (st->ln > 1 && (OP(scan) == REF
3237 ? memNE(s, locinput, st->ln)
c8756f30 3238 : (OP(scan) == REFF
5d9a96ca
DM
3239 ? ibcmp(s, locinput, st->ln)
3240 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3241 sayNO;
5d9a96ca 3242 locinput += st->ln;
76e3520e 3243 nextchr = UCHARAT(locinput);
a0d0e21e 3244 break;
95b24440 3245 }
a0d0e21e
LW
3246
3247 case NOTHING:
c277df42 3248 case TAIL:
a0d0e21e
LW
3249 break;
3250 case BACK:
3251 break;
c277df42
IZ
3252 case EVAL:
3253 {
c277df42 3254 SV *ret;
8e5e9ebe 3255 {
4aabdb9b
DM
3256 /* execute the code in the {...} */
3257 dSP;
6136c704 3258 SV ** const before = SP;
4aabdb9b
DM
3259 OP_4tree * const oop = PL_op;
3260 COP * const ocurcop = PL_curcop;
3261 PAD *old_comppad;
3262 struct regexp * const oreg = PL_reg_re;
3263
3264 n = ARG(scan);
4f639d21 3265 PL_op = (OP_4tree*)PL_reg_re->data->data[n];
4aabdb9b 3266 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4f639d21 3267 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_reg_re->data->data[n + 2]);
4aabdb9b
DM
3268 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3269
8e5e9ebe
RGS
3270 CALLRUNOPS(aTHX); /* Scalar context. */
3271 SPAGAIN;
3272 if (SP == before)
075aa684 3273 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3274 else {
3275 ret = POPs;
3276 PUTBACK;
3277 }
4aabdb9b
DM
3278
3279 PL_op = oop;
3280 PAD_RESTORE_LOCAL(old_comppad);
3281 PL_curcop = ocurcop;
3282 if (!st->logical) {
3283 /* /(?{...})/ */
3284 sv_setsv(save_scalar(PL_replgv), ret);
4f639d21 3285 PL_reg_re = oreg;
4aabdb9b
DM
3286 break;
3287 }
8e5e9ebe 3288 }
4aabdb9b
DM
3289 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3290 regexp *re;
3291 re_cc_state state;
3292 int toggleutf;
8e5e9ebe 3293
4aabdb9b 3294 {
4f639d21
DM
3295 /* extract RE object from returned value; compiling if
3296 * necessary */
3297
6136c704 3298 MAGIC *mg = NULL;
4aabdb9b 3299 SV *sv;
faf82a0b
AE
3300 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3301 mg = mg_find(sv, PERL_MAGIC_qr);
3302 else if (SvSMAGICAL(ret)) {
3303 if (SvGMAGICAL(ret))
3304 sv_unmagic(ret, PERL_MAGIC_qr);
3305 else
3306 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3307 }
faf82a0b 3308
0f5d15d6
IZ
3309 if (mg) {
3310 re = (regexp *)mg->mg_obj;
df0003d4 3311 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3312 }
3313 else {
3314 STRLEN len;
6136c704 3315 const char * const t = SvPV_const(ret, len);
0f5d15d6 3316 PMOP pm;
a3b680e6 3317 const I32 osize = PL_regsize;
0f5d15d6 3318
5fcd1c1b 3319 Zero(&pm, 1, PMOP);
4aabdb9b 3320 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3321 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3322 if (!(SvFLAGS(ret)
faf82a0b
AE
3323 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3324 | SVs_GMG)))
14befaf4
DM
3325 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3326 PERL_MAGIC_qr,0,0);
0f5d15d6 3327 PL_regsize = osize;
0f5d15d6 3328 }
4aabdb9b
DM
3329 }
3330 DEBUG_EXECUTE_r(
3331 PerlIO_printf(Perl_debug_log,
3332 "Entering embedded \"%s%.60s%s%s\"\n",
3333 PL_colors[0],
3334 re->precomp,
3335 PL_colors[1],
3336 (strlen(re->precomp) > 60 ? "..." : ""))
3337 );
3338 state.node = next;
3339 state.prev = PL_reg_call_cc;
3340 state.cc = st->cc;
3341 state.re = PL_reg_re;
2c2d71f5 3342
4aabdb9b
DM
3343 st->cc = 0;
3344
3345 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3346 REGCP_SET(st->u.eval.lastcp);
4f639d21 3347 PL_reg_re = re;
4aabdb9b
DM
3348 state.ss = PL_savestack_ix;
3349 *PL_reglastparen = 0;
3350 *PL_reglastcloseparen = 0;
3351 PL_reg_call_cc = &state;
3352 PL_reginput = locinput;
3353 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3354 ((re->reganch & ROPT_UTF8) != 0);
3355 if (toggleutf) PL_reg_flags ^= RF_utf8;
3356
3357 /* XXXX This is too dramatic a measure... */
3358 PL_reg_maxiter = 0;
3359
3360 /* XXX the only recursion left in regmatch() */
4f639d21 3361 if (regmatch(re, re->program + 1)) {
4aabdb9b
DM
3362 /* Even though we succeeded, we need to restore
3363 global variables, since we may be wrapped inside
3364 SUSPEND, thus the match may be not finished yet. */
3365
3366 /* XXXX Do this only if SUSPENDed? */
0f5d15d6 3367 PL_reg_call_cc = state.prev;
5d9a96ca 3368 st->cc = state.cc;
0f5d15d6 3369 PL_reg_re = state.re;
4f639d21 3370
cb50f42d 3371 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3372
3373 /* XXXX This is too dramatic a measure... */
3374 PL_reg_maxiter = 0;
3375
4aabdb9b
DM
3376 /* These are needed even if not SUSPEND. */
3377 ReREFCNT_dec(re);
3378 regcpblow(st->u.eval.cp);
3379 sayYES;
0f5d15d6 3380 }
4aabdb9b
DM
3381 ReREFCNT_dec(re);
3382 REGCP_UNWIND(st->u.eval.lastcp);
4f639d21 3383 regcppop(rex);
4aabdb9b
DM
3384 PL_reg_call_cc = state.prev;
3385 st->cc = state.cc;
3386 PL_reg_re = state.re;
4aabdb9b
DM
3387 if (toggleutf) PL_reg_flags ^= RF_utf8;
3388
3389 /* XXXX This is too dramatic a measure... */
3390 PL_reg_maxiter = 0;
3391
5d9a96ca 3392 st->logical = 0;
4aabdb9b
DM
3393 sayNO;
3394 /* NOTREACHED */
a0ed51b3 3395 }
4aabdb9b
DM
3396 /* /(?(?{...})X|Y)/ */
3397 st->sw = SvTRUE(ret);
3398 st->logical = 0;
c277df42
IZ
3399 break;
3400 }
a0d0e21e 3401 case OPEN:
c277df42 3402 n = ARG(scan); /* which paren pair */
3280af22
NIS
3403 PL_reg_start_tmp[n] = locinput;
3404 if (n > PL_regsize)
3405 PL_regsize = n;
a0d0e21e
LW
3406 break;
3407 case CLOSE:
c277df42 3408 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3409 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3410 PL_regendp[n] = locinput - PL_bostr;
eb160463 3411 if (n > (I32)*PL_reglastparen)
3280af22 3412 *PL_reglastparen = n;
a01268b5 3413 *PL_reglastcloseparen = n;
a0d0e21e 3414 break;
c277df42
IZ
3415 case GROUPP:
3416 n = ARG(scan); /* which paren pair */
5d9a96ca 3417 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3418 break;
3419 case IFTHEN:
2c2d71f5 3420 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3421 if (st->sw)
c277df42
IZ
3422 next = NEXTOPER(NEXTOPER(scan));
3423 else {
3424 next = scan + ARG(scan);
3425 if (OP(next) == IFTHEN) /* Fake one. */
3426 next = NEXTOPER(NEXTOPER(next));
3427 }
3428 break;
3429 case LOGICAL:
5d9a96ca 3430 st->logical = scan->flags;
c277df42 3431 break;
2ab05381 3432/*******************************************************************
a0374537
DM
3433 cc points to the regmatch_state associated with the most recent CURLYX.
3434 This struct contains info about the innermost (...)* loop (an
3435 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3436
3437 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3438
95b24440 3439 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3440
a0374537 3441 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3442 with the starting point at WHILEM node;
2ab05381
IZ
3443
3444 3) Each hit of WHILEM node tries to match A and Z (in the order
3445 depending on the current iteration, min/max of {min,max} and
3446 greediness). The information about where are nodes for "A"
a0374537 3447 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3448 was already matched, and greediness.