This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Shave sizeof(NV) bytes from formats, by using the same offset
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
1a4fad37
AL
142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
97aff369 181 dVAR;
a3b680e6 182 const int retval = PL_savestack_ix;
b1ce53c5 183#define REGCP_PAREN_ELEMS 4
a3b680e6 184 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
185 int p;
186
e49a9654
IH
187 if (paren_elems_to_push < 0)
188 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
189
a01268b5 190#define REGCP_OTHER_ELEMS 6
4b3c1a47 191 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 192 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 193/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
194 SSPUSHINT(PL_regendp[p]);
195 SSPUSHINT(PL_regstartp[p]);
3280af22 196 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
197 SSPUSHINT(p);
198 }
b1ce53c5 199/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
200 SSPUSHINT(PL_regsize);
201 SSPUSHINT(*PL_reglastparen);
a01268b5 202 SSPUSHINT(*PL_reglastcloseparen);
3280af22 203 SSPUSHPTR(PL_reginput);
41123dfd
JH
204#define REGCP_FRAME_ELEMS 2
205/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
206 * are needed for the regexp context stack bookkeeping. */
207 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 208 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 209
a0d0e21e
LW
210 return retval;
211}
212
c277df42 213/* These are needed since we do not localize EVAL nodes: */
a3621e74 214# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 215 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 216 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 217
a3621e74 218# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 219 PerlIO_printf(Perl_debug_log, \
faccc32b 220 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 221 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 222
76e3520e 223STATIC char *
cea2e8a9 224S_regcppop(pTHX)
a0d0e21e 225{
97aff369 226 dVAR;
b1ce53c5 227 I32 i;
a0d0e21e
LW
228 U32 paren = 0;
229 char *input;
b1ce53c5 230
a3621e74
YO
231 GET_RE_DEBUG_FLAGS_DECL;
232
b1ce53c5 233 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 234 i = SSPOPINT;
b1ce53c5
JH
235 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
236 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 237 input = (char *) SSPOPPTR;
a01268b5 238 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
239 *PL_reglastparen = SSPOPINT;
240 PL_regsize = SSPOPINT;
b1ce53c5
JH
241
242 /* Now restore the parentheses context. */
41123dfd
JH
243 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
244 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 245 I32 tmps;
a0d0e21e 246 paren = (U32)SSPOPINT;
3280af22 247 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
248 PL_regstartp[paren] = SSPOPINT;
249 tmps = SSPOPINT;
3280af22
NIS
250 if (paren <= *PL_reglastparen)
251 PL_regendp[paren] = tmps;
a3621e74 252 DEBUG_EXECUTE_r(
c3464db5 253 PerlIO_printf(Perl_debug_log,
b900a521 254 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 255 (UV)paren, (IV)PL_regstartp[paren],
b900a521 256 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 257 (IV)PL_regendp[paren],
3280af22 258 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 259 );
a0d0e21e 260 }
a3621e74 261 DEBUG_EXECUTE_r(
eb160463 262 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 263 PerlIO_printf(Perl_debug_log,
faccc32b
JH
264 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
265 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
266 }
267 );
daf18116 268#if 1
dafc8851
JH
269 /* It would seem that the similar code in regtry()
270 * already takes care of this, and in fact it is in
271 * a better location to since this code can #if 0-ed out
272 * but the code in regtry() is needed or otherwise tests
273 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
274 * (as of patchlevel 7877) will fail. Then again,
275 * this code seems to be necessary or otherwise
276 * building DynaLoader will fail:
277 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
278 * --jhi */
eb160463
GS
279 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
280 if ((I32)paren > PL_regsize)
cf93c79d
IZ
281 PL_regstartp[paren] = -1;
282 PL_regendp[paren] = -1;
a0d0e21e 283 }
dafc8851 284#endif
a0d0e21e
LW
285 return input;
286}
287
0f5d15d6
IZ
288typedef struct re_cc_state
289{
290 I32 ss;
291 regnode *node;
292 struct re_cc_state *prev;
293 CURCUR *cc;
294 regexp *re;
295} re_cc_state;
296
02db2b7b 297#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 298
29d1e993
HS
299#define TRYPAREN(paren, n, input) { \
300 if (paren) { \
301 if (n) { \
302 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
303 PL_regendp[paren] = input - PL_bostr; \
304 } \
305 else \
306 PL_regendp[paren] = -1; \
307 } \
308 if (regmatch(next)) \
309 sayYES; \
310 if (paren && n) \
311 PL_regendp[paren] = -1; \
312}
313
314
a687059c 315/*
e50aee73 316 * pregexec and friends
a687059c
LW
317 */
318
319/*
c277df42 320 - pregexec - match a regexp against a string
a687059c 321 */
c277df42 322I32
864dbfa3 323Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 324 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
325/* strend: pointer to null at end of string */
326/* strbeg: real beginning of string */
327/* minend: end of match must be >=minend after stringarg. */
328/* nosave: For optimizations. */
329{
330 return
9041c2e3 331 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
332 nosave ? 0 : REXEC_COPY_STR);
333}
0f5d15d6
IZ
334
335STATIC void
cea2e8a9 336S_cache_re(pTHX_ regexp *prog)
0f5d15d6 337{
97aff369 338 dVAR;
0f5d15d6
IZ
339 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
340#ifdef DEBUGGING
341 PL_regprogram = prog->program;
342#endif
343 PL_regnpar = prog->nparens;
9041c2e3
NIS
344 PL_regdata = prog->data;
345 PL_reg_re = prog;
0f5d15d6 346}
22e551b9 347
9041c2e3 348/*
cad2e5aa
JH
349 * Need to implement the following flags for reg_anch:
350 *
351 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
352 * USE_INTUIT_ML
353 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
354 * INTUIT_AUTORITATIVE_ML
355 * INTUIT_ONCE_NOML - Intuit can match in one location only.
356 * INTUIT_ONCE_ML
357 *
358 * Another flag for this function: SECOND_TIME (so that float substrs
359 * with giant delta may be not rechecked).
360 */
361
362/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
363
3f7c398e 364/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
365 Otherwise, only SvCUR(sv) is used to get strbeg. */
366
367/* XXXX We assume that strpos is strbeg unless sv. */
368
6eb5f6b9
JH
369/* XXXX Some places assume that there is a fixed substring.
370 An update may be needed if optimizer marks as "INTUITable"
371 RExen without fixed substrings. Similarly, it is assumed that
372 lengths of all the strings are no more than minlen, thus they
373 cannot come from lookahead.
374 (Or minlen should take into account lookahead.) */
375
2c2d71f5
JH
376/* A failure to find a constant substring means that there is no need to make
377 an expensive call to REx engine, thus we celebrate a failure. Similarly,
378 finding a substring too deep into the string means that less calls to
30944b6d
IZ
379 regtry() should be needed.
380
381 REx compiler's optimizer found 4 possible hints:
382 a) Anchored substring;
383 b) Fixed substring;
384 c) Whether we are anchored (beginning-of-line or \G);
385 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 386 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
387 string which does not contradict any of them.
388 */
2c2d71f5 389
6eb5f6b9
JH
390/* Most of decisions we do here should have been done at compile time.
391 The nodes of the REx which we used for the search should have been
392 deleted from the finite automaton. */
393
cad2e5aa
JH
394char *
395Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
396 char *strend, U32 flags, re_scream_pos_data *data)
397{
97aff369 398 dVAR;
b7953727 399 register I32 start_shift = 0;
cad2e5aa 400 /* Should be nonnegative! */
b7953727 401 register I32 end_shift = 0;
2c2d71f5
JH
402 register char *s;
403 register SV *check;
a1933d95 404 char *strbeg;
cad2e5aa 405 char *t;
a3b680e6 406 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 407 I32 ml_anch;
bd61b366
SS
408 register char *other_last = NULL; /* other substr checked before this */
409 char *check_at = NULL; /* check substr found at this pos */
1df70142 410 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 411#ifdef DEBUGGING
890ce7af
AL
412 const char * const i_strpos = strpos;
413 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 414#endif
a3621e74
YO
415
416 GET_RE_DEBUG_FLAGS_DECL;
417
a30b2f1f 418 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 419
b8d68ded 420 if (prog->reganch & ROPT_UTF8) {
a3621e74 421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
422 "UTF-8 regex...\n"));
423 PL_reg_flags |= RF_utf8;
424 }
425
a3621e74 426 DEBUG_EXECUTE_r({
1df70142 427 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
428 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
429 strpos;
1df70142 430 const int len = PL_reg_match_utf8 ?
b8d68ded 431 strlen(s) : strend - strpos;
2a782b5b
JH
432 if (!PL_colorset)
433 reginitcolors();
b8d68ded 434 if (PL_reg_match_utf8)
a3621e74 435 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 436 "UTF-8 target...\n"));
2a782b5b 437 PerlIO_printf(Perl_debug_log,
a0288114 438 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 439 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
440 prog->precomp,
441 PL_colors[1],
442 (strlen(prog->precomp) > 60 ? "..." : ""),
443 PL_colors[0],
444 (int)(len > 60 ? 60 : len),
445 s, PL_colors[1],
446 (len > 60 ? "..." : "")
447 );
448 });
cad2e5aa 449
c344f387
JH
450 /* CHR_DIST() would be more correct here but it makes things slow. */
451 if (prog->minlen > strend - strpos) {
a3621e74 452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 453 "String too short... [re_intuit_start]\n"));
cad2e5aa 454 goto fail;
2c2d71f5 455 }
a1933d95 456 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 457 PL_regeol = strend;
33b8afdf
JH
458 if (do_utf8) {
459 if (!prog->check_utf8 && prog->check_substr)
460 to_utf8_substr(prog);
461 check = prog->check_utf8;
462 } else {
463 if (!prog->check_substr && prog->check_utf8)
464 to_byte_substr(prog);
465 check = prog->check_substr;
466 }
467 if (check == &PL_sv_undef) {
a3621e74 468 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
469 "Non-utf string cannot match utf check string\n"));
470 goto fail;
471 }
2c2d71f5 472 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
473 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
474 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 475 && !multiline ) ); /* Check after \n? */
cad2e5aa 476
7e25d62c
JH
477 if (!ml_anch) {
478 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
479 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 480 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
481 && sv && !SvROK(sv)
482 && (strpos != strbeg)) {
a3621e74 483 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
484 goto fail;
485 }
486 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 487 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 488 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
489 I32 slen;
490
1aa99e6b 491 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
492 if (SvTAIL(check)) {
493 slen = SvCUR(check); /* >= 1 */
cad2e5aa 494
9041c2e3 495 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 496 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 497 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 498 goto fail_finish;
cad2e5aa
JH
499 }
500 /* Now should match s[0..slen-2] */
501 slen--;
3f7c398e 502 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 503 || (slen > 1
3f7c398e 504 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 505 report_neq:
a3621e74 506 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
507 goto fail_finish;
508 }
cad2e5aa 509 }
3f7c398e 510 else if (*SvPVX_const(check) != *s
653099ff 511 || ((slen = SvCUR(check)) > 1
3f7c398e 512 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 513 goto report_neq;
c315bfe8 514 check_at = s;
2c2d71f5 515 goto success_at_start;
7e25d62c 516 }
cad2e5aa 517 }
2c2d71f5 518 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 519 s = strpos;
2c2d71f5 520 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 521 end_shift = prog->minlen - start_shift -
653099ff 522 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 523 if (!ml_anch) {
a3b680e6 524 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 525 - (SvTAIL(check) != 0);
a3b680e6 526 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
527
528 if (end_shift < eshift)
529 end_shift = eshift;
530 }
cad2e5aa 531 }
2c2d71f5 532 else { /* Can match at random position */
cad2e5aa
JH
533 ml_anch = 0;
534 s = strpos;
2c2d71f5
JH
535 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
536 /* Should be nonnegative! */
537 end_shift = prog->minlen - start_shift -
653099ff 538 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
539 }
540
2c2d71f5 541#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 542 if (end_shift < 0)
6bbae5e6 543 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
544#endif
545
2c2d71f5
JH
546 restart:
547 /* Find a possible match in the region s..strend by looking for
548 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 549 if (flags & REXEC_SCREAM) {
cad2e5aa 550 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 551 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 552
2c2d71f5
JH
553 if (PL_screamfirst[BmRARE(check)] >= 0
554 || ( BmRARE(check) == '\n'
555 && (BmPREVIOUS(check) == SvCUR(check) - 1)
556 && SvTAIL(check) ))
9041c2e3 557 s = screaminstr(sv, check,
2c2d71f5 558 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 559 else
2c2d71f5 560 goto fail_finish;
4addbd3b
HS
561 /* we may be pointing at the wrong string */
562 if (s && RX_MATCH_COPIED(prog))
3f7c398e 563 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
564 if (data)
565 *data->scream_olds = s;
566 }
f33976b4 567 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
568 s = fbm_instr((U8*)(s + start_shift),
569 (U8*)(strend - end_shift),
7fba1cd6 570 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 571 else
1aa99e6b
IH
572 s = fbm_instr(HOP3(s, start_shift, strend),
573 HOP3(strend, -end_shift, strbeg),
7fba1cd6 574 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
575
576 /* Update the count-of-usability, remove useless subpatterns,
577 unshift s. */
2c2d71f5 578
a0288114 579 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 580 (s ? "Found" : "Did not find"),
33b8afdf 581 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 582 PL_colors[0],
7b0972df 583 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 584 SvPVX_const(check),
2c2d71f5
JH
585 PL_colors[1], (SvTAIL(check) ? "$" : ""),
586 (s ? " at offset " : "...\n") ) );
587
588 if (!s)
589 goto fail_finish;
590
6eb5f6b9
JH
591 check_at = s;
592
2c2d71f5 593 /* Finish the diagnostic message */
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
595
596 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
597 Start with the other substr.
598 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 599 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
600 *always* match. Probably should be marked during compile...
601 Probably it is right to do no SCREAM here...
602 */
603
33b8afdf 604 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 605 /* Take into account the "other" substring. */
2c2d71f5
JH
606 /* XXXX May be hopelessly wrong for UTF... */
607 if (!other_last)
6eb5f6b9 608 other_last = strpos;
33b8afdf 609 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
610 do_other_anchored:
611 {
890ce7af
AL
612 char * const last = HOP3c(s, -start_shift, strbeg);
613 char *last1, *last2;
2c2d71f5 614 char *s1 = s;
33b8afdf 615 SV* must;
2c2d71f5 616
2c2d71f5
JH
617 t = s - prog->check_offset_max;
618 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 619 && (!do_utf8
1aa99e6b 620 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 621 && t > strpos)))
30944b6d 622 /* EMPTY */;
2c2d71f5
JH
623 else
624 t = strpos;
1aa99e6b 625 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
626 if (t < other_last) /* These positions already checked */
627 t = other_last;
1aa99e6b 628 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
629 if (last < last1)
630 last1 = last;
631 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
632 /* On end-of-str: see comment below. */
33b8afdf
JH
633 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
634 if (must == &PL_sv_undef) {
635 s = (char*)NULL;
a3621e74 636 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
637 }
638 else
639 s = fbm_instr(
640 (unsigned char*)t,
641 HOP3(HOP3(last1, prog->anchored_offset, strend)
642 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
643 must,
7fba1cd6 644 multiline ? FBMrf_MULTILINE : 0
33b8afdf 645 );
a3621e74 646 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 647 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
648 (s ? "Found" : "Contradicts"),
649 PL_colors[0],
33b8afdf
JH
650 (int)(SvCUR(must)
651 - (SvTAIL(must)!=0)),
3f7c398e 652 SvPVX_const(must),
33b8afdf 653 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
654 if (!s) {
655 if (last1 >= last2) {
a3621e74 656 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
657 ", giving up...\n"));
658 goto fail_finish;
659 }
a3621e74 660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 661 ", trying floating at offset %ld...\n",
1aa99e6b
IH
662 (long)(HOP3c(s1, 1, strend) - i_strpos)));
663 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
664 s = HOP3c(last, 1, strend);
2c2d71f5
JH
665 goto restart;
666 }
667 else {
a3621e74 668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 669 (long)(s - i_strpos)));
1aa99e6b
IH
670 t = HOP3c(s, -prog->anchored_offset, strbeg);
671 other_last = HOP3c(s, 1, strend);
30944b6d 672 s = s1;
2c2d71f5
JH
673 if (t == strpos)
674 goto try_at_start;
2c2d71f5
JH
675 goto try_at_offset;
676 }
30944b6d 677 }
2c2d71f5
JH
678 }
679 else { /* Take into account the floating substring. */
33b8afdf
JH
680 char *last, *last1;
681 char *s1 = s;
682 SV* must;
683
684 t = HOP3c(s, -start_shift, strbeg);
685 last1 = last =
686 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
687 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
688 last = HOP3c(t, prog->float_max_offset, strend);
689 s = HOP3c(t, prog->float_min_offset, strend);
690 if (s < other_last)
691 s = other_last;
2c2d71f5 692 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
693 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
694 /* fbm_instr() takes into account exact value of end-of-str
695 if the check is SvTAIL(ed). Since false positives are OK,
696 and end-of-str is not later than strend we are OK. */
697 if (must == &PL_sv_undef) {
698 s = (char*)NULL;
a3621e74 699 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
700 }
701 else
2c2d71f5 702 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
703 (unsigned char*)last + SvCUR(must)
704 - (SvTAIL(must)!=0),
7fba1cd6 705 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
707 (s ? "Found" : "Contradicts"),
708 PL_colors[0],
709 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 710 SvPVX_const(must),
33b8afdf
JH
711 PL_colors[1], (SvTAIL(must) ? "$" : "")));
712 if (!s) {
713 if (last1 == last) {
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
715 ", giving up...\n"));
716 goto fail_finish;
2c2d71f5 717 }
a3621e74 718 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
719 ", trying anchored starting at offset %ld...\n",
720 (long)(s1 + 1 - i_strpos)));
721 other_last = last;
722 s = HOP3c(t, 1, strend);
723 goto restart;
724 }
725 else {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
727 (long)(s - i_strpos)));
728 other_last = s; /* Fix this later. --Hugo */
729 s = s1;
730 if (t == strpos)
731 goto try_at_start;
732 goto try_at_offset;
733 }
2c2d71f5 734 }
cad2e5aa 735 }
2c2d71f5
JH
736
737 t = s - prog->check_offset_max;
2c2d71f5 738 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 739 && (!do_utf8
1aa99e6b
IH
740 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
741 && t > strpos))) {
2c2d71f5
JH
742 /* Fixed substring is found far enough so that the match
743 cannot start at strpos. */
744 try_at_offset:
cad2e5aa 745 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
746 /* Eventually fbm_*() should handle this, but often
747 anchored_offset is not 0, so this check will not be wasted. */
748 /* XXXX In the code below we prefer to look for "^" even in
749 presence of anchored substrings. And we search even
750 beyond the found float position. These pessimizations
751 are historical artefacts only. */
752 find_anchor:
2c2d71f5 753 while (t < strend - prog->minlen) {
cad2e5aa 754 if (*t == '\n') {
4ee3650e 755 if (t < check_at - prog->check_offset_min) {
33b8afdf 756 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
757 /* Since we moved from the found position,
758 we definitely contradict the found anchored
30944b6d
IZ
759 substr. Due to the above check we do not
760 contradict "check" substr.
761 Thus we can arrive here only if check substr
762 is float. Redo checking for "other"=="fixed".
763 */
9041c2e3 764 strpos = t + 1;
a3621e74 765 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 766 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
767 goto do_other_anchored;
768 }
4ee3650e
GS
769 /* We don't contradict the found floating substring. */
770 /* XXXX Why not check for STCLASS? */
cad2e5aa 771 s = t + 1;
a3621e74 772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 773 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
774 goto set_useful;
775 }
4ee3650e
GS
776 /* Position contradicts check-string */
777 /* XXXX probably better to look for check-string
778 than for "\n", so one should lower the limit for t? */
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 780 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 781 other_last = strpos = s = t + 1;
cad2e5aa
JH
782 goto restart;
783 }
784 t++;
785 }
a3621e74 786 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 787 PL_colors[0], PL_colors[1]));
2c2d71f5 788 goto fail_finish;
cad2e5aa 789 }
f5952150 790 else {
a3621e74 791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 792 PL_colors[0], PL_colors[1]));
f5952150 793 }
cad2e5aa
JH
794 s = t;
795 set_useful:
33b8afdf 796 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
797 }
798 else {
f5952150 799 /* The found string does not prohibit matching at strpos,
2c2d71f5 800 - no optimization of calling REx engine can be performed,
f5952150
GS
801 unless it was an MBOL and we are not after MBOL,
802 or a future STCLASS check will fail this. */
2c2d71f5
JH
803 try_at_start:
804 /* Even in this situation we may use MBOL flag if strpos is offset
805 wrt the start of the string. */
05b4157f 806 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 807 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
808 /* May be due to an implicit anchor of m{.*foo} */
809 && !(prog->reganch & ROPT_IMPLICIT))
810 {
cad2e5aa
JH
811 t = strpos;
812 goto find_anchor;
813 }
a3621e74 814 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 815 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 816 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 817 );
2c2d71f5 818 success_at_start:
30944b6d 819 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
820 && (do_utf8 ? (
821 prog->check_utf8 /* Could be deleted already */
822 && --BmUSEFUL(prog->check_utf8) < 0
823 && (prog->check_utf8 == prog->float_utf8)
824 ) : (
825 prog->check_substr /* Could be deleted already */
826 && --BmUSEFUL(prog->check_substr) < 0
827 && (prog->check_substr == prog->float_substr)
828 )))
66e933ab 829 {
cad2e5aa 830 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
832 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
833 if (do_utf8 ? prog->check_substr : prog->check_utf8)
834 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
835 prog->check_substr = prog->check_utf8 = NULL; /* disable */
836 prog->float_substr = prog->float_utf8 = NULL; /* clear */
837 check = NULL; /* abort */
cad2e5aa 838 s = strpos;
3cf5c195
IZ
839 /* XXXX This is a remnant of the old implementation. It
840 looks wasteful, since now INTUIT can use many
6eb5f6b9 841 other heuristics. */
cad2e5aa
JH
842 prog->reganch &= ~RE_USE_INTUIT;
843 }
844 else
845 s = strpos;
846 }
847
6eb5f6b9
JH
848 /* Last resort... */
849 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
850 if (prog->regstclass) {
851 /* minlen == 0 is possible if regstclass is \b or \B,
852 and the fixed substr is ''$.
853 Since minlen is already taken into account, s+1 is before strend;
854 accidentally, minlen >= 1 guaranties no false positives at s + 1
855 even for \b or \B. But (minlen? 1 : 0) below assumes that
856 regstclass does not come from lookahead... */
857 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
858 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 859 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 860 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 861 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 862 : 1);
a3b680e6 863 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 864 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 865 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
866 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
867 cl_l, strend)
868 : strend);
6eb5f6b9
JH
869
870 t = s;
9926ca43 871 cache_re(prog);
06b5626a 872 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
873 if (!s) {
874#ifdef DEBUGGING
cbbf8932 875 const char *what = NULL;
6eb5f6b9
JH
876#endif
877 if (endpos == strend) {
a3621e74 878 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
879 "Could not match STCLASS...\n") );
880 goto fail;
881 }
a3621e74 882 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 883 "This position contradicts STCLASS...\n") );
653099ff
GS
884 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
885 goto fail;
6eb5f6b9 886 /* Contradict one of substrings */
33b8afdf
JH
887 if (prog->anchored_substr || prog->anchored_utf8) {
888 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 889 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 890 hop_and_restart:
1aa99e6b 891 s = HOP3c(t, 1, strend);
66e933ab
GS
892 if (s + start_shift + end_shift > strend) {
893 /* XXXX Should be taken into account earlier? */
a3621e74 894 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
895 "Could not match STCLASS...\n") );
896 goto fail;
897 }
5e39e1e5
HS
898 if (!check)
899 goto giveup;
a3621e74 900 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 901 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
902 what, (long)(s + start_shift - i_strpos)) );
903 goto restart;
904 }
66e933ab 905 /* Have both, check_string is floating */
6eb5f6b9
JH
906 if (t + start_shift >= check_at) /* Contradicts floating=check */
907 goto retry_floating_check;
908 /* Recheck anchored substring, but not floating... */
9041c2e3 909 s = check_at;
5e39e1e5
HS
910 if (!check)
911 goto giveup;
a3621e74 912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 913 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
914 (long)(other_last - i_strpos)) );
915 goto do_other_anchored;
916 }
60e71179
GS
917 /* Another way we could have checked stclass at the
918 current position only: */
919 if (ml_anch) {
920 s = t = t + 1;
5e39e1e5
HS
921 if (!check)
922 goto giveup;
a3621e74 923 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 924 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 925 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 926 goto try_at_offset;
66e933ab 927 }
33b8afdf 928 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 929 goto fail;
6eb5f6b9
JH
930 /* Check is floating subtring. */
931 retry_floating_check:
932 t = check_at - start_shift;
a3621e74 933 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
934 goto hop_and_restart;
935 }
b7953727 936 if (t != s) {
a3621e74 937 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 938 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
939 (long)(t - i_strpos), (long)(s - i_strpos))
940 );
941 }
942 else {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
944 "Does not contradict STCLASS...\n");
945 );
946 }
6eb5f6b9 947 }
5e39e1e5 948 giveup:
a3621e74 949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
950 PL_colors[4], (check ? "Guessed" : "Giving up"),
951 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 952 return s;
2c2d71f5
JH
953
954 fail_finish: /* Substring not found */
33b8afdf
JH
955 if (prog->check_substr || prog->check_utf8) /* could be removed already */
956 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 957 fail:
a3621e74 958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 959 PL_colors[4], PL_colors[5]));
bd61b366 960 return NULL;
cad2e5aa 961}
9661b544 962
6eb5f6b9 963/* We know what class REx starts with. Try to find this position... */
3c3eec57 964STATIC char *
a3b680e6 965S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 966{
27da23d5 967 dVAR;
1df70142 968 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 969 char *m;
d8093b23 970 STRLEN ln;
5dab1207 971 STRLEN lnc;
078c425b 972 register STRLEN uskip;
d8093b23
G
973 unsigned int c1;
974 unsigned int c2;
6eb5f6b9
JH
975 char *e;
976 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 977 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 978
6eb5f6b9
JH
979 /* We know what class it must start with. */
980 switch (OP(c)) {
6eb5f6b9 981 case ANYOF:
388cc4de 982 if (do_utf8) {
078c425b 983 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
984 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
985 !UTF8_IS_INVARIANT((U8)s[0]) ?
986 reginclass(c, (U8*)s, 0, do_utf8) :
987 REGINCLASS(c, (U8*)s)) {
988 if (tmp && (norun || regtry(prog, s)))
989 goto got_it;
990 else
991 tmp = doevery;
992 }
993 else
994 tmp = 1;
078c425b 995 s += uskip;
388cc4de
HS
996 }
997 }
998 else {
999 while (s < strend) {
1000 STRLEN skip = 1;
1001
1002 if (REGINCLASS(c, (U8*)s) ||
1003 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1004 /* The assignment of 2 is intentional:
1005 * for the folded sharp s, the skip is 2. */
1006 (skip = SHARP_S_SKIP))) {
1007 if (tmp && (norun || regtry(prog, s)))
1008 goto got_it;
1009 else
1010 tmp = doevery;
1011 }
1012 else
1013 tmp = 1;
1014 s += skip;
1015 }
a0d0e21e 1016 }
6eb5f6b9 1017 break;
f33976b4
DB
1018 case CANY:
1019 while (s < strend) {
1020 if (tmp && (norun || regtry(prog, s)))
1021 goto got_it;
1022 else
1023 tmp = doevery;
1024 s++;
1025 }
1026 break;
6eb5f6b9 1027 case EXACTF:
5dab1207
NIS
1028 m = STRING(c);
1029 ln = STR_LEN(c); /* length to match in octets/bytes */
1030 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1031 if (UTF) {
a2a2844f 1032 STRLEN ulen1, ulen2;
5dab1207 1033 U8 *sm = (U8 *) m;
89ebb4a3
JH
1034 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1035 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
041457d9 1036 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a2a2844f
JH
1037
1038 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1039 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1040
89ebb4a3 1041 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1042 0, uniflags);
89ebb4a3 1043 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1044 0, uniflags);
5dab1207
NIS
1045 lnc = 0;
1046 while (sm < ((U8 *) m + ln)) {
1047 lnc++;
1048 sm += UTF8SKIP(sm);
1049 }
1aa99e6b
IH
1050 }
1051 else {
1052 c1 = *(U8*)m;
1053 c2 = PL_fold[c1];
1054 }
6eb5f6b9
JH
1055 goto do_exactf;
1056 case EXACTFL:
5dab1207
NIS
1057 m = STRING(c);
1058 ln = STR_LEN(c);
1059 lnc = (I32) ln;
d8093b23 1060 c1 = *(U8*)m;
6eb5f6b9
JH
1061 c2 = PL_fold_locale[c1];
1062 do_exactf:
db12adc6 1063 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1064
6eb5f6b9
JH
1065 if (norun && e < s)
1066 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1067
60a8b682
JH
1068 /* The idea in the EXACTF* cases is to first find the
1069 * first character of the EXACTF* node and then, if
1070 * necessary, case-insensitively compare the full
1071 * text of the node. The c1 and c2 are the first
1072 * characters (though in Unicode it gets a bit
1073 * more complicated because there are more cases
7f16dd3d
JH
1074 * than just upper and lower: one needs to use
1075 * the so-called folding case for case-insensitive
1076 * matching (called "loose matching" in Unicode).
1077 * ibcmp_utf8() will do just that. */
60a8b682 1078
1aa99e6b 1079 if (do_utf8) {
575cac57 1080 UV c, f;
89ebb4a3 1081 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1082 STRLEN len, foldlen;
041457d9 1083 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
09091399 1084 if (c1 == c2) {
5dab1207
NIS
1085 /* Upper and lower of 1st char are equal -
1086 * probably not a "letter". */
1aa99e6b 1087 while (s <= e) {
89ebb4a3 1088 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1089 uniflags);
80aecb99
JH
1090 if ( c == c1
1091 && (ln == len ||
66423254 1092 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1093 m, (char **)0, ln, (bool)UTF))
55da9344 1094 && (norun || regtry(prog, s)) )
1aa99e6b 1095 goto got_it;
80aecb99 1096 else {
1df70142 1097 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1098 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1099 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1100 if ( f != c
1101 && (f == c1 || f == c2)
1102 && (ln == foldlen ||
66423254
JH
1103 !ibcmp_utf8((char *) foldbuf,
1104 (char **)0, foldlen, do_utf8,
d07ddd77 1105 m,
eb160463 1106 (char **)0, ln, (bool)UTF))
80aecb99
JH
1107 && (norun || regtry(prog, s)) )
1108 goto got_it;
1109 }
1aa99e6b
IH
1110 s += len;
1111 }
09091399
JH
1112 }
1113 else {
1aa99e6b 1114 while (s <= e) {
89ebb4a3 1115 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1116 uniflags);
80aecb99 1117
60a8b682 1118 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1119 * Note that not all the possible combinations
1120 * are handled here: some of them are handled
1121 * by the standard folding rules, and some of
1122 * them (the character class or ANYOF cases)
1123 * are handled during compiletime in
1124 * regexec.c:S_regclass(). */
880bd946
JH
1125 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1126 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1127 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1128
1129 if ( (c == c1 || c == c2)
1130 && (ln == len ||
66423254 1131 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1132 m, (char **)0, ln, (bool)UTF))
55da9344 1133 && (norun || regtry(prog, s)) )
1aa99e6b 1134 goto got_it;
80aecb99 1135 else {
1df70142 1136 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1137 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1138 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1139 if ( f != c
1140 && (f == c1 || f == c2)
1141 && (ln == foldlen ||
a6872d42 1142 !ibcmp_utf8((char *) foldbuf,
66423254 1143 (char **)0, foldlen, do_utf8,
d07ddd77 1144 m,
eb160463 1145 (char **)0, ln, (bool)UTF))
80aecb99
JH
1146 && (norun || regtry(prog, s)) )
1147 goto got_it;
1148 }
1aa99e6b
IH
1149 s += len;
1150 }
09091399 1151 }
1aa99e6b
IH
1152 }
1153 else {
1154 if (c1 == c2)
1155 while (s <= e) {
1156 if ( *(U8*)s == c1
1157 && (ln == 1 || !(OP(c) == EXACTF
1158 ? ibcmp(s, m, ln)
1159 : ibcmp_locale(s, m, ln)))
1160 && (norun || regtry(prog, s)) )
1161 goto got_it;
1162 s++;
1163 }
1164 else
1165 while (s <= e) {
1166 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1167 && (ln == 1 || !(OP(c) == EXACTF
1168 ? ibcmp(s, m, ln)
1169 : ibcmp_locale(s, m, ln)))
1170 && (norun || regtry(prog, s)) )
1171 goto got_it;
1172 s++;
1173 }
b3c9acc1
IZ
1174 }
1175 break;
bbce6d69 1176 case BOUNDL:
3280af22 1177 PL_reg_flags |= RF_tainted;
bbce6d69 1178 /* FALL THROUGH */
a0d0e21e 1179 case BOUND:
ffc61ed2 1180 if (do_utf8) {
12d33761 1181 if (s == PL_bostr)
ffc61ed2
JH
1182 tmp = '\n';
1183 else {
6136c704 1184 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1185 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1186 }
1187 tmp = ((OP(c) == BOUND ?
9041c2e3 1188 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1189 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1190 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1191 if (tmp == !(OP(c) == BOUND ?
3568d838 1192 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1193 isALNUM_LC_utf8((U8*)s)))
1194 {
1195 tmp = !tmp;
1196 if ((norun || regtry(prog, s)))
1197 goto got_it;
1198 }
078c425b 1199 s += uskip;
a687059c 1200 }
a0d0e21e 1201 }
667bb95a 1202 else {
12d33761 1203 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1204 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1205 while (s < strend) {
1206 if (tmp ==
1207 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1208 tmp = !tmp;
1209 if ((norun || regtry(prog, s)))
1210 goto got_it;
1211 }
1212 s++;
a0ed51b3 1213 }
a0ed51b3 1214 }
6eb5f6b9 1215 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1216 goto got_it;
1217 break;
bbce6d69 1218 case NBOUNDL:
3280af22 1219 PL_reg_flags |= RF_tainted;
bbce6d69 1220 /* FALL THROUGH */
a0d0e21e 1221 case NBOUND:
ffc61ed2 1222 if (do_utf8) {
12d33761 1223 if (s == PL_bostr)
ffc61ed2
JH
1224 tmp = '\n';
1225 else {
6136c704 1226 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1227 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1228 }
1229 tmp = ((OP(c) == NBOUND ?
9041c2e3 1230 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1231 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1232 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1233 if (tmp == !(OP(c) == NBOUND ?
3568d838 1234 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1235 isALNUM_LC_utf8((U8*)s)))
1236 tmp = !tmp;
1237 else if ((norun || regtry(prog, s)))
1238 goto got_it;
078c425b 1239 s += uskip;
ffc61ed2 1240 }
a0d0e21e 1241 }
667bb95a 1242 else {
12d33761 1243 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1244 tmp = ((OP(c) == NBOUND ?
1245 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1246 while (s < strend) {
1247 if (tmp ==
1248 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1249 tmp = !tmp;
1250 else if ((norun || regtry(prog, s)))
1251 goto got_it;
1252 s++;
1253 }
a0ed51b3 1254 }
6eb5f6b9 1255 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1256 goto got_it;
1257 break;
a0d0e21e 1258 case ALNUM:
ffc61ed2 1259 if (do_utf8) {
1a4fad37 1260 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1261 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1262 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1263 if (tmp && (norun || regtry(prog, s)))
1264 goto got_it;
1265 else
1266 tmp = doevery;
1267 }
bbce6d69 1268 else
ffc61ed2 1269 tmp = 1;
078c425b 1270 s += uskip;
bbce6d69 1271 }
bbce6d69 1272 }
ffc61ed2
JH
1273 else {
1274 while (s < strend) {
1275 if (isALNUM(*s)) {
1276 if (tmp && (norun || regtry(prog, s)))
1277 goto got_it;
1278 else
1279 tmp = doevery;
1280 }
a0ed51b3 1281 else
ffc61ed2
JH
1282 tmp = 1;
1283 s++;
a0ed51b3 1284 }
a0ed51b3
LW
1285 }
1286 break;
bbce6d69 1287 case ALNUML:
3280af22 1288 PL_reg_flags |= RF_tainted;
ffc61ed2 1289 if (do_utf8) {
078c425b 1290 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1291 if (isALNUM_LC_utf8((U8*)s)) {
1292 if (tmp && (norun || regtry(prog, s)))
1293 goto got_it;
1294 else
1295 tmp = doevery;
1296 }
a687059c 1297 else
ffc61ed2 1298 tmp = 1;
078c425b 1299 s += uskip;
a0d0e21e 1300 }
a0d0e21e 1301 }
ffc61ed2
JH
1302 else {
1303 while (s < strend) {
1304 if (isALNUM_LC(*s)) {
1305 if (tmp && (norun || regtry(prog, s)))
1306 goto got_it;
1307 else
1308 tmp = doevery;
1309 }
a0ed51b3 1310 else
ffc61ed2
JH
1311 tmp = 1;
1312 s++;
a0ed51b3 1313 }
a0ed51b3
LW
1314 }
1315 break;
a0d0e21e 1316 case NALNUM:
ffc61ed2 1317 if (do_utf8) {
1a4fad37 1318 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1319 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1320 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1321 if (tmp && (norun || regtry(prog, s)))
1322 goto got_it;
1323 else
1324 tmp = doevery;
1325 }
bbce6d69 1326 else
ffc61ed2 1327 tmp = 1;
078c425b 1328 s += uskip;
bbce6d69 1329 }
bbce6d69 1330 }
ffc61ed2
JH
1331 else {
1332 while (s < strend) {
1333 if (!isALNUM(*s)) {
1334 if (tmp && (norun || regtry(prog, s)))
1335 goto got_it;
1336 else
1337 tmp = doevery;
1338 }
a0ed51b3 1339 else
ffc61ed2
JH
1340 tmp = 1;
1341 s++;
a0ed51b3 1342 }
a0ed51b3
LW
1343 }
1344 break;
bbce6d69 1345 case NALNUML:
3280af22 1346 PL_reg_flags |= RF_tainted;
ffc61ed2 1347 if (do_utf8) {
078c425b 1348 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1349 if (!isALNUM_LC_utf8((U8*)s)) {
1350 if (tmp && (norun || regtry(prog, s)))
1351 goto got_it;
1352 else
1353 tmp = doevery;
1354 }
a687059c 1355 else
ffc61ed2 1356 tmp = 1;
078c425b 1357 s += uskip;
a687059c 1358 }
a0d0e21e 1359 }
ffc61ed2
JH
1360 else {
1361 while (s < strend) {
1362 if (!isALNUM_LC(*s)) {
1363 if (tmp && (norun || regtry(prog, s)))
1364 goto got_it;
1365 else
1366 tmp = doevery;
1367 }
a0ed51b3 1368 else
ffc61ed2
JH
1369 tmp = 1;
1370 s++;
a0ed51b3 1371 }
a0ed51b3
LW
1372 }
1373 break;
a0d0e21e 1374 case SPACE:
ffc61ed2 1375 if (do_utf8) {
1a4fad37 1376 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1377 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1378 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1379 if (tmp && (norun || regtry(prog, s)))
1380 goto got_it;
1381 else
1382 tmp = doevery;
1383 }
a0d0e21e 1384 else
ffc61ed2 1385 tmp = 1;
078c425b 1386 s += uskip;
2304df62 1387 }
a0d0e21e 1388 }
ffc61ed2
JH
1389 else {
1390 while (s < strend) {
1391 if (isSPACE(*s)) {
1392 if (tmp && (norun || regtry(prog, s)))
1393 goto got_it;
1394 else
1395 tmp = doevery;
1396 }
a0ed51b3 1397 else
ffc61ed2
JH
1398 tmp = 1;
1399 s++;
a0ed51b3 1400 }
a0ed51b3
LW
1401 }
1402 break;
bbce6d69 1403 case SPACEL:
3280af22 1404 PL_reg_flags |= RF_tainted;
ffc61ed2 1405 if (do_utf8) {
078c425b 1406 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1407 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1408 if (tmp && (norun || regtry(prog, s)))
1409 goto got_it;
1410 else
1411 tmp = doevery;
1412 }
bbce6d69 1413 else
ffc61ed2 1414 tmp = 1;
078c425b 1415 s += uskip;
bbce6d69 1416 }
bbce6d69 1417 }
ffc61ed2
JH
1418 else {
1419 while (s < strend) {
1420 if (isSPACE_LC(*s)) {
1421 if (tmp && (norun || regtry(prog, s)))
1422 goto got_it;
1423 else
1424 tmp = doevery;
1425 }
a0ed51b3 1426 else
ffc61ed2
JH
1427 tmp = 1;
1428 s++;
a0ed51b3 1429 }
a0ed51b3
LW
1430 }
1431 break;
a0d0e21e 1432 case NSPACE:
ffc61ed2 1433 if (do_utf8) {
1a4fad37 1434 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1435 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1436 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1437 if (tmp && (norun || regtry(prog, s)))
1438 goto got_it;
1439 else
1440 tmp = doevery;
1441 }
a0d0e21e 1442 else
ffc61ed2 1443 tmp = 1;
078c425b 1444 s += uskip;
a687059c 1445 }
a0d0e21e 1446 }
ffc61ed2
JH
1447 else {
1448 while (s < strend) {
1449 if (!isSPACE(*s)) {
1450 if (tmp && (norun || regtry(prog, s)))
1451 goto got_it;
1452 else
1453 tmp = doevery;
1454 }
a0ed51b3 1455 else
ffc61ed2
JH
1456 tmp = 1;
1457 s++;
a0ed51b3 1458 }
a0ed51b3
LW
1459 }
1460 break;
bbce6d69 1461 case NSPACEL:
3280af22 1462 PL_reg_flags |= RF_tainted;
ffc61ed2 1463 if (do_utf8) {
078c425b 1464 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1465 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1466 if (tmp && (norun || regtry(prog, s)))
1467 goto got_it;
1468 else
1469 tmp = doevery;
1470 }
bbce6d69 1471 else
ffc61ed2 1472 tmp = 1;
078c425b 1473 s += uskip;
bbce6d69 1474 }
bbce6d69 1475 }
ffc61ed2
JH
1476 else {
1477 while (s < strend) {
1478 if (!isSPACE_LC(*s)) {
1479 if (tmp && (norun || regtry(prog, s)))
1480 goto got_it;
1481 else
1482 tmp = doevery;
1483 }
a0ed51b3 1484 else
ffc61ed2
JH
1485 tmp = 1;
1486 s++;
a0ed51b3 1487 }
a0ed51b3
LW
1488 }
1489 break;
a0d0e21e 1490 case DIGIT:
ffc61ed2 1491 if (do_utf8) {
1a4fad37 1492 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1493 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1494 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1495 if (tmp && (norun || regtry(prog, s)))
1496 goto got_it;
1497 else
1498 tmp = doevery;
1499 }
a0d0e21e 1500 else
ffc61ed2 1501 tmp = 1;
078c425b 1502 s += uskip;
2b69d0c2 1503 }
a0d0e21e 1504 }
ffc61ed2
JH
1505 else {
1506 while (s < strend) {
1507 if (isDIGIT(*s)) {
1508 if (tmp && (norun || regtry(prog, s)))
1509 goto got_it;
1510 else
1511 tmp = doevery;
1512 }
a0ed51b3 1513 else
ffc61ed2
JH
1514 tmp = 1;
1515 s++;
a0ed51b3 1516 }
a0ed51b3
LW
1517 }
1518 break;
b8c5462f
JH
1519 case DIGITL:
1520 PL_reg_flags |= RF_tainted;
ffc61ed2 1521 if (do_utf8) {
078c425b 1522 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1523 if (isDIGIT_LC_utf8((U8*)s)) {
1524 if (tmp && (norun || regtry(prog, s)))
1525 goto got_it;
1526 else
1527 tmp = doevery;
1528 }
b8c5462f 1529 else
ffc61ed2 1530 tmp = 1;
078c425b 1531 s += uskip;
b8c5462f 1532 }
b8c5462f 1533 }
ffc61ed2
JH
1534 else {
1535 while (s < strend) {
1536 if (isDIGIT_LC(*s)) {
1537 if (tmp && (norun || regtry(prog, s)))
1538 goto got_it;
1539 else
1540 tmp = doevery;
1541 }
b8c5462f 1542 else
ffc61ed2
JH
1543 tmp = 1;
1544 s++;
b8c5462f 1545 }
b8c5462f
JH
1546 }
1547 break;
a0d0e21e 1548 case NDIGIT:
ffc61ed2 1549 if (do_utf8) {
1a4fad37 1550 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1551 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1552 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1553 if (tmp && (norun || regtry(prog, s)))
1554 goto got_it;
1555 else
1556 tmp = doevery;
1557 }
a0d0e21e 1558 else
ffc61ed2 1559 tmp = 1;
078c425b 1560 s += uskip;
a687059c 1561 }
a0d0e21e 1562 }
ffc61ed2
JH
1563 else {
1564 while (s < strend) {
1565 if (!isDIGIT(*s)) {
1566 if (tmp && (norun || regtry(prog, s)))
1567 goto got_it;
1568 else
1569 tmp = doevery;
1570 }
a0ed51b3 1571 else
ffc61ed2
JH
1572 tmp = 1;
1573 s++;
a0ed51b3 1574 }
a0ed51b3
LW
1575 }
1576 break;
b8c5462f
JH
1577 case NDIGITL:
1578 PL_reg_flags |= RF_tainted;
ffc61ed2 1579 if (do_utf8) {
078c425b 1580 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1581 if (!isDIGIT_LC_utf8((U8*)s)) {
1582 if (tmp && (norun || regtry(prog, s)))
1583 goto got_it;
1584 else
1585 tmp = doevery;
1586 }
b8c5462f 1587 else
ffc61ed2 1588 tmp = 1;
078c425b 1589 s += uskip;
b8c5462f 1590 }
a0ed51b3 1591 }
ffc61ed2
JH
1592 else {
1593 while (s < strend) {
1594 if (!isDIGIT_LC(*s)) {
1595 if (tmp && (norun || regtry(prog, s)))
1596 goto got_it;
1597 else
1598 tmp = doevery;
1599 }
cf93c79d 1600 else
ffc61ed2
JH
1601 tmp = 1;
1602 s++;
b8c5462f 1603 }
b8c5462f
JH
1604 }
1605 break;
b3c9acc1 1606 default:
3c3eec57
GS
1607 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1608 break;
d6a28714 1609 }
6eb5f6b9
JH
1610 return 0;
1611 got_it:
1612 return s;
1613}
1614
1615/*
1616 - regexec_flags - match a regexp against a string
1617 */
1618I32
1619Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1620 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1621/* strend: pointer to null at end of string */
1622/* strbeg: real beginning of string */
1623/* minend: end of match must be >=minend after stringarg. */
1624/* data: May be used for some additional optimizations. */
1625/* nosave: For optimizations. */
1626{
97aff369 1627 dVAR;
6eb5f6b9
JH
1628 register char *s;
1629 register regnode *c;
1630 register char *startpos = stringarg;
6eb5f6b9
JH
1631 I32 minlen; /* must match at least this many chars */
1632 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1633 I32 end_shift = 0; /* Same for the end. */ /* CC */
1634 I32 scream_pos = -1; /* Internal iterator of scream. */
1635 char *scream_olds;
1636 SV* oreplsv = GvSV(PL_replgv);
1df70142 1637 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1638 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1639#ifdef DEBUGGING
6136c704
AL
1640 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1641 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1642#endif
a3621e74
YO
1643
1644 GET_RE_DEBUG_FLAGS_DECL;
1645
9d4ba2ae 1646 PERL_UNUSED_ARG(data);
a30b2f1f 1647 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1648
1649 PL_regcc = 0;
1650
1651 cache_re(prog);
1652#ifdef DEBUGGING
aea4f609 1653 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1654#endif
1655
1656 /* Be paranoid... */
1657 if (prog == NULL || startpos == NULL) {
1658 Perl_croak(aTHX_ "NULL regexp parameter");
1659 return 0;
1660 }
1661
1662 minlen = prog->minlen;
61a36c01 1663 if (strend - startpos < minlen) {
a3621e74 1664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1665 "String too short [regexec_flags]...\n"));
1666 goto phooey;
1aa99e6b 1667 }
6eb5f6b9 1668
6eb5f6b9
JH
1669 /* Check validity of program. */
1670 if (UCHARAT(prog->program) != REG_MAGIC) {
1671 Perl_croak(aTHX_ "corrupted regexp program");
1672 }
1673
1674 PL_reg_flags = 0;
1675 PL_reg_eval_set = 0;
1676 PL_reg_maxiter = 0;
1677
1678 if (prog->reganch & ROPT_UTF8)
1679 PL_reg_flags |= RF_utf8;
1680
1681 /* Mark beginning of line for ^ and lookbehind. */
1682 PL_regbol = startpos;
1683 PL_bostr = strbeg;
1684 PL_reg_sv = sv;
1685
1686 /* Mark end of line for $ (and such) */
1687 PL_regeol = strend;
1688
1689 /* see how far we have to get to not match where we matched before */
1690 PL_regtill = startpos+minend;
1691
1692 /* We start without call_cc context. */
1693 PL_reg_call_cc = 0;
1694
1695 /* If there is a "must appear" string, look for it. */
1696 s = startpos;
1697
1698 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1699 MAGIC *mg;
1700
1701 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1702 PL_reg_ganch = startpos;
1703 else if (sv && SvTYPE(sv) >= SVt_PVMG
1704 && SvMAGIC(sv)
14befaf4
DM
1705 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1706 && mg->mg_len >= 0) {
6eb5f6b9
JH
1707 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1708 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1709 if (s > PL_reg_ganch)
6eb5f6b9
JH
1710 goto phooey;
1711 s = PL_reg_ganch;
1712 }
1713 }
1714 else /* pos() not defined */
1715 PL_reg_ganch = strbeg;
1716 }
1717
a0714e2c 1718 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1719 re_scream_pos_data d;
1720
1721 d.scream_olds = &scream_olds;
1722 d.scream_pos = &scream_pos;
1723 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1724 if (!s) {
a3621e74 1725 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1726 goto phooey; /* not present */
3fa9c3d7 1727 }
6eb5f6b9
JH
1728 }
1729
a3621e74 1730 DEBUG_EXECUTE_r({
1df70142
AL
1731 const char * const s0 = UTF
1732 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1733 UNI_DISPLAY_REGEX)
1734 : prog->precomp;
1735 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1736 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1737 UNI_DISPLAY_REGEX) : startpos;
1df70142 1738 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1739 if (!PL_colorset)
1740 reginitcolors();
1741 PerlIO_printf(Perl_debug_log,
a0288114 1742 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1743 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1744 len0, len0, s0,
2a782b5b 1745 PL_colors[1],
9e55ce06 1746 len0 > 60 ? "..." : "",
2a782b5b 1747 PL_colors[0],
9e55ce06
JH
1748 (int)(len1 > 60 ? 60 : len1),
1749 s1, PL_colors[1],
1750 (len1 > 60 ? "..." : "")
2a782b5b
JH
1751 );
1752 });
6eb5f6b9
JH
1753
1754 /* Simplest case: anchored match need be tried only once. */
1755 /* [unless only anchor is BOL and multiline is set] */
1756 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1757 if (s == startpos && regtry(prog, startpos))
1758 goto got_it;
7fba1cd6 1759 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1760 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1761 {
1762 char *end;
1763
1764 if (minlen)
1765 dontbother = minlen - 1;
1aa99e6b 1766 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1767 /* for multiline we only have to try after newlines */
33b8afdf 1768 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1769 if (s == startpos)
1770 goto after_try;
1771 while (1) {
1772 if (regtry(prog, s))
1773 goto got_it;
1774 after_try:
1775 if (s >= end)
1776 goto phooey;
1777 if (prog->reganch & RE_USE_INTUIT) {
1778 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1779 if (!s)
1780 goto phooey;
1781 }
1782 else
1783 s++;
1784 }
1785 } else {
1786 if (s > startpos)
1787 s--;
1788 while (s < end) {
1789 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1790 if (regtry(prog, s))
1791 goto got_it;
1792 }
1793 }
1794 }
1795 }
1796 goto phooey;
1797 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1798 if (regtry(prog, PL_reg_ganch))
1799 goto got_it;
1800 goto phooey;
1801 }
1802
1803 /* Messy cases: unanchored match. */
33b8afdf 1804 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1805 /* we have /x+whatever/ */
1806 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1807 char ch;
bf93d4cc
GS
1808#ifdef DEBUGGING
1809 int did_match = 0;
1810#endif
33b8afdf
JH
1811 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1812 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1813 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1814
1aa99e6b 1815 if (do_utf8) {
6eb5f6b9
JH
1816 while (s < strend) {
1817 if (*s == ch) {
a3621e74 1818 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1819 if (regtry(prog, s)) goto got_it;
1820 s += UTF8SKIP(s);
1821 while (s < strend && *s == ch)
1822 s += UTF8SKIP(s);
1823 }
1824 s += UTF8SKIP(s);
1825 }
1826 }
1827 else {
1828 while (s < strend) {
1829 if (*s == ch) {
a3621e74 1830 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1831 if (regtry(prog, s)) goto got_it;
1832 s++;
1833 while (s < strend && *s == ch)
1834 s++;
1835 }
1836 s++;
1837 }
1838 }
a3621e74 1839 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1840 PerlIO_printf(Perl_debug_log,
b7953727
JH
1841 "Did not find anchored character...\n")
1842 );
6eb5f6b9 1843 }
a0714e2c
SS
1844 else if (prog->anchored_substr != NULL
1845 || prog->anchored_utf8 != NULL
1846 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1847 && prog->float_max_offset < strend - s)) {
1848 SV *must;
1849 I32 back_max;
1850 I32 back_min;
1851 char *last;
6eb5f6b9 1852 char *last1; /* Last position checked before */
bf93d4cc
GS
1853#ifdef DEBUGGING
1854 int did_match = 0;
1855#endif
33b8afdf
JH
1856 if (prog->anchored_substr || prog->anchored_utf8) {
1857 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1858 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1859 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1860 back_max = back_min = prog->anchored_offset;
1861 } else {
1862 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1863 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1864 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1865 back_max = prog->float_max_offset;
1866 back_min = prog->float_min_offset;
1867 }
1868 if (must == &PL_sv_undef)
1869 /* could not downgrade utf8 check substring, so must fail */
1870 goto phooey;
1871
1872 last = HOP3c(strend, /* Cannot start after this */
1873 -(I32)(CHR_SVLEN(must)
1874 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1875
1876 if (s > PL_bostr)
1877 last1 = HOPc(s, -1);
1878 else
1879 last1 = s - 1; /* bogus */
1880
a0288114 1881 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1882 check_substr==must. */
1883 scream_pos = -1;
1884 dontbother = end_shift;
1885 strend = HOPc(strend, -dontbother);
1886 while ( (s <= last) &&
9041c2e3 1887 ((flags & REXEC_SCREAM)
1aa99e6b 1888 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1889 end_shift, &scream_pos, 0))
1aa99e6b 1890 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1891 (unsigned char*)strend, must,
7fba1cd6 1892 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1893 /* we may be pointing at the wrong string */
1894 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1895 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1896 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1897 if (HOPc(s, -back_max) > last1) {
1898 last1 = HOPc(s, -back_min);
1899 s = HOPc(s, -back_max);
1900 }
1901 else {
1902 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1903
1904 last1 = HOPc(s, -back_min);
1905 s = t;
1906 }
1aa99e6b 1907 if (do_utf8) {
6eb5f6b9
JH
1908 while (s <= last1) {
1909 if (regtry(prog, s))
1910 goto got_it;
1911 s += UTF8SKIP(s);
1912 }
1913 }
1914 else {
1915 while (s <= last1) {
1916 if (regtry(prog, s))
1917 goto got_it;
1918 s++;
1919 }
1920 }
1921 }
a3621e74 1922 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1923 PerlIO_printf(Perl_debug_log,
a0288114 1924 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1925 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1926 ? "anchored" : "floating"),
1927 PL_colors[0],
1928 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1929 SvPVX_const(must),
b7953727
JH
1930 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1931 );
6eb5f6b9
JH
1932 goto phooey;
1933 }
155aba94 1934 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1935 if (minlen) {
1936 I32 op = (U8)OP(prog->regstclass);
66e933ab 1937 /* don't bother with what can't match */
f14c76ed
RGS
1938 if (PL_regkind[op] != EXACT && op != CANY)
1939 strend = HOPc(strend, -(minlen - 1));
1940 }
a3621e74 1941 DEBUG_EXECUTE_r({
ffc61ed2 1942 SV *prop = sv_newmortal();
cfd0369c
NC
1943 const char *s0;
1944 const char *s1;
9e55ce06
JH
1945 int len0;
1946 int len1;
1947
ffc61ed2 1948 regprop(prop, c);
9e55ce06 1949 s0 = UTF ?
3f7c398e 1950 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1951 UNI_DISPLAY_REGEX) :
cfd0369c 1952 SvPVX_const(prop);
9e55ce06
JH
1953 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1954 s1 = UTF ?
c728cb41 1955 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1956 len1 = UTF ? SvCUR(dsv1) : strend - s;
1957 PerlIO_printf(Perl_debug_log,
a0288114 1958 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1959 len0, len0, s0,
1960 len1, len1, s1);
ffc61ed2 1961 });
06b5626a 1962 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1963 goto got_it;
a3621e74 1964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1965 }
1966 else {
1967 dontbother = 0;
a0714e2c 1968 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1969 /* Trim the end. */
d6a28714 1970 char *last;
33b8afdf
JH
1971 SV* float_real;
1972
1973 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1974 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1975 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1976
1977 if (flags & REXEC_SCREAM) {
33b8afdf 1978 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1979 end_shift, &scream_pos, 1); /* last one */
1980 if (!last)
ffc61ed2 1981 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1982 /* we may be pointing at the wrong string */
1983 else if (RX_MATCH_COPIED(prog))
3f7c398e 1984 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1985 }
d6a28714
JH
1986 else {
1987 STRLEN len;
cfd0369c 1988 const char * const little = SvPV_const(float_real, len);
d6a28714 1989
33b8afdf 1990 if (SvTAIL(float_real)) {
d6a28714
JH
1991 if (memEQ(strend - len + 1, little, len - 1))
1992 last = strend - len + 1;
7fba1cd6 1993 else if (!multiline)
9041c2e3 1994 last = memEQ(strend - len, little, len)
bd61b366 1995 ? strend - len : NULL;
b8c5462f 1996 else
d6a28714
JH
1997 goto find_last;
1998 } else {
1999 find_last:
9041c2e3 2000 if (len)
d6a28714 2001 last = rninstr(s, strend, little, little + len);
b8c5462f 2002 else
a0288114 2003 last = strend; /* matching "$" */
b8c5462f 2004 }
b8c5462f 2005 }
bf93d4cc 2006 if (last == NULL) {
a3621e74 2007 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2008 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2009 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2010 goto phooey; /* Should not happen! */
2011 }
d6a28714
JH
2012 dontbother = strend - last + prog->float_min_offset;
2013 }
2014 if (minlen && (dontbother < minlen))
2015 dontbother = minlen - 1;
2016 strend -= dontbother; /* this one's always in bytes! */
2017 /* We don't know much -- general case. */
1aa99e6b 2018 if (do_utf8) {
d6a28714
JH
2019 for (;;) {
2020 if (regtry(prog, s))
2021 goto got_it;
2022 if (s >= strend)
2023 break;
b8c5462f 2024 s += UTF8SKIP(s);
d6a28714
JH
2025 };
2026 }
2027 else {
2028 do {
2029 if (regtry(prog, s))
2030 goto got_it;
2031 } while (s++ < strend);
2032 }
2033 }
2034
2035 /* Failure. */
2036 goto phooey;
2037
2038got_it:
2039 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2040
2041 if (PL_reg_eval_set) {
2042 /* Preserve the current value of $^R */
2043 if (oreplsv != GvSV(PL_replgv))
2044 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2045 restored, the value remains
2046 the same. */
acfe0abc 2047 restore_pos(aTHX_ 0);
d6a28714
JH
2048 }
2049
2050 /* make sure $`, $&, $', and $digit will work later */
2051 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2052 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2053 if (flags & REXEC_COPY_STR) {
2054 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2055#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2056 if ((SvIsCOW(sv)
2057 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2058 if (DEBUG_C_TEST) {
2059 PerlIO_printf(Perl_debug_log,
2060 "Copy on write: regexp capture, type %d\n",
2061 (int) SvTYPE(sv));
2062 }
2063 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2064 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2065 assert (SvPOKp(prog->saved_copy));
2066 } else
2067#endif
2068 {
2069 RX_MATCH_COPIED_on(prog);
2070 s = savepvn(strbeg, i);
2071 prog->subbeg = s;
2072 }
d6a28714 2073 prog->sublen = i;
d6a28714
JH
2074 }
2075 else {
2076 prog->subbeg = strbeg;
2077 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2078 }
2079 }
9041c2e3 2080
d6a28714
JH
2081 return 1;
2082
2083phooey:
a3621e74 2084 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2085 PL_colors[4], PL_colors[5]));
d6a28714 2086 if (PL_reg_eval_set)
acfe0abc 2087 restore_pos(aTHX_ 0);
d6a28714
JH
2088 return 0;
2089}
2090
2091/*
2092 - regtry - try match at specific point
2093 */
2094STATIC I32 /* 0 failure, 1 success */
2095S_regtry(pTHX_ regexp *prog, char *startpos)
2096{
97aff369 2097 dVAR;
d6a28714
JH
2098 register I32 i;
2099 register I32 *sp;
2100 register I32 *ep;
2101 CHECKPOINT lastcp;
a3621e74 2102 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2103
02db2b7b
IZ
2104#ifdef DEBUGGING
2105 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2106#endif
d6a28714
JH
2107 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2108 MAGIC *mg;
2109
2110 PL_reg_eval_set = RS_init;
a3621e74 2111 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2112 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2113 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2114 ));
e8347627 2115 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2116 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2117 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2118 SAVETMPS;
2119 /* Apparently this is not needed, judging by wantarray. */
e8347627 2120 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2121 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2122
2123 if (PL_reg_sv) {
2124 /* Make $_ available to executed code. */
2125 if (PL_reg_sv != DEFSV) {
59f00321 2126 SAVE_DEFSV;
d6a28714 2127 DEFSV = PL_reg_sv;
b8c5462f 2128 }
d6a28714 2129
9041c2e3 2130 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2131 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2132 /* prepare for quick setting of pos */
14befaf4 2133 sv_magic(PL_reg_sv, (SV*)0,
bd61b366 2134 PERL_MAGIC_regex_global, NULL, 0);
14befaf4 2135 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2136 mg->mg_len = -1;
b8c5462f 2137 }
d6a28714
JH
2138 PL_reg_magic = mg;
2139 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2140 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2141 }
09687e5a 2142 if (!PL_reg_curpm) {
a02a5408 2143 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2144#ifdef USE_ITHREADS
2145 {
2146 SV* repointer = newSViv(0);
577e12cc 2147 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2148 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2149 av_push(PL_regex_padav,repointer);
2150 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2151 PL_regex_pad = AvARRAY(PL_regex_padav);
2152 }
2153#endif
2154 }
aaa362c4 2155 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2156 PL_reg_oldcurpm = PL_curpm;
2157 PL_curpm = PL_reg_curpm;
2158 if (RX_MATCH_COPIED(prog)) {
2159 /* Here is a serious problem: we cannot rewrite subbeg,
2160 since it may be needed if this match fails. Thus
2161 $` inside (?{}) could fail... */
2162 PL_reg_oldsaved = prog->subbeg;
2163 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2164#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2165 PL_nrs = prog->saved_copy;
2166#endif
d6a28714
JH
2167 RX_MATCH_COPIED_off(prog);
2168 }
2169 else
bd61b366 2170 PL_reg_oldsaved = NULL;
d6a28714
JH
2171 prog->subbeg = PL_bostr;
2172 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2173 }
973dddac 2174 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2175 PL_reginput = startpos;
2176 PL_regstartp = prog->startp;
2177 PL_regendp = prog->endp;
2178 PL_reglastparen = &prog->lastparen;
a01268b5 2179 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2180 prog->lastparen = 0;
03994de8 2181 prog->lastcloseparen = 0;
d6a28714 2182 PL_regsize = 0;
a3621e74 2183 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2184 if (PL_reg_start_tmpl <= prog->nparens) {
2185 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2186 if(PL_reg_start_tmp)
2187 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2188 else
a02a5408 2189 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2190 }
2191
2192 /* XXXX What this code is doing here?!!! There should be no need
2193 to do this again and again, PL_reglastparen should take care of
3dd2943c 2194 this! --ilya*/
dafc8851
JH
2195
2196 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2197 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2198 * PL_reglastparen), is not needed at all by the test suite
2199 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2200 * enough, for building DynaLoader, or otherwise this
2201 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2202 * will happen. Meanwhile, this code *is* needed for the
2203 * above-mentioned test suite tests to succeed. The common theme
2204 * on those tests seems to be returning null fields from matches.
2205 * --jhi */
dafc8851 2206#if 1
d6a28714
JH
2207 sp = prog->startp;
2208 ep = prog->endp;
2209 if (prog->nparens) {
eb160463 2210 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2211 *++sp = -1;
2212 *++ep = -1;
2213 }
2214 }
dafc8851 2215#endif
02db2b7b 2216 REGCP_SET(lastcp);
d6a28714
JH
2217 if (regmatch(prog->program + 1)) {
2218 prog->endp[0] = PL_reginput - PL_bostr;
2219 return 1;
2220 }
02db2b7b 2221 REGCP_UNWIND(lastcp);
d6a28714
JH
2222 return 0;
2223}
2224
02db2b7b
IZ
2225#define RE_UNWIND_BRANCH 1
2226#define RE_UNWIND_BRANCHJ 2
2227
2228union re_unwind_t;
2229
2230typedef struct { /* XX: makes sense to enlarge it... */
2231 I32 type;
2232 I32 prev;
2233 CHECKPOINT lastcp;
2234} re_unwind_generic_t;
2235
2236typedef struct {
2237 I32 type;
2238 I32 prev;
2239 CHECKPOINT lastcp;
2240 I32 lastparen;
2241 regnode *next;
2242 char *locinput;
2243 I32 nextchr;
2244#ifdef DEBUGGING
2245 int regindent;
2246#endif
2247} re_unwind_branch_t;
2248
2249typedef union re_unwind_t {
2250 I32 type;
2251 re_unwind_generic_t generic;
2252 re_unwind_branch_t branch;
2253} re_unwind_t;
2254
8ba1375e
MJD
2255#define sayYES goto yes
2256#define sayNO goto no
e0f9d4a8 2257#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2258#define sayYES_FINAL goto yes_final
2259#define sayYES_LOUD goto yes_loud
2260#define sayNO_FINAL goto no_final
2261#define sayNO_SILENT goto do_no
2262#define saySAME(x) if (x) goto yes; else goto no
2263
3ab3c9b4
HS
2264#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2265#define POSCACHE_SEEN 1 /* we know what we're caching */
2266#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2267#define CACHEsayYES STMT_START { \
2268 if (cache_offset | cache_bit) { \
2269 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2270 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2271 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2272 /* cache records failure, but this is success */ \
2273 DEBUG_r( \
2274 PerlIO_printf(Perl_debug_log, \
2275 "%*s (remove success from failure cache)\n", \
2276 REPORT_CODE_OFF+PL_regindent*2, "") \
2277 ); \
2278 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2279 } \
2280 } \
2281 sayYES; \
2282} STMT_END
2283#define CACHEsayNO STMT_START { \
2284 if (cache_offset | cache_bit) { \
2285 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2286 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2287 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2288 /* cache records success, but this is failure */ \
2289 DEBUG_r( \
2290 PerlIO_printf(Perl_debug_log, \
2291 "%*s (remove failure from success cache)\n", \
2292 REPORT_CODE_OFF+PL_regindent*2, "") \
2293 ); \
2294 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2295 } \
2296 } \
2297 sayNO; \
2298} STMT_END
2299
a3621e74
YO
2300/* this is used to determine how far from the left messages like
2301 'failed...' are printed. Currently 29 makes these messages line
2302 up with the opcode they refer to. Earlier perls used 25 which
2303 left these messages outdented making reviewing a debug output
2304 quite difficult.
2305*/
2306#define REPORT_CODE_OFF 29
2307
2308
2309/* Make sure there is a test for this +1 options in re_tests */
2310#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2311
2312#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2313 if ( trie->states[ state ].wordnum ) { \
2314 if ( !accepted ) { \
2315 ENTER; \
2316 SAVETMPS; \
2317 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
561b68a9 2318 sv_accept_buff=newSV(bufflen * sizeof(reg_trie_accepted) - 1 );\
a3621e74
YO
2319 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2320 SvPOK_on( sv_accept_buff ); \
2321 sv_2mortal( sv_accept_buff ); \
2322 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2323 } else { \
2324 if ( accepted >= bufflen ) { \
2325 bufflen *= 2; \
2326 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2327 bufflen * sizeof(reg_trie_accepted) ); \
2328 } \
2329 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2330 + sizeof( reg_trie_accepted ) ); \
2331 } \
2332 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2333 accept_buff[ accepted ].endpos = uc; \
2334 ++accepted; \
2335 } } STMT_END
2336
2337#define TRIE_HANDLE_CHAR STMT_START { \
2338 if ( uvc < 256 ) { \
2339 charid = trie->charmap[ uvc ]; \
2340 } else { \
2341 charid = 0; \
2342 if( trie->widecharmap ) { \
2343 SV** svpp = (SV**)NULL; \
2344 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2345 sizeof( UV ), 0 ); \
2346 if ( svpp ) { \
2347 charid = (U16)SvIV( *svpp ); \
2348 } \
2349 } \
2350 } \
2351 if ( charid && \
cc601c31
YO
2352 ( base + charid > trie->uniquecharcount ) && \
2353 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2354 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2355 { \
2356 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2357 } else { \
2358 state = 0; \
2359 } \
2360 uc += len; \
2361 } STMT_END
8ba1375e 2362
d6a28714
JH
2363/*
2364 - regmatch - main matching routine
2365 *
2366 * Conceptually the strategy is simple: check to see whether the current
2367 * node matches, call self recursively to see whether the rest matches,
2368 * and then act accordingly. In practice we make some effort to avoid
2369 * recursion, in particular by going through "ordinary" nodes (that don't
2370 * need to know whether the rest of the match failed) by a loop instead of
2371 * by recursion.
2372 */
2373/* [lwall] I've hoisted the register declarations to the outer block in order to
2374 * maybe save a little bit of pushing and popping on the stack. It also takes
2375 * advantage of machines that use a register save mask on subroutine entry.
2376 */
2377STATIC I32 /* 0 failure, 1 success */
2378S_regmatch(pTHX_ regnode *prog)
2379{
27da23d5 2380 dVAR;
d6a28714
JH
2381 register regnode *scan; /* Current node. */
2382 regnode *next; /* Next node. */
2383 regnode *inner; /* Next node in internal branch. */
2384 register I32 nextchr; /* renamed nextchr - nextchar colides with
2385 function of same name */
2386 register I32 n; /* no or next */
b7953727 2387 register I32 ln = 0; /* len or last */
bd61b366 2388 register char *s = NULL; /* operand or save */
d6a28714 2389 register char *locinput = PL_reginput;
b7953727 2390 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2391 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2392 I32 unwind = 0;
a3621e74
YO
2393
2394 /* used by the trie code */
cbbf8932
AL
2395 SV *sv_accept_buff = NULL; /* accepting states we have traversed */
2396 reg_trie_accepted *accept_buff = NULL; /* "" */
ab74612d
NC
2397 reg_trie_data *trie; /* what trie are we using right now */
2398 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2399
b7953727 2400#if 0
02db2b7b 2401 I32 firstcp = PL_savestack_ix;
b7953727 2402#endif
0d46e09a 2403 register const bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2404#ifdef DEBUGGING
6136c704
AL
2405 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
2406 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2407 SV * const dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2408
ab74612d 2409 SV *re_debug_flags = NULL;
2a782b5b 2410#endif
041457d9 2411 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
02db2b7b 2412
a3621e74
YO
2413 GET_RE_DEBUG_FLAGS;
2414
d6a28714
JH
2415#ifdef DEBUGGING
2416 PL_regindent++;
2417#endif
2418
a3621e74 2419
d6a28714
JH
2420 /* Note that nextchr is a byte even in UTF */
2421 nextchr = UCHARAT(locinput);
2422 scan = prog;
2423 while (scan != NULL) {
8ba1375e 2424
a3621e74 2425 DEBUG_EXECUTE_r( {
6136c704 2426 SV * const prop = sv_newmortal();
1df70142
AL
2427 const int docolor = *PL_colors[0];
2428 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2429 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2430 /* The part of the string before starttry has one color
2431 (pref0_len chars), between starttry and current
2432 position another one (pref_len - pref0_len chars),
2433 after the current position the third one.
2434 We assume that pref0_len <= pref_len, otherwise we
2435 decrease pref0_len. */
9041c2e3 2436 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2437 ? (5 + taill) - l : locinput - PL_bostr;
2438 int pref0_len;
d6a28714 2439
df1ffd02 2440 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2441 pref_len++;
2442 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2443 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2444 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2445 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2446 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2447 l--;
d6a28714
JH
2448 if (pref0_len < 0)
2449 pref0_len = 0;
2450 if (pref0_len > pref_len)
2451 pref0_len = pref_len;
2452 regprop(prop, scan);
2a782b5b 2453 {
1df70142 2454 const char * const s0 =
f14c76ed 2455 do_utf8 && OP(scan) != CANY ?
2a782b5b 2456 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2457 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2458 locinput - pref_len;
1df70142
AL
2459 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2460 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2461 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2462 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2463 locinput - pref_len + pref0_len;
1df70142
AL
2464 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2465 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2466 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2467 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2468 locinput;
1df70142 2469 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2470 PerlIO_printf(Perl_debug_log,
2471 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2472 (IV)(locinput - PL_bostr),
2473 PL_colors[4],
2474 len0, s0,
2475 PL_colors[5],
2476 PL_colors[2],
2477 len1, s1,
2478 PL_colors[3],
2479 (docolor ? "" : "> <"),
2480 PL_colors[0],
2481 len2, s2,
2482 PL_colors[1],
2483 15 - l - pref_len + 1,
2484 "",
2485 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2486 SvPVX_const(prop));
2a782b5b
JH
2487 }
2488 });
d6a28714
JH
2489
2490 next = scan + NEXT_OFF(scan);
2491 if (next == scan)
2492 next = NULL;
2493
2494 switch (OP(scan)) {
2495 case BOL:
7fba1cd6 2496 if (locinput == PL_bostr)
d6a28714
JH
2497 {
2498 /* regtill = regbol; */
b8c5462f
JH
2499 break;
2500 }
d6a28714
JH
2501 sayNO;
2502 case MBOL:
12d33761
HS
2503 if (locinput == PL_bostr ||
2504 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2505 {
b8c5462f
JH
2506 break;
2507 }
d6a28714
JH
2508 sayNO;
2509 case SBOL:
c2a73568 2510 if (locinput == PL_bostr)
b8c5462f 2511 break;
d6a28714
JH
2512 sayNO;
2513 case GPOS:
2514 if (locinput == PL_reg_ganch)
2515 break;
2516 sayNO;
2517 case EOL:
d6a28714
JH
2518 goto seol;
2519 case MEOL:
d6a28714 2520 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2521 sayNO;
b8c5462f 2522 break;
d6a28714
JH
2523 case SEOL:
2524 seol:
2525 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2526 sayNO;
d6a28714 2527 if (PL_regeol - locinput > 1)
b8c5462f 2528 sayNO;
b8c5462f 2529 break;
d6a28714
JH
2530 case EOS:
2531 if (PL_regeol != locinput)
b8c5462f 2532 sayNO;
d6a28714 2533 break;
ffc61ed2 2534 case SANY:
d6a28714 2535 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2536 sayNO;
f33976b4
DB
2537 if (do_utf8) {
2538 locinput += PL_utf8skip[nextchr];
2539 if (locinput > PL_regeol)
2540 sayNO;
2541 nextchr = UCHARAT(locinput);
2542 }
2543 else
2544 nextchr = UCHARAT(++locinput);
2545 break;
2546 case CANY:
2547 if (!nextchr && locinput >= PL_regeol)
2548 sayNO;
b8c5462f 2549 nextchr = UCHARAT(++locinput);
a0d0e21e 2550 break;
ffc61ed2 2551 case REG_ANY:
1aa99e6b
IH
2552 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2553 sayNO;
2554 if (do_utf8) {
b8c5462f 2555 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2556 if (locinput > PL_regeol)
2557 sayNO;
a0ed51b3 2558 nextchr = UCHARAT(locinput);
a0ed51b3 2559 }
1aa99e6b
IH
2560 else
2561 nextchr = UCHARAT(++locinput);
a0ed51b3 2562 break;
a3621e74
YO
2563
2564
2565
2566 /*
2567 traverse the TRIE keeping track of all accepting states
2568 we transition through until we get to a failing node.
2569
2570 we use two slightly different pieces of code to handle
2571 the traversal depending on whether its case sensitive or
2572 not. we reuse the accept code however. (this should probably
2573 be turned into a macro.)
2574
2575 */
2576 case TRIEF:
2577 case TRIEFL:
2578 {
a3621e74
YO
2579 U8 *uc = ( U8* )locinput;
2580 U32 state = 1;
2581 U16 charid = 0;
2582 U32 base = 0;
2583 UV uvc = 0;
2584 STRLEN len = 0;
2585 STRLEN foldlen = 0;
a3621e74
YO
2586 U8 *uscan = (U8*)NULL;
2587 STRLEN bufflen=0;
2588 accepted = 0;
2589
2590 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2591
2592 while ( state && uc <= (U8*)PL_regeol ) {
2593
2594 TRIE_CHECK_STATE_IS_ACCEPTING;
2595
2596 base = trie->states[ state ].trans.base;
2597
2598 DEBUG_TRIE_EXECUTE_r(
2599 PerlIO_printf( Perl_debug_log,
e4584336 2600 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2601 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2602 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2603 );
2604
2605 if ( base ) {
2606
108bb1ad 2607 if ( do_utf8 ) {
a3621e74
YO
2608 if ( foldlen>0 ) {
2609 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2610 foldlen -= len;
2611 uscan += len;
2612 len=0;
2613 } else {
1df70142 2614 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2615 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2616 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2617 foldlen -= UNISKIP( uvc );
2618 uscan = foldbuf + UNISKIP( uvc );
2619 }
2620 } else {
e4584336 2621 uvc = (UV)*uc;
a3621e74
YO
2622 len = 1;
2623 }
2624
2625 TRIE_HANDLE_CHAR;
2626
2627 } else {
2628 state = 0;
2629 }
2630 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2631 PerlIO_printf( Perl_debug_log,
2632 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2633 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2634 );
2635 }
2636 if ( !accepted ) {
2637 sayNO;
2638 } else {
2639 goto TrieAccept;
2640 }
2641 }
2642 /* unreached codepoint: we jump into the middle of the next case
2643 from previous if blocks */
2644 case TRIE:
2645 {
a3621e74
YO
2646 U8 *uc = (U8*)locinput;
2647 U32 state = 1;
2648 U16 charid = 0;
2649 U32 base = 0;
2650 UV uvc = 0;
2651 STRLEN len = 0;
2652 STRLEN bufflen = 0;
2653 accepted = 0;
2654
2655 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2656
2657 while ( state && uc <= (U8*)PL_regeol ) {
2658
2659 TRIE_CHECK_STATE_IS_ACCEPTING;
2660
2661 base = trie->states[ state ].trans.base;
2662
2663 DEBUG_TRIE_EXECUTE_r(
2664 PerlIO_printf( Perl_debug_log,
e4584336 2665 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2666 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2667 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2668 );
2669
2670 if ( base ) {
2671
108bb1ad 2672 if ( do_utf8 ) {
a3621e74
YO
2673 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2674 } else {
2675 uvc = (U32)*uc;
2676 len = 1;
2677 }
2678
2679 TRIE_HANDLE_CHAR;
2680
2681 } else {
2682 state = 0;
2683 }
2684 DEBUG_TRIE_EXECUTE_r(
2685 PerlIO_printf( Perl_debug_log,
e4584336
RB
2686 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2687 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2688 );
2689 }
2690 if ( !accepted ) {
2691 sayNO;
2692 }
2693 }
2694
2695
2696 /*
2697 There was at least one accepting state that we
2698 transitioned through. Presumably the number of accepting
2699 states is going to be low, typically one or two. So we
2700 simply scan through to find the one with lowest wordnum.
2701 Once we find it, we swap the last state into its place
2702 and decrement the size. We then try to match the rest of
2703 the pattern at the point where the word ends, if we
2704 succeed then we end the loop, otherwise the loop
2705 eventually terminates once all of the accepting states
2706 have been tried.
2707 */
2708 TrieAccept:
2709 {
2710 int gotit = 0;
2711
2712 if ( accepted == 1 ) {
2713 DEBUG_EXECUTE_r({
2714 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2715 PerlIO_printf( Perl_debug_log,
2716 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2717 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2718 accept_buff[ 0 ].wordnum,
cfd0369c 2719 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2720 PL_colors[5] );
2721 });
cc601c31 2722 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2723 /* in this case we free tmps/leave before we call regmatch
2724 as we wont be using accept_buff again. */
2725 FREETMPS;
2726 LEAVE;
2727 gotit = regmatch( scan + NEXT_OFF( scan ) );
2728 } else {
2729 DEBUG_EXECUTE_r(
e4584336
RB
2730 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2731 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2732 PL_colors[5] );
2733 );
2734 while ( !gotit && accepted-- ) {
2735 U32 best = 0;
2736 U32 cur;
2737 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2738 DEBUG_TRIE_EXECUTE_r(
2739 PerlIO_printf( Perl_debug_log,
2740 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2741 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2742 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2743 accept_buff[ cur ].wordnum, PL_colors[5] );
2744 );
a3621e74
YO
2745
2746 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2747 best = cur;
2748 }
2749 DEBUG_EXECUTE_r({
6136c704 2750 SV ** const tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2751 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2752 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2753 accept_buff[best].wordnum,
cfd0369c 2754 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2755 PL_colors[5] );
2756 });
2757 if ( best<accepted ) {
2758 reg_trie_accepted tmp = accept_buff[ best ];
2759 accept_buff[ best ] = accept_buff[ accepted ];
2760 accept_buff[ accepted ] = tmp;
2761 best = accepted;
2762 }
cc601c31 2763 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2764
2765 /*
2766 as far as I can tell we only need the SAVETMPS/FREETMPS
2767 for re's with EVAL in them but I'm leaving them in for
2768 all until I can be sure.
2769 */
2770 SAVETMPS;
2771 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2772 FREETMPS;
2773 }
2774 FREETMPS;
2775 LEAVE;
2776 }
2777
2778 if ( gotit ) {
2779 sayYES;
2780 } else {
2781 sayNO;
2782 }
2783 }
2784 /* unreached codepoint */
d6a28714 2785 case EXACT:
cd439c50
IZ
2786 s = STRING(scan);
2787 ln = STR_LEN(scan);
eb160463 2788 if (do_utf8 != UTF) {
bc517b45 2789 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2790 char *l = locinput;
a3b680e6 2791 const char *e = s + ln;
a72c7584 2792
5ff6fc6d
JH
2793 if (do_utf8) {
2794 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2795 while (s < e) {
a3b680e6 2796 STRLEN ulen;
1aa99e6b 2797 if (l >= PL_regeol)
5ff6fc6d
JH
2798 sayNO;
2799 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2800 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2801 uniflags))
5ff6fc6d 2802 sayNO;
bc517b45 2803 l += ulen;
5ff6fc6d 2804 s ++;
1aa99e6b 2805 }
5ff6fc6d
JH
2806 }
2807 else {
2808 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2809 while (s < e) {
a3b680e6 2810 STRLEN ulen;
1aa99e6b
IH
2811 if (l >= PL_regeol)
2812 sayNO;
5ff6fc6d 2813 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2814 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2815 uniflags))
1aa99e6b 2816 sayNO;
bc517b45 2817 s += ulen;
a72c7584 2818 l ++;
1aa99e6b 2819 }
5ff6fc6d 2820 }
1aa99e6b
IH
2821 locinput = l;
2822 nextchr = UCHARAT(locinput);
2823 break;
2824 }
bc517b45 2825 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2826 /* Inline the first character, for speed. */
2827 if (UCHARAT(s) != nextchr)
2828 sayNO;
2829 if (PL_regeol - locinput < ln)
2830 sayNO;
2831 if (ln > 1 && memNE(s, locinput, ln))
2832 sayNO;
2833 locinput += ln;
2834 nextchr = UCHARAT(locinput);
2835 break;
2836 case EXACTFL:
b8c5462f
JH
2837 PL_reg_flags |= RF_tainted;
2838 /* FALL THROUGH */
d6a28714 2839 case EXACTF:
cd439c50
IZ
2840 s = STRING(scan);
2841 ln = STR_LEN(scan);
d6a28714 2842
d07ddd77
JH
2843 if (do_utf8 || UTF) {
2844 /* Either target or the pattern are utf8. */
d6a28714 2845 char *l = locinput;
d07ddd77 2846 char *e = PL_regeol;
bc517b45 2847
eb160463 2848 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2849 l, &e, 0, do_utf8)) {
5486206c
JH
2850 /* One more case for the sharp s:
2851 * pack("U0U*", 0xDF) =~ /ss/i,
2852 * the 0xC3 0x9F are the UTF-8
2853 * byte sequence for the U+00DF. */
2854 if (!(do_utf8 &&
2855 toLOWER(s[0]) == 's' &&
2856 ln >= 2 &&
2857 toLOWER(s[1]) == 's' &&
2858 (U8)l[0] == 0xC3 &&
2859 e - l >= 2 &&
2860 (U8)l[1] == 0x9F))
2861 sayNO;
2862 }
d07ddd77
JH
2863 locinput = e;
2864 nextchr = UCHARAT(locinput);
2865 break;
a0ed51b3 2866 }
d6a28714 2867
bc517b45
JH
2868 /* Neither the target and the pattern are utf8. */
2869
d6a28714
JH
2870 /* Inline the first character, for speed. */
2871 if (UCHARAT(s) != nextchr &&
2872 UCHARAT(s) != ((OP(scan) == EXACTF)
2873 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2874 sayNO;
d6a28714 2875 if (PL_regeol - locinput < ln)
b8c5462f 2876 sayNO;
d6a28714
JH
2877 if (ln > 1 && (OP(scan) == EXACTF
2878 ? ibcmp(s, locinput, ln)
2879 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2880 sayNO;
d6a28714
JH
2881 locinput += ln;
2882 nextchr = UCHARAT(locinput);
a0d0e21e 2883 break;
d6a28714 2884 case ANYOF:
ffc61ed2 2885 if (do_utf8) {
9e55ce06
JH
2886 STRLEN inclasslen = PL_regeol - locinput;
2887
ba7b4546 2888 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2889 sayNO_ANYOF;
ffc61ed2
JH
2890 if (locinput >= PL_regeol)
2891 sayNO;
0f0076b4 2892 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2893 nextchr = UCHARAT(locinput);
e0f9d4a8 2894 break;
ffc61ed2
JH
2895 }
2896 else {
2897 if (nextchr < 0)
2898 nextchr = UCHARAT(locinput);
7d3e948e 2899 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2900 sayNO_ANYOF;
ffc61ed2
JH
2901 if (!nextchr && locinput >= PL_regeol)
2902 sayNO;
2903 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2904 break;
2905 }
2906 no_anyof:
2907 /* If we might have the case of the German sharp s
2908 * in a casefolding Unicode character class. */
2909
ebc501f0
JH
2910 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2911 locinput += SHARP_S_SKIP;
e0f9d4a8 2912 nextchr = UCHARAT(locinput);
ffc61ed2 2913 }
e0f9d4a8
JH
2914 else
2915 sayNO;
b8c5462f 2916 break;
d6a28714 2917 case ALNUML:
b8c5462f
JH
2918 PL_reg_flags |= RF_tainted;
2919 /* FALL THROUGH */
d6a28714 2920 case ALNUM:
b8c5462f 2921 if (!nextchr)
4633a7c4 2922 sayNO;
ffc61ed2 2923 if (do_utf8) {
1a4fad37 2924 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2925 if (!(OP(scan) == ALNUM
3568d838 2926 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2927 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2928 {
2929 sayNO;
a0ed51b3 2930 }
b8c5462f 2931 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2932 nextchr = UCHARAT(locinput);
2933 break;
2934 }
ffc61ed2 2935 if (!(OP(scan) == ALNUM
d6a28714 2936 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2937 sayNO;
b8c5462f 2938 nextchr = UCHARAT(++locinput);
a0d0e21e 2939 break;
d6a28714 2940 case NALNUML:
b8c5462f
JH
2941 PL_reg_flags |= RF_tainted;
2942 /* FALL THROUGH */
d6a28714
JH
2943 case NALNUM:
2944 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2945 sayNO;
ffc61ed2 2946 if (do_utf8) {
1a4fad37 2947 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2948 if (OP(scan) == NALNUM
3568d838 2949 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2950 : isALNUM_LC_utf8((U8*)locinput))
2951 {
b8c5462f 2952 sayNO;
d6a28714 2953 }
b8c5462f
JH
2954 locinput += PL_utf8skip[nextchr];
2955 nextchr = UCHARAT(locinput);
2956 break;
2957 }
ffc61ed2 2958 if (OP(scan) == NALNUM
d6a28714 2959 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2960 sayNO;
76e3520e 2961 nextchr = UCHARAT(++locinput);
a0d0e21e 2962 break;
d6a28714
JH
2963 case BOUNDL:
2964 case NBOUNDL:
3280af22 2965 PL_reg_flags |= RF_tainted;
bbce6d69 2966 /* FALL THROUGH */
d6a28714
JH
2967 case BOUND:
2968 case NBOUND:
2969 /* was last char in word? */
ffc61ed2 2970 if (do_utf8) {
12d33761
HS
2971 if (locinput == PL_bostr)
2972 ln = '\n';
ffc61ed2 2973 else {
a3b680e6 2974 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2975
b4f7163a 2976 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2977 }
2978 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2979 ln = isALNUM_uni(ln);
1a4fad37 2980 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2981 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2982 }
2983 else {
9041c2e3 2984 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2985 n = isALNUM_LC_utf8((U8*)locinput);
2986 }
a0ed51b3 2987 }
d6a28714 2988 else {
12d33761
HS
2989 ln = (locinput != PL_bostr) ?
2990 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2991 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2992 ln = isALNUM(ln);
2993 n = isALNUM(nextchr);
2994 }
2995 else {
2996 ln = isALNUM_LC(ln);
2997 n = isALNUM_LC(nextchr);
2998 }
d6a28714 2999 }
ffc61ed2
JH
3000 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3001 OP(scan) == BOUNDL))
3002 sayNO;
a0ed51b3 3003 break;
d6a28714 3004 case SPACEL:
3280af22 3005 PL_reg_flags |= RF_tainted;
bbce6d69 3006 /* FALL THROUGH */
d6a28714 3007 case SPACE:
9442cb0e 3008 if (!nextchr)
4633a7c4 3009 sayNO;
1aa99e6b 3010 if (do_utf8) {
fd400ab9 3011 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3012 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3013 if (!(OP(scan) == SPACE
3568d838 3014 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3015 : isSPACE_LC_utf8((U8*)locinput)))
3016 {
3017 sayNO;
3018 }
3019 locinput += PL_utf8skip[nextchr];
3020 nextchr = UCHARAT(locinput);
3021 break;
d6a28714 3022 }
ffc61ed2
JH
3023 if (!(OP(scan) == SPACE
3024 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3025 sayNO;
3026 nextchr = UCHARAT(++locinput);
3027 }
3028 else {
3029 if (!(OP(scan) == SPACE
3030 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3031 sayNO;
3032 nextchr = UCHARAT(++locinput);
a0ed51b3 3033 }
a0ed51b3 3034 break;
d6a28714 3035 case NSPACEL:
3280af22 3036 PL_reg_flags |= RF_tainted;
bbce6d69 3037 /* FALL THROUGH */
d6a28714 3038 case NSPACE:
9442cb0e 3039 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3040 sayNO;
1aa99e6b 3041 if (do_utf8) {
1a4fad37 3042 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3043 if (OP(scan) == NSPACE
3568d838 3044 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3045 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3046 {
3047 sayNO;
3048 }
3049 locinput += PL_utf8skip[nextchr];
3050 nextchr = UCHARAT(locinput);
3051 break;
a0ed51b3 3052 }
ffc61ed2 3053 if (OP(scan) == NSPACE
d6a28714 3054 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3055 sayNO;
76e3520e 3056 nextchr = UCHARAT(++locinput);
a0d0e21e 3057 break;
d6a28714 3058 case DIGITL:
a0ed51b3
LW
3059 PL_reg_flags |= RF_tainted;
3060 /* FALL THROUGH */
d6a28714 3061 case DIGIT:
9442cb0e 3062 if (!nextchr)
a0ed51b3 3063 sayNO;
1aa99e6b 3064 if (do_utf8) {
1a4fad37 3065 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3066 if (!(OP(scan) == DIGIT
3568d838 3067 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3068 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3069 {
a0ed51b3 3070 sayNO;
dfe13c55 3071 }
6f06b55f 3072 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3073 nextchr = UCHARAT(locinput);
3074 break;
3075 }
ffc61ed2 3076 if (!(OP(scan) == DIGIT
9442cb0e 3077 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3078 sayNO;
3079 nextchr = UCHARAT(++locinput);
3080 break;
d6a28714 3081 case NDIGITL:
b8c5462f
JH
3082 PL_reg_flags |= RF_tainted;
3083 /* FALL THROUGH */
d6a28714 3084 case NDIGIT:
9442cb0e 3085 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3086 sayNO;
1aa99e6b 3087 if (do_utf8) {
1a4fad37 3088 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3089 if (OP(scan) == NDIGIT
3568d838 3090 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3091 : isDIGIT_LC_utf8((U8*)locinput))
3092 {
a0ed51b3 3093 sayNO;
9442cb0e 3094 }
6f06b55f 3095 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3096 nextchr = UCHARAT(locinput);
3097 break;
3098 }
ffc61ed2 3099 if (OP(scan) == NDIGIT
9442cb0e 3100 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3101 sayNO;
3102 nextchr = UCHARAT(++locinput);
3103 break;
3104 case CLUMP:
b7c83a7e 3105 if (locinput >= PL_regeol)
a0ed51b3 3106 sayNO;
b7c83a7e 3107 if (do_utf8) {
1a4fad37 3108 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3109 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3110 sayNO;
3111 locinput += PL_utf8skip[nextchr];
3112 while (locinput < PL_regeol &&
3113 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3114 locinput += UTF8SKIP(locinput);
3115 if (locinput > PL_regeol)
3116 sayNO;
eb08e2da
JH
3117 }
3118 else
3119 locinput++;
a0ed51b3
LW
3120 nextchr = UCHARAT(locinput);
3121 break;
c8756f30 3122 case REFFL:
3280af22 3123 PL_reg_flags |= RF_tainted;
c8756f30 3124 /* FALL THROUGH */
c277df42 3125 case REF:
c8756f30 3126 case REFF:
c277df42 3127 n = ARG(scan); /* which paren pair */
cf93c79d 3128 ln = PL_regstartp[n];
2c2d71f5 3129 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3130 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3131 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3132 if (ln == PL_regendp[n])
a0d0e21e 3133 break;
a0ed51b3 3134
cf93c79d 3135 s = PL_bostr + ln;
1aa99e6b 3136 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3137 char *l = locinput;
a3b680e6 3138 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3139 /*
3140 * Note that we can't do the "other character" lookup trick as
3141 * in the 8-bit case (no pun intended) because in Unicode we
3142 * have to map both upper and title case to lower case.
3143 */
3144 if (OP(scan) == REFF) {
3145 while (s < e) {
a3b680e6
AL
3146 STRLEN ulen1, ulen2;
3147 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3148 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3149
a0ed51b3
LW
3150 if (l >= PL_regeol)
3151 sayNO;
a2a2844f
JH
3152 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3153 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3154 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3155 sayNO;
a2a2844f
JH
3156 s += ulen1;
3157 l += ulen2;
a0ed51b3
LW
3158 }
3159 }
3160 locinput = l;
3161 nextchr = UCHARAT(locinput);
3162 break;
3163 }
3164
a0d0e21e 3165 /* Inline the first character, for speed. */
76e3520e 3166 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3167 (OP(scan) == REF ||
3168 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3169 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3170 sayNO;
cf93c79d 3171 ln = PL_regendp[n] - ln;
3280af22 3172 if (locinput + ln > PL_regeol)
4633a7c4 3173 sayNO;
c8756f30
AK
3174 if (ln > 1 && (OP(scan) == REF
3175 ? memNE(s, locinput, ln)
3176 : (OP(scan) == REFF
3177 ? ibcmp(s, locinput, ln)
3178 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3179 sayNO;
a0d0e21e 3180 locinput += ln;
76e3520e 3181 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3182 break;
3183
3184 case NOTHING:
c277df42 3185 case TAIL:
a0d0e21e
LW
3186 break;
3187 case BACK:
3188 break;
c277df42
IZ
3189 case EVAL:
3190 {
3191 dSP;
6136c704
AL
3192 OP_4tree * const oop = PL_op;
3193 COP * const ocurcop = PL_curcop;
f3548bdc 3194 PAD *old_comppad;
c277df42 3195 SV *ret;
6136c704 3196 struct regexp * const oreg = PL_reg_re;
9041c2e3 3197
c277df42 3198 n = ARG(scan);
533c011a 3199 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3200 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3201 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3202 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3203
8e5e9ebe 3204 {
6136c704 3205 SV ** const before = SP;
8e5e9ebe
RGS
3206 CALLRUNOPS(aTHX); /* Scalar context. */
3207 SPAGAIN;
3208 if (SP == before)
075aa684 3209 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3210 else {
3211 ret = POPs;
3212 PUTBACK;
3213 }
3214 }
3215
0f5d15d6 3216 PL_op = oop;
f3548bdc 3217 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3218 PL_curcop = ocurcop;
c277df42 3219 if (logical) {
0f5d15d6
IZ
3220 if (logical == 2) { /* Postponed subexpression. */
3221 regexp *re;
6136c704 3222 MAGIC *mg = NULL;
0f5d15d6 3223 re_cc_state state;
0f5d15d6 3224 CHECKPOINT cp, lastcp;
cb50f42d 3225 int toggleutf;
faf82a0b 3226 register SV *sv;
0f5d15d6 3227
faf82a0b
AE
3228 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3229 mg = mg_find(sv, PERL_MAGIC_qr);
3230 else if (SvSMAGICAL(ret)) {
3231 if (SvGMAGICAL(ret))
3232 sv_unmagic(ret, PERL_MAGIC_qr);
3233 else
3234 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3235 }
faf82a0b 3236
0f5d15d6
IZ
3237 if (mg) {
3238 re = (regexp *)mg->mg_obj;
df0003d4 3239 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3240 }
3241 else {
3242 STRLEN len;
6136c704 3243 const char * const t = SvPV_const(ret, len);
0f5d15d6 3244 PMOP pm;
a3b680e6
AL
3245 char * const oprecomp = PL_regprecomp;
3246 const I32 osize = PL_regsize;
3247 const I32 onpar = PL_regnpar;
0f5d15d6 3248
5fcd1c1b 3249 Zero(&pm, 1, PMOP);
cb50f42d 3250 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3251 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3252 if (!(SvFLAGS(ret)
faf82a0b
AE
3253 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3254 | SVs_GMG)))
14befaf4
DM
3255 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3256 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3257 PL_regprecomp = oprecomp;
3258 PL_regsize = osize;
3259 PL_regnpar = onpar;
3260 }
a3621e74 3261 DEBUG_EXECUTE_r(
9041c2e3 3262 PerlIO_printf(Perl_debug_log,
a0288114 3263 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3264 PL_colors[0],
3265 re->precomp,
3266 PL_colors[1],
3267 (strlen(re->precomp) > 60 ? "..." : ""))
3268 );
3269 state.node = next;
3270 state.prev = PL_reg_call_cc;
3271 state.cc = PL_regcc;
3272 state.re = PL_reg_re;
3273
2ab05381 3274 PL_regcc = 0;
9041c2e3 3275
0f5d15d6 3276 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3277 REGCP_SET(lastcp);
0f5d15d6
IZ
3278 cache_re(re);
3279 state.ss = PL_savestack_ix;
3280 *PL_reglastparen = 0;
a01268b5 3281 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3282 PL_reg_call_cc = &state;
3283 PL_reginput = locinput;
cb50f42d
YST
3284 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3285 ((re->reganch & ROPT_UTF8) != 0);
3286 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3287
3288 /* XXXX This is too dramatic a measure... */
3289 PL_reg_maxiter = 0;
3290
0f5d15d6 3291 if (regmatch(re->program + 1)) {
2c914db6
IZ
3292 /* Even though we succeeded, we need to restore
3293 global variables, since we may be wrapped inside
3294 SUSPEND, thus the match may be not finished yet. */
3295
3296 /* XXXX Do this only if SUSPENDed? */
3297 PL_reg_call_cc = state.prev;
3298 PL_regcc = state.cc;
3299 PL_reg_re = state.re;
3300 cache_re(PL_reg_re);
cb50f42d 3301 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3302
3303 /* XXXX This is too dramatic a measure... */
3304 PL_reg_maxiter = 0;
3305
3306 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3307 ReREFCNT_dec(re);
3308 regcpblow(cp);
3309 sayYES;
3310 }
0f5d15d6 3311 ReREFCNT_dec(re);
02db2b7b 3312 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3313 regcppop();
3314 PL_reg_call_cc = state.prev;
3315 PL_regcc = state.cc;
3316 PL_reg_re = state.re;
d3790889 3317 cache_re(PL_reg_re);
cb50f42d 3318 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3319
3320 /* XXXX This is too dramatic a measure... */
3321 PL_reg_maxiter = 0;
3322
8e514ae6 3323 logical = 0;
0f5d15d6
IZ
3324 sayNO;
3325 }
c277df42 3326 sw = SvTRUE(ret);
0f5d15d6 3327 logical = 0;
a0ed51b3 3328 }
080c2dec 3329 else {
3280af22 3330 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3331 cache_re(oreg);
3332 }
c277df42
IZ
3333 break;
3334 }
a0d0e21e 3335 case OPEN:
c277df42 3336 n = ARG(scan); /* which paren pair */
3280af22
NIS
3337 PL_reg_start_tmp[n] = locinput;
3338 if (n > PL_regsize)
3339 PL_regsize = n;
a0d0e21e
LW
3340 break;
3341 case CLOSE:
c277df42 3342 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3343 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3344 PL_regendp[n] = locinput - PL_bostr;
eb160463 3345 if (n > (I32)*PL_reglastparen)
3280af22 3346 *PL_reglastparen = n;
a01268b5 3347 *PL_reglastcloseparen = n;
a0d0e21e 3348 break;
c277df42
IZ
3349 case GROUPP:
3350 n = ARG(scan); /* which paren pair */
eb160463 3351 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3352 break;
3353 case IFTHEN:
2c2d71f5 3354 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3355 if (sw)
3356 next = NEXTOPER(NEXTOPER(scan));
3357 else {
3358 next = scan + ARG(scan);
3359 if (OP(next) == IFTHEN) /* Fake one. */
3360 next = NEXTOPER(NEXTOPER(next));
3361 }
3362 break;
3363 case LOGICAL:
0f5d15d6 3364 logical = scan->flags;
c277df42 3365 break;
2ab05381
IZ
3366/*******************************************************************
3367 PL_regcc contains infoblock about the innermost (...)* loop, and
3368 a pointer to the next outer infoblock.
3369
3370 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3371
3372 1) After matching X, regnode for CURLYX is processed;
3373
9041c2e3 3374 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3375 regmatch() recursively with the starting point at WHILEM node;
3376
3377 3) Each hit of WHILEM node tries to match A and Z (in the order
3378 depending on the current iteration, min/max of {min,max} and
3379 greediness). The information about where are nodes for "A"
3380 and "Z" is read from the infoblock, as is info on how many times "A"
3381 was already matched, and greediness.
3382
3383 4) After A matches, the same WHILEM node is hit again.
3384
3385 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3386 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3387 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3388 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3389 of the external loop.
3390
3391 Currently present infoblocks form a tree with a stem formed by PL_curcc
3392 and whatever it mentions via ->next, and additional attached trees
3393 corresponding to temporarily unset infoblocks as in "5" above.
3394
9041c2e3 3395 In the following picture infoblocks for outer loop of
2ab05381
IZ
3396 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3397 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3398 infoblocks are drawn below the "reset" infoblock.
3399
3400 In fact in the picture below we do not show failed matches for Z and T
3401 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3402 more obvious *why* one needs to *temporary* unset infoblocks.]
3403
3404 Matched REx position InfoBlocks Comment
3405 (Y(A)*?Z)*?T x
3406 Y(A)*?Z)*?T x <- O
3407 Y (A)*?Z)*?T x <- O
3408 Y A)*?Z)*?T x <- O <- I
3409 YA )*?Z)*?T x <- O <- I
3410 YA A)*?Z)*?T x <- O <- I
3411 YAA )*?Z)*?T x <- O <- I
3412 YAA Z)*?T x <- O # Temporary unset I
3413 I
3414
3415 YAAZ Y(A)*?Z)*?T x <- O
3416 I
3417
3418 YAAZY (A)*?Z)*?T x <- O
3419 I
3420
3421 YAAZY A)*?Z)*?T x <- O <- I
3422 I
3423
3424 YAAZYA )*?Z)*?T x <- O <- I
3425 I
3426
3427 YAAZYA Z)*?T x <- O # Temporary unset I
3428 I,I
3429
3430 YAAZYAZ )*?T x <- O
3431 I,I
3432
3433 YAAZYAZ T x # Temporary unset O
3434 O
3435 I,I
3436
3437 YAAZYAZT x
3438 O
3439 I,I
3440 *******************************************************************/
a0d0e21e
LW
3441 case CURLYX: {
3442 CURCUR cc;
3280af22 3443 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3444 /* No need to save/restore up to this paren */
3445 I32 parenfloor = scan->flags;
c277df42
IZ
3446
3447 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3448 next += ARG(next);
3280af22
NIS
3449 cc.oldcc = PL_regcc;
3450 PL_regcc = &cc;
cb434fcc
IZ
3451 /* XXXX Probably it is better to teach regpush to support
3452 parenfloor > PL_regsize... */
eb160463 3453 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3454 parenfloor = *PL_reglastparen; /* Pessimization... */
3455 cc.parenfloor = parenfloor;
a0d0e21e
LW
3456 cc.cur = -1;
3457 cc.min = ARG1(scan);
3458 cc.max = ARG2(scan);
c277df42 3459 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3460 cc.next = next;
3461 cc.minmod = minmod;
3462 cc.lastloc = 0;
3280af22 3463 PL_reginput = locinput;
a0d0e21e
LW
3464 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3465 regcpblow(cp);
3280af22 3466 PL_regcc = cc.oldcc;
4633a7c4 3467 saySAME(n);
a0d0e21e 3468 }
5f66b61c 3469 /* NOTREACHED */
a0d0e21e
LW
3470 case WHILEM: {
3471 /*
3472 * This is really hard to understand, because after we match
3473 * what we're trying to match, we must make sure the rest of
2c2d71f5 3474 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3475 * to go back UP the parse tree by recursing ever deeper. And
3476 * if it fails, we have to reset our parent's current state
3477 * that we can try again after backing off.
3478 */
3479
c277df42 3480 CHECKPOINT cp, lastcp;
3280af22 3481 CURCUR* cc = PL_regcc;
6136c704 3482 char * const lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3483 I32 cache_offset = 0, cache_bit = 0;
c277df42 3484
4633a7c4 3485 n = cc->cur + 1; /* how many we know we matched */
3280af22 3486 PL_reginput = locinput;
a0d0e21e 3487
a3621e74 3488 DEBUG_EXECUTE_r(
9041c2e3 3489 PerlIO_printf(Perl_debug_log,
91f3b821 3490 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3491 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3492 (long)n, (long)cc->min,
2797576d 3493 (long)cc->max, PTR2UV(cc))
c277df42 3494 );
4633a7c4 3495
a0d0e21e
LW
3496 /* If degenerate scan matches "", assume scan done. */
3497
579cf2c3 3498 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3499 PL_regcc = cc->oldcc;
2ab05381
IZ
3500 if (PL_regcc)
3501 ln = PL_regcc->cur;
a3621e74 3502 DEBUG_EXECUTE_r(
c3464db5
DD
3503 PerlIO_printf(Perl_debug_log,
3504 "%*s empty match detected, try continuation...\n",
3280af22 3505 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3506 );
a0d0e21e 3507 if (regmatch(cc->next))
4633a7c4 3508 sayYES;
2ab05381
IZ
3509 if (PL_regcc)
3510 PL_regcc->cur = ln;
3280af22 3511 PL_regcc = cc;
4633a7c4 3512 sayNO;
a0d0e21e
LW
3513 }
3514
3515 /* First just match a string of min scans. */
3516
3517 if (n < cc->min) {
3518 cc->cur = n;
3519 cc->lastloc = locinput;
4633a7c4
LW
3520 if (regmatch(cc->scan))
3521 sayYES;
3522 cc->cur = n - 1;
c277df42 3523 cc->lastloc = lastloc;
4633a7c4 3524 sayNO;
a0d0e21e
LW
3525 }
3526
2c2d71f5
JH
3527 if (scan->flags) {
3528 /* Check whether we already were at this position.
3529 Postpone detection until we know the match is not
3530 *that* much linear. */
3531 if (!PL_reg_maxiter) {
3532 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3533 PL_reg_leftiter = PL_reg_maxiter;
3534 }
3535 if (PL_reg_leftiter-- == 0) {
a3b680e6 3536 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3537 if (PL_reg_poscache) {
eb160463 3538 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3539 Renew(PL_reg_poscache, size, char);
3540 PL_reg_poscache_size = size;
3541 }
3542 Zero(PL_reg_poscache, size, char);
3543 }
3544 else {
3545 PL_reg_poscache_size = size;
a02a5408 3546 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3547 }
a3621e74 3548 DEBUG_EXECUTE_r(
2c2d71f5
JH
3549 PerlIO_printf(Perl_debug_log,
3550 "%sDetected a super-linear match, switching on caching%s...\n",
3551 PL_colors[4], PL_colors[5])
3552 );
3553 }
3554 if (PL_reg_leftiter < 0) {
3ab3c9b4 3555 cache_offset = locinput - PL_bostr;
2c2d71f5 3556
3ab3c9b4
HS
3557 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3558 + cache_offset * (scan->flags>>4);
3559 cache_bit = cache_offset % 8;
3560 cache_offset /= 8;
3561 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
a3621e74 3562 DEBUG_EXECUTE_r(
2c2d71f5
JH
3563 PerlIO_printf(Perl_debug_log,
3564 "%*s already tried at this position...\n",
3565 REPORT_CODE_OFF+PL_regindent*2, "")
3566 );
3ab3c9b4
HS
3567 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3568 /* cache records success */
c2b0868c
HS
3569 sayYES;
3570 else
3ab3c9b4 3571 /* cache records failure */
c2b0868c 3572 sayNO_SILENT;
2c2d71f5 3573 }
3ab3c9b4 3574 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
2c2d71f5
JH
3575 }
3576 }
3577
a0d0e21e
LW
3578 /* Prefer next over scan for minimal matching. */
3579
3580 if (cc->minmod) {
3280af22 3581 PL_regcc = cc->oldcc;
2ab05381
IZ
3582 if (PL_regcc)
3583 ln = PL_regcc->cur;
5f05dabc 3584 cp = regcppush(cc->parenfloor);
02db2b7b 3585 REGCP_SET(lastcp);
5f05dabc 3586 if (regmatch(cc->next)) {
c277df42 3587 regcpblow(cp);
3ab3c9b4 3588 CACHEsayYES; /* All done. */
5f05dabc 3589 }
02db2b7b 3590 REGCP_UNWIND(lastcp);
5f05dabc 3591 regcppop();
2ab05381
IZ
3592 if (PL_regcc)
3593 PL_regcc->cur = ln;
3280af22 3594 PL_regcc = cc;
a0d0e21e 3595
c277df42 3596 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3597 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3598 && !(PL_reg_flags & RF_warned)) {
3599 PL_reg_flags |= RF_warned;
9014280d 3600 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3601 "Complex regular subexpression recursion",
3602 REG_INFTY - 1);
c277df42 3603 }
3ab3c9b4 3604 CACHEsayNO;
c277df42 3605 }
a687059c 3606
a3621e74 3607 DEBUG_EXECUTE_r(
c3464db5
DD
3608 PerlIO_printf(Perl_debug_log,
3609 "%*s trying longer...\n",
3280af22 3610 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3611 );
a0d0e21e 3612 /* Try scanning more and see if it helps. */
3280af22 3613 PL_reginput = locinput;
a0d0e21e
LW
3614 cc->cur = n;
3615 cc->lastloc = locinput;
5f05dabc 3616 cp = regcppush(cc->parenfloor);
02db2b7b 3617 REGCP_SET(lastcp);
5f05dabc 3618 if (regmatch(cc->scan)) {
c277df42 3619 regcpblow(cp);
3ab3c9b4 3620 CACHEsayYES;
5f05dabc 3621 }
02db2b7b 3622 REGCP_UNWIND(lastcp);
5f05dabc 3623 regcppop();
4633a7c4 3624 cc->cur = n - 1;
c277df42 3625 cc->lastloc = lastloc;
3ab3c9b4 3626 CACHEsayNO;
a0d0e21e
LW
3627 }
3628
3629 /* Prefer scan over next for maximal matching. */
3630
3631 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3632 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3633 cc->cur = n;
3634 cc->lastloc = locinput;
02db2b7b 3635 REGCP_SET(lastcp);
5f05dabc 3636 if (regmatch(cc->scan)) {
c277df42 3637 regcpblow(cp);
3ab3c9b4 3638 CACHEsayYES;
5f05dabc 3639 }
02db2b7b 3640 REGCP_UNWIND(lastcp);
a0d0e21e 3641 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3642 PL_reginput = locinput;
a3621e74 3643 DEBUG_EXECUTE_r(
c3464db5
DD
3644 PerlIO_printf(Perl_debug_log,
3645 "%*s failed, try continuation...\n",
3280af22 3646 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3647 );
3648 }
9041c2e3 3649 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3650 && !(PL_reg_flags & RF_warned)) {
3280af22 3651 PL_reg_flags |= RF_warned;
9014280d 3652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3653 "Complex regular subexpression recursion",
3654 REG_INFTY - 1);
a0d0e21e
LW
3655 }
3656
3657 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3658 PL_regcc = cc->oldcc;
2ab05381
IZ
3659 if (PL_regcc)
3660 ln = PL_regcc->cur;
a0d0e21e 3661 if (regmatch(cc->next))
3ab3c9b4 3662 CACHEsayYES;
2ab05381
IZ
3663 if (PL_regcc)
3664 PL_regcc->cur = ln;
3280af22 3665 PL_regcc = cc;
4633a7c4 3666 cc->cur = n - 1;
c277df42 3667 cc->lastloc = lastloc;
3ab3c9b4 3668 CACHEsayNO;
a0d0e21e 3669 }
5f66b61c 3670 /* NOTREACHED */
9041c2e3 3671 case BRANCHJ:
c277df42
IZ
3672 next = scan + ARG(scan);
3673 if (next == scan)
3674 next = NULL;
3675 inner = NEXTOPER(NEXTOPER(scan));
3676 goto do_branch;
9041c2e3 3677 case BRANCH:
c277df42
IZ
3678 inner = NEXTOPER(scan);
3679 do_branch:
3680 {
c277df42
IZ
3681 c1 = OP(scan);
3682 if (OP(next) != c1) /* No choice. */
3683 next = inner; /* Avoid recursion. */
a0d0e21e 3684 else {
a3b680e6 3685 const I32 lastparen = *PL_reglastparen;
02db2b7b 3686 /* Put unwinding data on stack */
6136c704
AL
3687 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3688 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3689
02db2b7b
IZ
3690 uw->prev = unwind;
3691 unwind = unwind1;
3692 uw->type = ((c1 == BRANCH)
3693 ? RE_UNWIND_BRANCH
3694 : RE_UNWIND_BRANCHJ);
3695 uw->lastparen = lastparen;
3696 uw->next = next;
3697 uw->locinput = locinput;
3698 uw->nextchr = nextchr;
3699#ifdef DEBUGGING
3700 uw->regindent = ++PL_regindent;
3701#endif
c277df42 3702
02db2b7b
IZ
3703 REGCP_SET(uw->lastcp);
3704
3705 /* Now go into the first branch */
3706 next = inner;
a687059c 3707 }
a0d0e21e
LW
3708 }
3709 break;
3710 case MINMOD:
3711 minmod = 1;
3712 break;
c277df42
IZ
3713 case CURLYM:
3714 {
00db4c45 3715 I32 l = 0;
c277df42 3716 CHECKPOINT lastcp;
9041c2e3 3717
c277df42 3718 /* We suppose that the next guy does not need
0e788c72 3719 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3720 and has no parenths to influence future backrefs. */
3721 ln = ARG1(scan); /* min to match */
3722 n = ARG2(scan); /* max to match */
c277df42
IZ
3723 paren = scan->flags;
3724 if (paren) {
3280af22
NIS
3725 if (paren > PL_regsize)
3726 PL_regsize = paren;
eb160463 3727 if (paren > (I32)*PL_reglastparen)
3280af22 3728 *PL_reglastparen = paren;
c277df42 3729 }
dc45a647 3730 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3731 if (paren)
3732 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3733 PL_reginput = locinput;
c277df42
IZ
3734 if (minmod) {
3735 minmod = 0;
3736 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3737 sayNO;
3280af22 3738 locinput = PL_reginput;
cca55fe3 3739 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3740 regnode *text_node = next;
3741
cca55fe3 3742 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3743
cca55fe3 3744 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3745 else {
cca55fe3 3746 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3747 c1 = c2 = -1000;
3748 goto assume_ok_MM;
cca55fe3
JP
3749 }
3750 else { c1 = (U8)*STRING(text_node); }
af5decee 3751 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3752 c2 = PL_fold[c1];
af5decee 3753 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3754 c2 = PL_fold_locale[c1];
3755 else
3756 c2 = c1;
3757 }
a0ed51b3
LW
3758 }
3759 else
c277df42 3760 c1 = c2 = -1000;
cca55fe3 3761 assume_ok_MM:
02db2b7b 3762 REGCP_SET(lastcp);
0e788c72 3763 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3764 /* If it could work, try it. */
3765 if (c1 == -1000 ||
3280af22
NIS
3766 UCHARAT(PL_reginput) == c1 ||
3767 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3768 {
3769 if (paren) {
f31a99c8 3770 if (ln) {
cf93c79d
IZ
3771 PL_regstartp[paren] =
3772 HOPc(PL_reginput, -l) - PL_bostr;
3773 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3774 }
3775 else
cf93c79d 3776 PL_regendp[paren] = -1;
c277df42
IZ
3777 }
3778 if (regmatch(next))
3779 sayYES;
02db2b7b 3780 REGCP_UNWIND(lastcp);
c277df42
IZ
3781 }
3782 /* Couldn't or didn't -- move forward. */
3280af22 3783 PL_reginput = locinput;
c277df42
IZ
3784 if (regrepeat_hard(scan, 1, &l)) {
3785 ln++;
3280af22 3786 locinput = PL_reginput;
c277df42
IZ
3787 }
3788 else
3789 sayNO;
3790 }
a0ed51b3
LW
3791 }
3792 else {
c277df42 3793 n = regrepeat_hard(scan, n, &l);
3280af22 3794 locinput = PL_reginput;
a3621e74 3795 DEBUG_EXECUTE_r(
5c0ca799 3796 PerlIO_printf(Perl_debug_log,
faccc32b 3797 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3798 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3799 (IV) n, (IV)l)
c277df42
IZ
3800 );
3801 if (n >= ln) {
cca55fe3 3802 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3803 regnode *text_node = next;
3804
cca55fe3 3805 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3806
cca55fe3 3807 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3808 else {
cca55fe3 3809 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3810 c1 = c2 = -1000;
3811 goto assume_ok_REG;
cca55fe3
JP
3812 }
3813 else { c1 = (U8)*STRING(text_node); }
3814
af5decee 3815 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3816 c2 = PL_fold[c1];
af5decee 3817 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3818 c2 = PL_fold_locale[c1];
3819 else
3820 c2 = c1;
3821 }
a0ed51b3
LW
3822 }
3823 else
c277df42
IZ
3824 c1 = c2 = -1000;
3825 }
cca55fe3 3826 assume_ok_REG:
02db2b7b 3827 REGCP_SET(lastcp);
c277df42
IZ
3828 while (n >= ln) {
3829 /* If it could work, try it. */
3830 if (c1 == -1000 ||
3280af22
NIS
3831 UCHARAT(PL_reginput) == c1 ||
3832 UCHARAT(PL_reginput) == c2)
a0ed51b3 3833 {
a3621e74 3834 DEBUG_EXECUTE_r(
c3464db5 3835 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3836 "%*s trying tail with n=%"IVdf"...\n",
3837 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3838 );
3839 if (paren) {
3840 if (n) {
cf93c79d
IZ
3841 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3842 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3843 }
a0ed51b3 3844 else
cf93c79d 3845 PL_regendp[paren] = -1;
c277df42 3846 }
a0ed51b3
LW
3847 if (regmatch(next))
3848 sayYES;
02db2b7b 3849 REGCP_UNWIND(lastcp);
a0ed51b3 3850 }
c277df42
IZ
3851 /* Couldn't or didn't -- back up. */
3852 n--;
dfe13c55 3853 locinput = HOPc(locinput, -l);
3280af22 3854 PL_reginput = locinput;
c277df42
IZ
3855 }
3856 }
3857 sayNO;
5f66b61c 3858 /* NOTREACHED */
c277df42
IZ
3859 break;
3860 }
3861 case CURLYN:
3862 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3863 if (paren > PL_regsize)
3864 PL_regsize = paren;
eb160463 3865 if (paren > (I32)*PL_reglastparen)
3280af22 3866 *PL_reglastparen = paren;
c277df42
IZ
3867 ln = ARG1(scan); /* min to match */
3868 n = ARG2(scan); /* max to match */
dc45a647 3869 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3870 goto repeat;
a0d0e21e 3871 case CURLY:
c277df42 3872 paren = 0;
a0d0e21e
LW
3873 ln = ARG1(scan); /* min to match */
3874 n = ARG2(scan); /* max to match */
dc45a647 3875 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3876 goto repeat;
3877 case STAR:
3878 ln = 0;
c277df42 3879 n = REG_INFTY;
a0d0e21e 3880 scan = NEXTOPER(scan);
c277df42 3881 paren = 0;
a0d0e21e
LW
3882 goto repeat;
3883 case PLUS:
c277df42
IZ
3884 ln = 1;
3885 n = REG_INFTY;
3886 scan = NEXTOPER(scan);
3887 paren = 0;
3888 repeat:
a0d0e21e
LW
3889 /*
3890 * Lookahead to avoid useless match attempts
3891 * when we know what character comes next.
3892 */
5f80c4cf
JP
3893
3894 /*
3895 * Used to only do .*x and .*?x, but now it allows
3896 * for )'s, ('s and (?{ ... })'s to be in the way
3897 * of the quantifier and the EXACT-like node. -- japhy
3898 */
3899
cca55fe3 3900 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3901 U8 *s;
3902 regnode *text_node = next;
3903
cca55fe3 3904 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3905
cca55fe3 3906 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3907 else {
cca55fe3 3908 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3909 c1 = c2 = -1000;
3910 goto assume_ok_easy;
cca55fe3
JP
3911 }
3912 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3913
3914 if (!UTF) {
3915 c2 = c1 = *s;
f65d3ee7 3916 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3917 c2 = PL_fold[c1];
f65d3ee7 3918 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3919 c2 = PL_fold_locale[c1];
1aa99e6b 3920 }
5f80c4cf 3921 else { /* UTF */
f65d3ee7 3922 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3923 STRLEN ulen1, ulen2;
89ebb4a3
JH
3924 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3925 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3926
3927 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3928 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3929
89ebb4a3 3930 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 3931 uniflags);
89ebb4a3 3932 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 3933 uniflags);
5f80c4cf
JP
3934 }
3935 else {
89ebb4a3 3936 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 3937 uniflags);
5f80c4cf 3938 }
1aa99e6b
IH
3939 }
3940 }
bbce6d69 3941 }
a0d0e21e 3942 else
bbce6d69 3943 c1 = c2 = -1000;
cca55fe3 3944 assume_ok_easy:
3280af22 3945 PL_reginput = locinput;
a0d0e21e 3946 if (minmod) {
c277df42 3947 CHECKPOINT lastcp;
a0d0e21e
LW
3948 minmod = 0;
3949 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3950 sayNO;
a0ed51b3 3951 locinput = PL_reginput;
02db2b7b 3952 REGCP_SET(lastcp);
0fe9bf95 3953 if (c1 != -1000) {
1aa99e6b 3954 char *e; /* Should not check after this */
0fe9bf95 3955 char *old = locinput;
b2f2f093 3956 int count = 0;
0fe9bf95 3957
1aa99e6b 3958 if (n == REG_INFTY) {
0fe9bf95 3959 e = PL_regeol - 1;
1aa99e6b
IH
3960 if (do_utf8)
3961 while (UTF8_IS_CONTINUATION(*(U8*)e))
3962 e--;
3963 }
3964 else if (do_utf8) {
3965 int m = n - ln;
3966 for (e = locinput;
3967 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3968 e += UTF8SKIP(e);
3969 }
3970 else {
3971 e = locinput + n - ln;
3972 if (e >= PL_regeol)
3973 e = PL_regeol - 1;
3974 }
0fe9bf95
IZ
3975 while (1) {
3976 /* Find place 'next' could work */
1aa99e6b
IH
3977 if (!do_utf8) {
3978 if (c1 == c2) {
a8e8ab15
JH
3979 while (locinput <= e &&
3980 UCHARAT(locinput) != c1)
1aa99e6b
IH
3981 locinput++;
3982 } else {
9041c2e3 3983 while (locinput <= e
a8e8ab15
JH
3984 && UCHARAT(locinput) != c1
3985 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3986 locinput++;
3987 }
3988 count = locinput - old;
3989 }
3990 else {
1aa99e6b 3991 if (c1 == c2) {
a3b680e6 3992 STRLEN len;
872c91ae
JH
3993 /* count initialised to
3994 * utf8_distance(old, locinput) */
b2f2f093 3995 while (locinput <= e &&
872c91ae 3996 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 3997 UTF8_MAXBYTES, &len,
041457d9 3998 uniflags) != (UV)c1) {
1aa99e6b 3999 locinput += len;
b2f2f093
JH
4000 count++;
4001 }
1aa99e6b 4002 } else {
a3b680e6 4003 STRLEN len;
872c91ae
JH
4004 /* count initialised to
4005 * utf8_distance(old, locinput) */
b2f2f093 4006 while (locinput <= e) {
872c91ae 4007 UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4008 UTF8_MAXBYTES, &len,
041457d9 4009 uniflags);
eb160463 4010 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 4011 break;
b2f2f093
JH
4012 locinput += len;
4013 count++;
1aa99e6b
IH
4014 }
4015 }
0fe9bf95 4016 }
9041c2e3 4017 if (locinput > e)
0fe9bf95
IZ
4018 sayNO;
4019 /* PL_reginput == old now */
4020 if (locinput != old) {
4021 ln = 1; /* Did some */
1aa99e6b 4022 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
4023 sayNO;
4024 }
4025 /* PL_reginput == locinput now */
29d1e993 4026 TRYPAREN(paren, ln, locinput);
0fe9bf95 4027 PL_reginput = locinput; /* Could be reset... */
02db2b7b 4028 REGCP_UNWIND(lastcp);
0fe9bf95 4029 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
4030 old = locinput;
4031 if (do_utf8)
4032 locinput += UTF8SKIP(locinput);
4033 else
4034 locinput++;
b2f2f093 4035 count = 1;
0fe9bf95
IZ
4036 }
4037 }
4038 else
c277df42 4039 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
4040 UV c;
4041 if (c1 != -1000) {
4042 if (do_utf8)
872c91ae 4043 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4044 UTF8_MAXBYTES, 0,
041457d9 4045 uniflags);
1aa99e6b 4046 else
9041c2e3 4047 c = UCHARAT(PL_reginput);
2390ecbc 4048 /* If it could work, try it. */
eb160463 4049 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 4050 {
ecc99935 4051 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
4052 REGCP_UNWIND(lastcp);
4053 }
1aa99e6b 4054 }
a0d0e21e 4055 /* If it could work, try it. */
2390ecbc 4056 else if (c1 == -1000)
bbce6d69 4057 {
ecc99935 4058 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 4059 REGCP_UNWIND(lastcp);
bbce6d69 4060 }
c277df42 4061 /* Couldn't or didn't -- move forward. */
a0ed51b3 4062 PL_reginput = locinput;
a0d0e21e
LW
4063 if (regrepeat(scan, 1)) {
4064 ln++;
a0ed51b3
LW
4065 locinput = PL_reginput;
4066 }
4067 else
4633a7c4 4068 sayNO;
a0d0e21e
LW
4069 }
4070 }
4071 else {
c277df42 4072 CHECKPOINT lastcp;
a0d0e21e 4073 n = regrepeat(scan, n);
a0ed51b3 4074 locinput = PL_reginput;
22c35a8c 4075 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
7fba1cd6 4076 (OP(next) != MEOL ||
15272685
HS
4077 OP(next) == SEOL || OP(next) == EOS))
4078 {
a0d0e21e 4079 ln = n; /* why back off? */
1aeab75a
GS
4080 /* ...because $ and \Z can match before *and* after
4081 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4082 We should back off by one in this case. */
4083 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4084 ln--;
4085 }
02db2b7b 4086 REGCP_SET(lastcp);
c277df42 4087 if (paren) {
8fa7f367 4088 UV c = 0;
c277df42 4089 while (n >= ln) {
1aa99e6b
IH
4090 if (c1 != -1000) {
4091 if (do_utf8)
872c91ae 4092 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4093 UTF8_MAXBYTES, 0,
041457d9 4094 uniflags);
1aa99e6b 4095 else
9041c2e3 4096 c = UCHARAT(PL_reginput);
1aa99e6b 4097 }
c277df42 4098 /* If it could work, try it. */
eb160463 4099 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4100 {
29d1e993 4101 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4102 REGCP_UNWIND(lastcp);
c277df42
IZ
4103 }
4104 /* Couldn't or didn't -- back up. */
4105 n--;
dfe13c55 4106 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 4107 }
a0ed51b3
LW
4108 }
4109 else {
8fa7f367 4110 UV c = 0;
c277df42 4111 while (n >= ln) {
1aa99e6b
IH
4112 if (c1 != -1000) {
4113 if (do_utf8)
872c91ae 4114 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4115 UTF8_MAXBYTES, 0,
041457d9 4116 uniflags);
1aa99e6b 4117 else
9041c2e3 4118 c = UCHARAT(PL_reginput);
1aa99e6b 4119 }
c277df42 4120 /* If it could work, try it. */
eb160463 4121 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4122 {
29d1e993 4123 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4124 REGCP_UNWIND(lastcp);
c277df42
IZ
4125 }
4126 /* Couldn't or didn't -- back up. */
4127 n--;
dfe13c55 4128 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4129 }
a0d0e21e
LW
4130 }
4131 }
4633a7c4 4132 sayNO;
c277df42 4133 break;
a0d0e21e 4134 case END:
0f5d15d6
IZ
4135 if (PL_reg_call_cc) {
4136 re_cc_state *cur_call_cc = PL_reg_call_cc;
4137 CURCUR *cctmp = PL_regcc;
4138 regexp *re = PL_reg_re;
6136c704
AL
4139 CHECKPOINT lastcp;
4140 I32 tmp;
4141
4142 /* Save *all* the positions. */
4143 const CHECKPOINT cp = regcppush(0);
02db2b7b 4144 REGCP_SET(lastcp);
6136c704
AL
4145
4146 /* Restore parens of the caller. */
4147 tmp = PL_savestack_ix;
4148 PL_savestack_ix = PL_reg_call_cc->ss;
4149 regcppop();
4150 PL_savestack_ix = tmp;
4151
4152 /* Make position available to the callcc. */
4153 PL_reginput = locinput;
4154
0f5d15d6
IZ
4155 cache_re(PL_reg_call_cc->re);
4156 PL_regcc = PL_reg_call_cc->cc;
4157 PL_reg_call_cc = PL_reg_call_cc->prev;
4158 if (regmatch(cur_call_cc->node)) {
4159 PL_reg_call_cc = cur_call_cc;
4160 regcpblow(cp);
4161 sayYES;
4162 }
02db2b7b 4163 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
4164 regcppop();
4165 PL_reg_call_cc = cur_call_cc;
4166 PL_regcc = cctmp;
4167 PL_reg_re = re;
4168 cache_re(re);
4169
a3621e74 4170 DEBUG_EXECUTE_r(
0f5d15d6
IZ
4171 PerlIO_printf(Perl_debug_log,
4172 "%*s continuation failed...\n",
4173 REPORT_CODE_OFF+PL_regindent*2, "")
4174 );
7821416a 4175 sayNO_SILENT;
0f5d15d6 4176 }
7821416a 4177 if (locinput < PL_regtill) {
a3621e74 4178 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4179 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4180 PL_colors[4],
4181 (long)(locinput - PL_reg_starttry),
4182 (long)(PL_regtill - PL_reg_starttry),
4183 PL_colors[5]));
4184 sayNO_FINAL; /* Cannot match: too short. */
4185 }
4186 PL_reginput = locinput; /* put where regtry can find it */
4187 sayYES_FINAL; /* Success! */
7e5428c5 4188 case SUCCEED:
3280af22 4189 PL_reginput = locinput; /* put where regtry can find it */
7821416a 4190 sayYES_LOUD; /* Success! */
c277df42
IZ
4191 case SUSPEND:
4192 n = 1;
9fe1d20c 4193 PL_reginput = locinput;
9041c2e3 4194 goto do_ifmatch;
a0d0e21e 4195 case UNLESSM:
c277df42 4196 n = 0;
a0ed51b3 4197 if (scan->flags) {
efb30f32
HS
4198 s = HOPBACKc(locinput, scan->flags);
4199 if (!s)
4200 goto say_yes;
4201 PL_reginput = s;
a0ed51b3
LW
4202 }
4203 else
4204 PL_reginput = locinput;
c277df42
IZ
4205 goto do_ifmatch;
4206 case IFMATCH:
4207 n = 1;
a0ed51b3 4208 if (scan->flags) {
efb30f32
HS
4209 s = HOPBACKc(locinput, scan->flags);
4210 if (!s)
4211 goto say_no;
4212 PL_reginput = s;
a0ed51b3
LW
4213 }
4214 else
4215 PL_reginput = locinput;
4216
c277df42 4217 do_ifmatch:
c277df42
IZ
4218 inner = NEXTOPER(NEXTOPER(scan));
4219 if (regmatch(inner) != n) {
4220 say_no:
4221 if (logical) {
4222 logical = 0;
4223 sw = 0;
4224 goto do_longjump;
a0ed51b3
LW
4225 }
4226 else
c277df42
IZ
4227 sayNO;
4228 }
4229 say_yes:
4230 if (logical) {
4231 logical = 0;
4232 sw = 1;
4233 }
fe44a5e8 4234 if (OP(scan) == SUSPEND) {
3280af22 4235 locinput = PL_reginput;
565764a8 4236 nextchr = UCHARAT(locinput);
fe44a5e8 4237 }
c277df42
IZ
4238 /* FALL THROUGH. */
4239 case LONGJMP:
4240 do_longjump:
4241 next = scan + ARG(scan);
4242 if (next == scan)
4243 next = NULL;
a0d0e21e
LW
4244 break;
4245 default:
b900a521 4246 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4247 PTR2UV(scan), OP(scan));
cea2e8a9 4248 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4249 }
02db2b7b 4250 reenter:
a0d0e21e
LW
4251 scan = next;
4252 }
a687059c 4253
a0d0e21e
LW
4254 /*
4255 * We get here only if there's trouble -- normally "case END" is
4256 * the terminating point.
4257 */
cea2e8a9 4258 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4259 /*NOTREACHED*/
4633a7c4
LW
4260 sayNO;
4261
7821416a 4262yes_loud:
a3621e74 4263 DEBUG_EXECUTE_r(
7821416a
IZ
4264 PerlIO_printf(Perl_debug_log,
4265 "%*s %scould match...%s\n",
e4584336 4266 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4267 );
4268 goto yes;
4269yes_final:
a3621e74 4270 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4271 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4272yes:
4273#ifdef DEBUGGING
3280af22 4274 PL_regindent--;
4633a7c4 4275#endif
02db2b7b
IZ
4276
4277#if 0 /* Breaks $^R */
4278 if (unwind)
4279 regcpblow(firstcp);
4280#endif
4633a7c4
LW
4281 return 1;
4282
4283no:
a3621e74 4284 DEBUG_EXECUTE_r(
7821416a
IZ
4285 PerlIO_printf(Perl_debug_log,
4286 "%*s %sfailed...%s\n",
e4584336 4287 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4288 );
4289 goto do_no;
4290no_final:
4291do_no:
02db2b7b 4292 if (unwind) {
6136c704 4293 re_unwind_t * const uw = SSPTRt(unwind,re_unwind_t);
02db2b7b
IZ
4294
4295 switch (uw->type) {
4296 case RE_UNWIND_BRANCH:
4297 case RE_UNWIND_BRANCHJ:
4298 {
6136c704 4299 re_unwind_branch_t * const uwb = &(uw->branch);
a3b680e6 4300 const I32 lastparen = uwb->lastparen;
9041c2e3 4301
02db2b7b
IZ
4302 REGCP_UNWIND(uwb->lastcp);
4303 for (n = *PL_reglastparen; n > lastparen; n--)
4304 PL_regendp[n] = -1;
4305 *PL_reglastparen = n;
4306 scan = next = uwb->next;
9041c2e3
NIS
4307 if ( !scan ||
4308 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
4309 ? BRANCH : BRANCHJ) ) { /* Failure */
4310 unwind = uwb->prev;
4311#ifdef DEBUGGING
4312 PL_regindent--;
4313#endif
4314 goto do_no;
4315 }
4316 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4317 if ((n = (uwb->type == RE_UNWIND_BRANCH
4318 ? NEXT_OFF(next) : ARG(next))))
4319 next += n;
4320 else
4321 next = NULL; /* XXXX Needn't unwinding in this case... */
4322 uwb->next = next;
4323 next = NEXTOPER(scan);
4324 if (uwb->type == RE_UNWIND_BRANCHJ)
4325 next = NEXTOPER(next);
4326 locinput = uwb->locinput;
4327 nextchr = uwb->nextchr;
4328#ifdef DEBUGGING
4329 PL_regindent = uwb->regindent;
4330#endif
4331
4332 goto reenter;
4333 }
5f66b61c 4334 /* NOTREACHED */
02db2b7b
IZ
4335 default:
4336 Perl_croak(aTHX_ "regexp unwind memory corruption");
4337 }
5f66b61c 4338 /* NOTREACHED */
02db2b7b 4339 }
4633a7c4 4340#ifdef DEBUGGING
3280af22 4341 PL_regindent--;
4633a7c4 4342#endif
a0d0e21e 4343 return 0;
a687059c
LW
4344}
4345
4346/*
4347 - regrepeat - repeatedly match something simple, report how many
4348 */
4349/*
4350 * [This routine now assumes that it will only match on things of length 1.
4351 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4352 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4353 */
76e3520e 4354STATIC I32
a3b680e6 4355S_regrepeat(pTHX_ const regnode *p, I32 max)
a687059c 4356{
27da23d5 4357 dVAR;
a0d0e21e 4358 register char *scan;
a0d0e21e 4359 register I32 c;
3280af22 4360 register char *loceol = PL_regeol;
a0ed51b3 4361 register I32 hardcount = 0;
53c4c00c 4362 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4363
3280af22 4364 scan = PL_reginput;
faf11cac
HS
4365 if (max == REG_INFTY)
4366 max = I32_MAX;
4367 else if (max < loceol - scan)
a0d0e21e 4368 loceol = scan + max;
a0d0e21e 4369 switch (OP(p)) {
22c35a8c 4370 case REG_ANY:
1aa99e6b 4371 if (do_utf8) {
ffc61ed2 4372 loceol = PL_regeol;
1aa99e6b 4373 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4374 scan += UTF8SKIP(scan);
4375 hardcount++;
4376 }
4377 } else {
4378 while (scan < loceol && *scan != '\n')
4379 scan++;
a0ed51b3
LW
4380 }
4381 break;
ffc61ed2 4382 case SANY:
def8e4ea
JH
4383 if (do_utf8) {
4384 loceol = PL_regeol;
a0804c9e 4385 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4386 scan += UTF8SKIP(scan);
4387 hardcount++;
4388 }
4389 }
4390 else
4391 scan = loceol;
a0ed51b3 4392 break;
f33976b4
DB
4393 case CANY:
4394 scan = loceol;
4395 break;
090f7165
JH
4396 case EXACT: /* length of string is 1 */
4397 c = (U8)*STRING(p);
4398 while (scan < loceol && UCHARAT(scan) == c)
4399 scan++;
bbce6d69 4400 break;
4401 case EXACTF: /* length of string is 1 */
cd439c50 4402 c = (U8)*STRING(p);
bbce6d69 4403 while (scan < loceol &&
22c35a8c 4404 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4405 scan++;
4406 break;
4407 case EXACTFL: /* length of string is 1 */
3280af22 4408 PL_reg_flags |= RF_tainted;
cd439c50 4409 c = (U8)*STRING(p);
bbce6d69 4410 while (scan < loceol &&
22c35a8c 4411 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4412 scan++;
4413 break;
4414 case ANYOF:
ffc61ed2
JH
4415 if (do_utf8) {
4416 loceol = PL_regeol;
cfc92286
JH
4417 while (hardcount < max && scan < loceol &&
4418 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4419 scan += UTF8SKIP(scan);
4420 hardcount++;
4421 }
4422 } else {
7d3e948e 4423 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4424 scan++;
4425 }
a0d0e21e
LW
4426 break;
4427 case ALNUM:
1aa99e6b 4428 if (do_utf8) {
ffc61ed2 4429 loceol = PL_regeol;
1a4fad37 4430 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4431 while (hardcount < max && scan < loceol &&
3568d838 4432 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4433 scan += UTF8SKIP(scan);
4434 hardcount++;
4435 }
4436 } else {
4437 while (scan < loceol && isALNUM(*scan))
4438 scan++;
a0ed51b3
LW
4439 }
4440 break;
bbce6d69 4441 case ALNUML:
3280af22 4442 PL_reg_flags |= RF_tainted;
1aa99e6b 4443 if (do_utf8) {
ffc61ed2 4444 loceol = PL_regeol;
1aa99e6b
IH
4445 while (hardcount < max && scan < loceol &&
4446 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4447 scan += UTF8SKIP(scan);
4448 hardcount++;
4449 }
4450 } else {
4451 while (scan < loceol && isALNUM_LC(*scan))
4452 scan++;
a0ed51b3
LW
4453 }
4454 break;
a0d0e21e 4455 case NALNUM:
1aa99e6b 4456 if (do_utf8) {
ffc61ed2 4457 loceol = PL_regeol;
1a4fad37 4458 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4459 while (hardcount < max && scan < loceol &&
3568d838 4460 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4461 scan += UTF8SKIP(scan);
4462 hardcount++;
4463 }
4464 } else {
4465 while (scan < loceol && !isALNUM(*scan))
4466 scan++;
a0ed51b3
LW
4467 }
4468 break;
bbce6d69 4469 case NALNUML:
3280af22 4470 PL_reg_flags |= RF_tainted;
1aa99e6b 4471 if (do_utf8) {
ffc61ed2 4472 loceol = PL_regeol;
1aa99e6b
IH
4473 while (hardcount < max && scan < loceol &&
4474 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4475 scan += UTF8SKIP(scan);
4476 hardcount++;
4477 }
4478 } else {
4479 while (scan < loceol && !isALNUM_LC(*scan))
4480 scan++;
a0ed51b3
LW
4481 }
4482 break;
a0d0e21e 4483 case SPACE:
1aa99e6b 4484 if (do_utf8) {
ffc61ed2 4485 loceol = PL_regeol;
1a4fad37 4486 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4487 while (hardcount < max && scan < loceol &&
3568d838
JH
4488 (*scan == ' ' ||
4489 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4490 scan += UTF8SKIP(scan);
4491 hardcount++;
4492 }
4493 } else {
4494 while (scan < loceol && isSPACE(*scan))
4495 scan++;
a0ed51b3
LW
4496 }
4497 break;
bbce6d69 4498 case SPACEL:
3280af22 4499 PL_reg_flags |= RF_tainted;
1aa99e6b 4500 if (do_utf8) {
ffc61ed2 4501 loceol = PL_regeol;
1aa99e6b 4502 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4503 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4504 scan += UTF8SKIP(scan);
4505 hardcount++;
4506 }
4507 } else {
4508 while (scan < loceol && isSPACE_LC(*scan))
4509 scan++;
a0ed51b3
LW
4510 }
4511 break;
a0d0e21e 4512 case NSPACE:
1aa99e6b 4513 if (do_utf8) {
ffc61ed2 4514 loceol = PL_regeol;
1a4fad37 4515 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4516 while (hardcount < max && scan < loceol &&
3568d838
JH
4517 !(*scan == ' ' ||
4518 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4519 scan += UTF8SKIP(scan);
4520 hardcount++;
4521 }
4522 } else {
4523 while (scan < loceol && !isSPACE(*scan))
4524 scan++;
4525 break;
a0ed51b3 4526 }
bbce6d69 4527 case NSPACEL:
3280af22 4528 PL_reg_flags |= RF_tainted;
1aa99e6b 4529 if (do_utf8) {
ffc61ed2 4530 loceol = PL_regeol;
1aa99e6b 4531 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4532 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4533 scan += UTF8SKIP(scan);
4534 hardcount++;
4535 }
4536 } else {
4537 while (scan < loceol && !isSPACE_LC(*scan))
4538 scan++;
a0ed51b3
LW
4539 }
4540 break;
a0d0e21e 4541 case DIGIT:
1aa99e6b 4542 if (do_utf8) {
ffc61ed2 4543 loceol = PL_regeol;
1a4fad37 4544 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4545 while (hardcount < max && scan < loceol &&
3568d838 4546 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4547 scan += UTF8SKIP(scan);
4548 hardcount++;
4549 }
4550 } else {
4551 while (scan < loceol && isDIGIT(*scan))
4552 scan++;
a0ed51b3
LW
4553 }
4554 break;
a0d0e21e 4555 case NDIGIT:
1aa99e6b 4556 if (do_utf8) {
ffc61ed2 4557 loceol = PL_regeol;
1a4fad37 4558 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4559 while (hardcount < max && scan < loceol &&
3568d838 4560 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4561 scan += UTF8SKIP(scan);
4562 hardcount++;
4563 }
4564 } else {
4565 while (scan < loceol && !isDIGIT(*scan))
4566 scan++;
a0ed51b3
LW
4567 }
4568 break;
a0d0e21e
LW
4569 default: /* Called on something of 0 width. */
4570 break; /* So match right here or not at all. */
4571 }
a687059c 4572
a0ed51b3
LW
4573 if (hardcount)
4574 c = hardcount;
4575 else
4576 c = scan - PL_reginput;
3280af22 4577 PL_reginput = scan;
a687059c 4578
a3621e74 4579 DEBUG_r({
ab74612d 4580 SV *re_debug_flags = NULL;
6136c704 4581 SV * const prop = sv_newmortal();
a3621e74
YO
4582 GET_RE_DEBUG_FLAGS;
4583 DEBUG_EXECUTE_r({
c277df42 4584 regprop(prop, p);
9041c2e3
NIS
4585 PerlIO_printf(Perl_debug_log,
4586 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3f7c398e 4587 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
c277df42 4588 });
a3621e74 4589 });
9041c2e3 4590
a0d0e21e 4591 return(c);
a687059c
LW
4592}
4593
4594/*
c277df42 4595 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4596 *
0e788c72 4597 * The repeater is supposed to have constant non-zero length.
c277df42
IZ
4598 */
4599
76e3520e 4600STATIC I32
cea2e8a9 4601S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4602{
97aff369 4603 dVAR;
bd61b366 4604 register char *scan = NULL;
c277df42 4605 register char *start;
3280af22 4606 register char *loceol = PL_regeol;
a0ed51b3 4607 I32 l = 0;
708e3b05 4608 I32 count = 0, res = 1;
a0ed51b3
LW
4609
4610 if (!max)
4611 return 0;
c277df42 4612
3280af22 4613 start = PL_reginput;
53c4c00c 4614 if (PL_reg_match_utf8) {
708e3b05 4615 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4616 if (!count++) {
4617 l = 0;
4618 while (start < PL_reginput) {
4619 l++;
4620 start += UTF8SKIP(start);
4621 }
4622 *lp = l;
4623 if (l == 0)
4624 return max;
4625 }
4626 if (count == max)
4627 return count;
4628 }
4629 }
4630 else {
708e3b05 4631 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4632 if (!count++) {
4633 *lp = l = PL_reginput - start;
4634 if (max != REG_INFTY && l*max < loceol - scan)
4635 loceol = scan + l*max;
4636 if (l == 0)
4637 return max;
c277df42
IZ
4638 }
4639 }
4640 }
708e3b05 4641 if (!res)
3280af22 4642 PL_reginput = scan;
9041c2e3 4643
a0ed51b3 4644 return count;
c277df42
IZ
4645}
4646
4647/*
ffc61ed2
JH
4648- regclass_swash - prepare the utf8 swash
4649*/
4650
4651SV *
a3b680e6 4652Perl_regclass_swash(pTHX_ register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4653{
97aff369 4654 dVAR;
9e55ce06
JH
4655 SV *sw = NULL;
4656 SV *si = NULL;
4657 SV *alt = NULL;
ffc61ed2
JH
4658
4659 if (PL_regdata && PL_regdata->count) {
a3b680e6 4660 const U32 n = ARG(node);
ffc61ed2
JH
4661
4662 if (PL_regdata->what[n] == 's') {
890ce7af
AL
4663 SV * const rv = (SV*)PL_regdata->data[n];
4664 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4665 SV **const ary = AvARRAY(av);
9e55ce06 4666 SV **a, **b;
9041c2e3 4667
711a919c 4668 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4669 * documentation of these array elements. */
4670
b11f357e 4671 si = *ary;
8f7f7219 4672 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4673 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4674
ffc61ed2
JH
4675 if (a)
4676 sw = *a;
4677 else if (si && doinit) {
4678 sw = swash_init("utf8", "", si, 1, 0);
4679 (void)av_store(av, 1, sw);
4680 }
9e55ce06
JH
4681 if (b)
4682 alt = *b;
ffc61ed2
JH
4683 }
4684 }
4685
9e55ce06
JH
4686 if (listsvp)
4687 *listsvp = si;
4688 if (altsvp)
4689 *altsvp = alt;
ffc61ed2
JH
4690
4691 return sw;
4692}
4693
4694/*
ba7b4546 4695 - reginclass - determine if a character falls into a character class
832705d4
JH
4696
4697 The n is the ANYOF regnode, the p is the target string, lenp
4698 is pointer to the maximum length of how far to go in the p
4699 (if the lenp is zero, UTF8SKIP(p) is used),
4700 do_utf8 tells whether the target string is in UTF-8.
4701
bbce6d69 4702 */
4703
76e3520e 4704STATIC bool
a3b680e6 4705S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4706{
27da23d5 4707 dVAR;
a3b680e6 4708 const char flags = ANYOF_FLAGS(n);
bbce6d69 4709 bool match = FALSE;
cc07378b 4710 UV c = *p;
ae9ddab8 4711 STRLEN len = 0;
9e55ce06 4712 STRLEN plen;
1aa99e6b 4713
19f67299
TS
4714 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4715 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4716 ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY :
4717 UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY);
4718 if (len == (STRLEN)-1)
4719 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4720 }
bbce6d69 4721
0f0076b4 4722 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4723 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4724 if (lenp)
4725 *lenp = 0;
ffc61ed2 4726 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4727 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4728 match = TRUE;
bbce6d69 4729 }
3568d838 4730 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4731 match = TRUE;
ffc61ed2 4732 if (!match) {
9e55ce06 4733 AV *av;
890ce7af 4734 SV * const sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4735
4736 if (sw) {
3568d838 4737 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4738 match = TRUE;
4739 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4740 if (!match && lenp && av) {
4741 I32 i;
9e55ce06 4742 for (i = 0; i <= av_len(av); i++) {
890ce7af 4743 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4744 STRLEN len;
890ce7af 4745 const char * const s = SvPV_const(sv, len);
9e55ce06 4746
061b10df 4747 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4748 *lenp = len;
4749 match = TRUE;
4750 break;
4751 }
4752 }
4753 }
4754 if (!match) {
89ebb4a3 4755 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4756 STRLEN tmplen;
4757
9e55ce06
JH
4758 to_utf8_fold(p, tmpbuf, &tmplen);
4759 if (swash_fetch(sw, tmpbuf, do_utf8))
4760 match = TRUE;
4761 }
ffc61ed2
JH
4762 }
4763 }
bbce6d69 4764 }
9e55ce06 4765 if (match && lenp && *lenp == 0)
0f0076b4 4766 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4767 }
1aa99e6b 4768 if (!match && c < 256) {
ffc61ed2
JH
4769 if (ANYOF_BITMAP_TEST(n, c))
4770 match = TRUE;
4771 else if (flags & ANYOF_FOLD) {
eb160463 4772 U8 f;
a0ed51b3 4773
ffc61ed2
JH
4774 if (flags & ANYOF_LOCALE) {
4775 PL_reg_flags |= RF_tainted;
4776 f = PL_fold_locale[c];
4777 }
4778 else
4779 f = PL_fold[c];
4780 if (f != c && ANYOF_BITMAP_TEST(n, f))
4781 match = TRUE;
4782 }
4783
4784 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4785 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4786 if (
4787 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4788 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4789 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4790 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4791 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4792 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4793 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4794 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4795 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4796 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4797 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4798 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4799 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4800 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4801 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4802 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4803 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4804 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4805 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4806 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4807 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4808 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4809 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4810 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4811 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4812 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4813 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4814 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4815 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4816 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4817 ) /* How's that for a conditional? */
4818 {
4819 match = TRUE;
4820 }
a0ed51b3 4821 }
a0ed51b3
LW
4822 }
4823
a0ed51b3
LW
4824 return (flags & ANYOF_INVERT) ? !match : match;
4825}
161b471a 4826
dfe13c55 4827STATIC U8 *
cea2e8a9 4828S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4829{
97aff369 4830 dVAR;
5f66b61c 4831 return S_reghop3(s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
1aa99e6b
IH
4832}
4833
4834STATIC U8 *
5f66b61c 4835S_reghop3(U8 *s, I32 off, U8* lim)
9041c2e3 4836{
97aff369 4837 dVAR;
a0ed51b3 4838 if (off >= 0) {
1aa99e6b 4839 while (off-- && s < lim) {
ffc61ed2 4840 /* XXX could check well-formedness here */
a0ed51b3 4841 s += UTF8SKIP(s);
ffc61ed2 4842 }
a0ed51b3
LW
4843 }
4844 else {
4845 while (off++) {
1aa99e6b 4846 if (s > lim) {
a0ed51b3 4847 s--;
ffc61ed2 4848 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4849 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4850 s--;
ffc61ed2
JH
4851 }
4852 /* XXX could check well-formedness here */
a0ed51b3
LW
4853 }
4854 }
4855 }
4856 return s;
4857}
161b471a 4858
dfe13c55 4859STATIC U8 *
1aa99e6b 4860S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4861{
97aff369 4862 dVAR;
5f66b61c 4863 return S_reghopmaybe3(s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
1aa99e6b
IH
4864}
4865
4866STATIC U8 *
5f66b61c 4867S_reghopmaybe3(U8* s, I32 off, U8* lim)
a0ed51b3 4868{
97aff369 4869 dVAR;
a0ed51b3 4870 if (off >= 0) {
1aa99e6b 4871 while (off-- && s < lim) {
ffc61ed2 4872 /* XXX could check well-formedness here */
a0ed51b3 4873 s += UTF8SKIP(s);
ffc61ed2 4874 }
a0ed51b3
LW
4875 if (off >= 0)
4876 return 0;
4877 }
4878 else {
4879 while (off++) {
1aa99e6b 4880 if (s > lim) {
a0ed51b3 4881 s--;
ffc61ed2 4882 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4883 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4884 s--;
ffc61ed2
JH
4885 }
4886 /* XXX could check well-formedness here */
a0ed51b3
LW
4887 }
4888 else
4889 break;
4890 }
4891 if (off <= 0)
4892 return 0;
4893 }
4894 return s;
4895}
51371543 4896
51371543 4897static void
acfe0abc 4898restore_pos(pTHX_ void *arg)
51371543 4899{
97aff369 4900 dVAR;
9d4ba2ae 4901 PERL_UNUSED_ARG(arg);
51371543
GS
4902 if (PL_reg_eval_set) {
4903 if (PL_reg_oldsaved) {
4904 PL_reg_re->subbeg = PL_reg_oldsaved;
4905 PL_reg_re->sublen = PL_reg_oldsavedlen;
f8c7b90f 4906#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
4907 PL_reg_re->saved_copy = PL_nrs;
4908#endif
51371543
GS
4909 RX_MATCH_COPIED_on(PL_reg_re);
4910 }
4911 PL_reg_magic->mg_len = PL_reg_oldpos;
4912 PL_reg_eval_set = 0;
4913 PL_curpm = PL_reg_oldcurpm;
4914 }
4915}
33b8afdf
JH
4916
4917STATIC void
4918S_to_utf8_substr(pTHX_ register regexp *prog)
4919{
33b8afdf 4920 if (prog->float_substr && !prog->float_utf8) {
9d4ba2ae 4921 SV* sv;
f2b990bf 4922 prog->float_utf8 = sv = newSVsv(prog->float_substr);
33b8afdf
JH
4923 sv_utf8_upgrade(sv);
4924 if (SvTAIL(prog->float_substr))
4925 SvTAIL_on(sv);
4926 if (prog->float_substr == prog->check_substr)
4927 prog->check_utf8 = sv;
4928 }
4929 if (prog->anchored_substr && !prog->anchored_utf8) {
9d4ba2ae 4930 SV* sv;
f2b990bf 4931 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
33b8afdf
JH
4932 sv_utf8_upgrade(sv);
4933 if (SvTAIL(prog->anchored_substr))
4934 SvTAIL_on(sv);
4935 if (prog->anchored_substr == prog->check_substr)
4936 prog->check_utf8 = sv;
4937 }
4938}
4939
4940STATIC void
4941S_to_byte_substr(pTHX_ register regexp *prog)
4942{
97aff369 4943 dVAR;
33b8afdf 4944 if (prog->float_utf8 && !prog->float_substr) {
9d4ba2ae 4945 SV* sv;
f2b990bf 4946 prog->float_substr = sv = newSVsv(prog->float_utf8);
33b8afdf
JH
4947 if (sv_utf8_downgrade(sv, TRUE)) {
4948 if (SvTAIL(prog->float_utf8))
4949 SvTAIL_on(sv);
4950 } else {
4951 SvREFCNT_dec(sv);
4952 prog->float_substr = sv = &PL_sv_undef;
4953 }
4954 if (prog->float_utf8 == prog->check_utf8)
4955 prog->check_substr = sv;
4956 }
4957 if (prog->anchored_utf8 && !prog->anchored_substr) {
9d4ba2ae 4958 SV* sv;
f2b990bf 4959 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
33b8afdf
JH
4960 if (sv_utf8_downgrade(sv, TRUE)) {
4961 if (SvTAIL(prog->anchored_utf8))
4962 SvTAIL_on(sv);
4963 } else {
4964 SvREFCNT_dec(sv);
4965 prog->anchored_substr = sv = &PL_sv_undef;
4966 }
4967 if (prog->anchored_utf8 == prog->check_utf8)
4968 prog->check_substr = sv;
4969 }
4970}
66610fdd
RGS
4971
4972/*
4973 * Local variables:
4974 * c-indentation-style: bsd
4975 * c-basic-offset: 4
4976 * indent-tabs-mode: t
4977 * End:
4978 *
37442d52
RGS
4979 * ex: set ts=8 sts=4 sw=4 noet:
4980 */