This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make PERL_MEM_LOG more portable.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
f2b990bf 81 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
1a4fad37
AL
142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
a3b680e6 181 const int retval = PL_savestack_ix;
b1ce53c5 182#define REGCP_PAREN_ELEMS 4
a3b680e6 183 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
184 int p;
185
e49a9654
IH
186 if (paren_elems_to_push < 0)
187 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
188
a01268b5 189#define REGCP_OTHER_ELEMS 6
4b3c1a47 190 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 191 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 192/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
193 SSPUSHINT(PL_regendp[p]);
194 SSPUSHINT(PL_regstartp[p]);
3280af22 195 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
196 SSPUSHINT(p);
197 }
b1ce53c5 198/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
199 SSPUSHINT(PL_regsize);
200 SSPUSHINT(*PL_reglastparen);
a01268b5 201 SSPUSHINT(*PL_reglastcloseparen);
3280af22 202 SSPUSHPTR(PL_reginput);
41123dfd
JH
203#define REGCP_FRAME_ELEMS 2
204/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
205 * are needed for the regexp context stack bookkeeping. */
206 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 207 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 208
a0d0e21e
LW
209 return retval;
210}
211
c277df42 212/* These are needed since we do not localize EVAL nodes: */
a3621e74 213# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 214 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 215 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 216
a3621e74 217# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 218 PerlIO_printf(Perl_debug_log, \
faccc32b 219 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 220 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 221
76e3520e 222STATIC char *
cea2e8a9 223S_regcppop(pTHX)
a0d0e21e 224{
b1ce53c5 225 I32 i;
a0d0e21e
LW
226 U32 paren = 0;
227 char *input;
b1ce53c5 228
a3621e74
YO
229 GET_RE_DEBUG_FLAGS_DECL;
230
b1ce53c5 231 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 232 i = SSPOPINT;
b1ce53c5
JH
233 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
234 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 235 input = (char *) SSPOPPTR;
a01268b5 236 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
237 *PL_reglastparen = SSPOPINT;
238 PL_regsize = SSPOPINT;
b1ce53c5
JH
239
240 /* Now restore the parentheses context. */
41123dfd
JH
241 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
242 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 243 I32 tmps;
a0d0e21e 244 paren = (U32)SSPOPINT;
3280af22 245 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
246 PL_regstartp[paren] = SSPOPINT;
247 tmps = SSPOPINT;
3280af22
NIS
248 if (paren <= *PL_reglastparen)
249 PL_regendp[paren] = tmps;
a3621e74 250 DEBUG_EXECUTE_r(
c3464db5 251 PerlIO_printf(Perl_debug_log,
b900a521 252 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 253 (UV)paren, (IV)PL_regstartp[paren],
b900a521 254 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 255 (IV)PL_regendp[paren],
3280af22 256 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 257 );
a0d0e21e 258 }
a3621e74 259 DEBUG_EXECUTE_r(
eb160463 260 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 261 PerlIO_printf(Perl_debug_log,
faccc32b
JH
262 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
263 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
264 }
265 );
daf18116 266#if 1
dafc8851
JH
267 /* It would seem that the similar code in regtry()
268 * already takes care of this, and in fact it is in
269 * a better location to since this code can #if 0-ed out
270 * but the code in regtry() is needed or otherwise tests
271 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
272 * (as of patchlevel 7877) will fail. Then again,
273 * this code seems to be necessary or otherwise
274 * building DynaLoader will fail:
275 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
276 * --jhi */
eb160463
GS
277 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
278 if ((I32)paren > PL_regsize)
cf93c79d
IZ
279 PL_regstartp[paren] = -1;
280 PL_regendp[paren] = -1;
a0d0e21e 281 }
dafc8851 282#endif
a0d0e21e
LW
283 return input;
284}
285
0f5d15d6 286STATIC char *
cea2e8a9 287S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6 288{
a3b680e6 289 const I32 tmp = PL_savestack_ix;
0f5d15d6
IZ
290
291 PL_savestack_ix = ss;
292 regcppop();
293 PL_savestack_ix = tmp;
942e002e 294 return Nullch;
0f5d15d6
IZ
295}
296
297typedef struct re_cc_state
298{
299 I32 ss;
300 regnode *node;
301 struct re_cc_state *prev;
302 CURCUR *cc;
303 regexp *re;
304} re_cc_state;
305
02db2b7b 306#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 307
29d1e993
HS
308#define TRYPAREN(paren, n, input) { \
309 if (paren) { \
310 if (n) { \
311 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
312 PL_regendp[paren] = input - PL_bostr; \
313 } \
314 else \
315 PL_regendp[paren] = -1; \
316 } \
317 if (regmatch(next)) \
318 sayYES; \
319 if (paren && n) \
320 PL_regendp[paren] = -1; \
321}
322
323
a687059c 324/*
e50aee73 325 * pregexec and friends
a687059c
LW
326 */
327
328/*
c277df42 329 - pregexec - match a regexp against a string
a687059c 330 */
c277df42 331I32
864dbfa3 332Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 333 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
334/* strend: pointer to null at end of string */
335/* strbeg: real beginning of string */
336/* minend: end of match must be >=minend after stringarg. */
337/* nosave: For optimizations. */
338{
339 return
9041c2e3 340 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
341 nosave ? 0 : REXEC_COPY_STR);
342}
0f5d15d6
IZ
343
344STATIC void
cea2e8a9 345S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
346{
347 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
348#ifdef DEBUGGING
349 PL_regprogram = prog->program;
350#endif
351 PL_regnpar = prog->nparens;
9041c2e3
NIS
352 PL_regdata = prog->data;
353 PL_reg_re = prog;
0f5d15d6 354}
22e551b9 355
9041c2e3 356/*
cad2e5aa
JH
357 * Need to implement the following flags for reg_anch:
358 *
359 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
360 * USE_INTUIT_ML
361 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
362 * INTUIT_AUTORITATIVE_ML
363 * INTUIT_ONCE_NOML - Intuit can match in one location only.
364 * INTUIT_ONCE_ML
365 *
366 * Another flag for this function: SECOND_TIME (so that float substrs
367 * with giant delta may be not rechecked).
368 */
369
370/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
371
3f7c398e 372/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
373 Otherwise, only SvCUR(sv) is used to get strbeg. */
374
375/* XXXX We assume that strpos is strbeg unless sv. */
376
6eb5f6b9
JH
377/* XXXX Some places assume that there is a fixed substring.
378 An update may be needed if optimizer marks as "INTUITable"
379 RExen without fixed substrings. Similarly, it is assumed that
380 lengths of all the strings are no more than minlen, thus they
381 cannot come from lookahead.
382 (Or minlen should take into account lookahead.) */
383
2c2d71f5
JH
384/* A failure to find a constant substring means that there is no need to make
385 an expensive call to REx engine, thus we celebrate a failure. Similarly,
386 finding a substring too deep into the string means that less calls to
30944b6d
IZ
387 regtry() should be needed.
388
389 REx compiler's optimizer found 4 possible hints:
390 a) Anchored substring;
391 b) Fixed substring;
392 c) Whether we are anchored (beginning-of-line or \G);
393 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 394 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
395 string which does not contradict any of them.
396 */
2c2d71f5 397
6eb5f6b9
JH
398/* Most of decisions we do here should have been done at compile time.
399 The nodes of the REx which we used for the search should have been
400 deleted from the finite automaton. */
401
cad2e5aa
JH
402char *
403Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
404 char *strend, U32 flags, re_scream_pos_data *data)
405{
b7953727 406 register I32 start_shift = 0;
cad2e5aa 407 /* Should be nonnegative! */
b7953727 408 register I32 end_shift = 0;
2c2d71f5
JH
409 register char *s;
410 register SV *check;
a1933d95 411 char *strbeg;
cad2e5aa 412 char *t;
a3b680e6 413 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 414 I32 ml_anch;
6eb5f6b9 415 register char *other_last = Nullch; /* other substr checked before this */
b7953727 416 char *check_at = Nullch; /* check substr found at this pos */
1df70142 417 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 418#ifdef DEBUGGING
890ce7af
AL
419 const char * const i_strpos = strpos;
420 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 421#endif
a3621e74
YO
422
423 GET_RE_DEBUG_FLAGS_DECL;
424
a30b2f1f 425 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 426
b8d68ded 427 if (prog->reganch & ROPT_UTF8) {
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
429 "UTF-8 regex...\n"));
430 PL_reg_flags |= RF_utf8;
431 }
432
a3621e74 433 DEBUG_EXECUTE_r({
1df70142 434 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
435 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
436 strpos;
1df70142 437 const int len = PL_reg_match_utf8 ?
b8d68ded 438 strlen(s) : strend - strpos;
2a782b5b
JH
439 if (!PL_colorset)
440 reginitcolors();
b8d68ded 441 if (PL_reg_match_utf8)
a3621e74 442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 443 "UTF-8 target...\n"));
2a782b5b 444 PerlIO_printf(Perl_debug_log,
a0288114 445 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 446 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
447 prog->precomp,
448 PL_colors[1],
449 (strlen(prog->precomp) > 60 ? "..." : ""),
450 PL_colors[0],
451 (int)(len > 60 ? 60 : len),
452 s, PL_colors[1],
453 (len > 60 ? "..." : "")
454 );
455 });
cad2e5aa 456
c344f387
JH
457 /* CHR_DIST() would be more correct here but it makes things slow. */
458 if (prog->minlen > strend - strpos) {
a3621e74 459 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 460 "String too short... [re_intuit_start]\n"));
cad2e5aa 461 goto fail;
2c2d71f5 462 }
a1933d95 463 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 464 PL_regeol = strend;
33b8afdf
JH
465 if (do_utf8) {
466 if (!prog->check_utf8 && prog->check_substr)
467 to_utf8_substr(prog);
468 check = prog->check_utf8;
469 } else {
470 if (!prog->check_substr && prog->check_utf8)
471 to_byte_substr(prog);
472 check = prog->check_substr;
473 }
474 if (check == &PL_sv_undef) {
a3621e74 475 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
476 "Non-utf string cannot match utf check string\n"));
477 goto fail;
478 }
2c2d71f5 479 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
480 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
481 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 482 && !multiline ) ); /* Check after \n? */
cad2e5aa 483
7e25d62c
JH
484 if (!ml_anch) {
485 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
486 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 487 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
488 && sv && !SvROK(sv)
489 && (strpos != strbeg)) {
a3621e74 490 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
491 goto fail;
492 }
493 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 494 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 495 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
496 I32 slen;
497
1aa99e6b 498 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
499 if (SvTAIL(check)) {
500 slen = SvCUR(check); /* >= 1 */
cad2e5aa 501
9041c2e3 502 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 503 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 504 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 505 goto fail_finish;
cad2e5aa
JH
506 }
507 /* Now should match s[0..slen-2] */
508 slen--;
3f7c398e 509 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 510 || (slen > 1
3f7c398e 511 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 512 report_neq:
a3621e74 513 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
514 goto fail_finish;
515 }
cad2e5aa 516 }
3f7c398e 517 else if (*SvPVX_const(check) != *s
653099ff 518 || ((slen = SvCUR(check)) > 1
3f7c398e 519 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5
JH
520 goto report_neq;
521 goto success_at_start;
7e25d62c 522 }
cad2e5aa 523 }
2c2d71f5 524 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 525 s = strpos;
2c2d71f5 526 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 527 end_shift = prog->minlen - start_shift -
653099ff 528 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 529 if (!ml_anch) {
a3b680e6 530 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 531 - (SvTAIL(check) != 0);
a3b680e6 532 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
533
534 if (end_shift < eshift)
535 end_shift = eshift;
536 }
cad2e5aa 537 }
2c2d71f5 538 else { /* Can match at random position */
cad2e5aa
JH
539 ml_anch = 0;
540 s = strpos;
2c2d71f5
JH
541 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
542 /* Should be nonnegative! */
543 end_shift = prog->minlen - start_shift -
653099ff 544 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
545 }
546
2c2d71f5 547#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 548 if (end_shift < 0)
6bbae5e6 549 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
550#endif
551
2c2d71f5
JH
552 restart:
553 /* Find a possible match in the region s..strend by looking for
554 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 555 if (flags & REXEC_SCREAM) {
cad2e5aa 556 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 557 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 558
2c2d71f5
JH
559 if (PL_screamfirst[BmRARE(check)] >= 0
560 || ( BmRARE(check) == '\n'
561 && (BmPREVIOUS(check) == SvCUR(check) - 1)
562 && SvTAIL(check) ))
9041c2e3 563 s = screaminstr(sv, check,
2c2d71f5 564 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 565 else
2c2d71f5 566 goto fail_finish;
4addbd3b
HS
567 /* we may be pointing at the wrong string */
568 if (s && RX_MATCH_COPIED(prog))
3f7c398e 569 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
570 if (data)
571 *data->scream_olds = s;
572 }
f33976b4 573 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
574 s = fbm_instr((U8*)(s + start_shift),
575 (U8*)(strend - end_shift),
7fba1cd6 576 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 577 else
1aa99e6b
IH
578 s = fbm_instr(HOP3(s, start_shift, strend),
579 HOP3(strend, -end_shift, strbeg),
7fba1cd6 580 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
581
582 /* Update the count-of-usability, remove useless subpatterns,
583 unshift s. */
2c2d71f5 584
a0288114 585 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 586 (s ? "Found" : "Did not find"),
33b8afdf 587 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 588 PL_colors[0],
7b0972df 589 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 590 SvPVX_const(check),
2c2d71f5
JH
591 PL_colors[1], (SvTAIL(check) ? "$" : ""),
592 (s ? " at offset " : "...\n") ) );
593
594 if (!s)
595 goto fail_finish;
596
6eb5f6b9
JH
597 check_at = s;
598
2c2d71f5 599 /* Finish the diagnostic message */
a3621e74 600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
601
602 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
603 Start with the other substr.
604 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 605 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
606 *always* match. Probably should be marked during compile...
607 Probably it is right to do no SCREAM here...
608 */
609
33b8afdf 610 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 611 /* Take into account the "other" substring. */
2c2d71f5
JH
612 /* XXXX May be hopelessly wrong for UTF... */
613 if (!other_last)
6eb5f6b9 614 other_last = strpos;
33b8afdf 615 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
616 do_other_anchored:
617 {
890ce7af
AL
618 char * const last = HOP3c(s, -start_shift, strbeg);
619 char *last1, *last2;
2c2d71f5 620 char *s1 = s;
33b8afdf 621 SV* must;
2c2d71f5 622
2c2d71f5
JH
623 t = s - prog->check_offset_max;
624 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 625 && (!do_utf8
1aa99e6b 626 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 627 && t > strpos)))
30944b6d 628 /* EMPTY */;
2c2d71f5
JH
629 else
630 t = strpos;
1aa99e6b 631 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
632 if (t < other_last) /* These positions already checked */
633 t = other_last;
1aa99e6b 634 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
635 if (last < last1)
636 last1 = last;
637 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
638 /* On end-of-str: see comment below. */
33b8afdf
JH
639 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
640 if (must == &PL_sv_undef) {
641 s = (char*)NULL;
a3621e74 642 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
643 }
644 else
645 s = fbm_instr(
646 (unsigned char*)t,
647 HOP3(HOP3(last1, prog->anchored_offset, strend)
648 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
649 must,
7fba1cd6 650 multiline ? FBMrf_MULTILINE : 0
33b8afdf 651 );
a3621e74 652 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 653 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
654 (s ? "Found" : "Contradicts"),
655 PL_colors[0],
33b8afdf
JH
656 (int)(SvCUR(must)
657 - (SvTAIL(must)!=0)),
3f7c398e 658 SvPVX_const(must),
33b8afdf 659 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
660 if (!s) {
661 if (last1 >= last2) {
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
663 ", giving up...\n"));
664 goto fail_finish;
665 }
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 667 ", trying floating at offset %ld...\n",
1aa99e6b
IH
668 (long)(HOP3c(s1, 1, strend) - i_strpos)));
669 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
670 s = HOP3c(last, 1, strend);
2c2d71f5
JH
671 goto restart;
672 }
673 else {
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 675 (long)(s - i_strpos)));
1aa99e6b
IH
676 t = HOP3c(s, -prog->anchored_offset, strbeg);
677 other_last = HOP3c(s, 1, strend);
30944b6d 678 s = s1;
2c2d71f5
JH
679 if (t == strpos)
680 goto try_at_start;
2c2d71f5
JH
681 goto try_at_offset;
682 }
30944b6d 683 }
2c2d71f5
JH
684 }
685 else { /* Take into account the floating substring. */
33b8afdf
JH
686 char *last, *last1;
687 char *s1 = s;
688 SV* must;
689
690 t = HOP3c(s, -start_shift, strbeg);
691 last1 = last =
692 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
693 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
694 last = HOP3c(t, prog->float_max_offset, strend);
695 s = HOP3c(t, prog->float_min_offset, strend);
696 if (s < other_last)
697 s = other_last;
2c2d71f5 698 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
699 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
700 /* fbm_instr() takes into account exact value of end-of-str
701 if the check is SvTAIL(ed). Since false positives are OK,
702 and end-of-str is not later than strend we are OK. */
703 if (must == &PL_sv_undef) {
704 s = (char*)NULL;
a3621e74 705 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
706 }
707 else
2c2d71f5 708 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
709 (unsigned char*)last + SvCUR(must)
710 - (SvTAIL(must)!=0),
7fba1cd6 711 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 712 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
713 (s ? "Found" : "Contradicts"),
714 PL_colors[0],
715 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 716 SvPVX_const(must),
33b8afdf
JH
717 PL_colors[1], (SvTAIL(must) ? "$" : "")));
718 if (!s) {
719 if (last1 == last) {
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
721 ", giving up...\n"));
722 goto fail_finish;
2c2d71f5 723 }
a3621e74 724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
725 ", trying anchored starting at offset %ld...\n",
726 (long)(s1 + 1 - i_strpos)));
727 other_last = last;
728 s = HOP3c(t, 1, strend);
729 goto restart;
730 }
731 else {
a3621e74 732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
733 (long)(s - i_strpos)));
734 other_last = s; /* Fix this later. --Hugo */
735 s = s1;
736 if (t == strpos)
737 goto try_at_start;
738 goto try_at_offset;
739 }
2c2d71f5 740 }
cad2e5aa 741 }
2c2d71f5
JH
742
743 t = s - prog->check_offset_max;
2c2d71f5 744 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 745 && (!do_utf8
1aa99e6b
IH
746 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
747 && t > strpos))) {
2c2d71f5
JH
748 /* Fixed substring is found far enough so that the match
749 cannot start at strpos. */
750 try_at_offset:
cad2e5aa 751 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
752 /* Eventually fbm_*() should handle this, but often
753 anchored_offset is not 0, so this check will not be wasted. */
754 /* XXXX In the code below we prefer to look for "^" even in
755 presence of anchored substrings. And we search even
756 beyond the found float position. These pessimizations
757 are historical artefacts only. */
758 find_anchor:
2c2d71f5 759 while (t < strend - prog->minlen) {
cad2e5aa 760 if (*t == '\n') {
4ee3650e 761 if (t < check_at - prog->check_offset_min) {
33b8afdf 762 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
763 /* Since we moved from the found position,
764 we definitely contradict the found anchored
30944b6d
IZ
765 substr. Due to the above check we do not
766 contradict "check" substr.
767 Thus we can arrive here only if check substr
768 is float. Redo checking for "other"=="fixed".
769 */
9041c2e3 770 strpos = t + 1;
a3621e74 771 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 772 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
773 goto do_other_anchored;
774 }
4ee3650e
GS
775 /* We don't contradict the found floating substring. */
776 /* XXXX Why not check for STCLASS? */
cad2e5aa 777 s = t + 1;
a3621e74 778 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 779 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
780 goto set_useful;
781 }
4ee3650e
GS
782 /* Position contradicts check-string */
783 /* XXXX probably better to look for check-string
784 than for "\n", so one should lower the limit for t? */
a3621e74 785 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 786 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 787 other_last = strpos = s = t + 1;
cad2e5aa
JH
788 goto restart;
789 }
790 t++;
791 }
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 793 PL_colors[0], PL_colors[1]));
2c2d71f5 794 goto fail_finish;
cad2e5aa 795 }
f5952150 796 else {
a3621e74 797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 798 PL_colors[0], PL_colors[1]));
f5952150 799 }
cad2e5aa
JH
800 s = t;
801 set_useful:
33b8afdf 802 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
803 }
804 else {
f5952150 805 /* The found string does not prohibit matching at strpos,
2c2d71f5 806 - no optimization of calling REx engine can be performed,
f5952150
GS
807 unless it was an MBOL and we are not after MBOL,
808 or a future STCLASS check will fail this. */
2c2d71f5
JH
809 try_at_start:
810 /* Even in this situation we may use MBOL flag if strpos is offset
811 wrt the start of the string. */
05b4157f 812 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 813 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
814 /* May be due to an implicit anchor of m{.*foo} */
815 && !(prog->reganch & ROPT_IMPLICIT))
816 {
cad2e5aa
JH
817 t = strpos;
818 goto find_anchor;
819 }
a3621e74 820 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 821 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 822 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 823 );
2c2d71f5 824 success_at_start:
30944b6d 825 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
826 && (do_utf8 ? (
827 prog->check_utf8 /* Could be deleted already */
828 && --BmUSEFUL(prog->check_utf8) < 0
829 && (prog->check_utf8 == prog->float_utf8)
830 ) : (
831 prog->check_substr /* Could be deleted already */
832 && --BmUSEFUL(prog->check_substr) < 0
833 && (prog->check_substr == prog->float_substr)
834 )))
66e933ab 835 {
cad2e5aa 836 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 837 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
838 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
839 if (do_utf8 ? prog->check_substr : prog->check_utf8)
840 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
841 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
842 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 843 check = Nullsv; /* abort */
cad2e5aa 844 s = strpos;
3cf5c195
IZ
845 /* XXXX This is a remnant of the old implementation. It
846 looks wasteful, since now INTUIT can use many
6eb5f6b9 847 other heuristics. */
cad2e5aa
JH
848 prog->reganch &= ~RE_USE_INTUIT;
849 }
850 else
851 s = strpos;
852 }
853
6eb5f6b9
JH
854 /* Last resort... */
855 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
856 if (prog->regstclass) {
857 /* minlen == 0 is possible if regstclass is \b or \B,
858 and the fixed substr is ''$.
859 Since minlen is already taken into account, s+1 is before strend;
860 accidentally, minlen >= 1 guaranties no false positives at s + 1
861 even for \b or \B. But (minlen? 1 : 0) below assumes that
862 regstclass does not come from lookahead... */
863 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
864 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 865 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 866 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 867 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 868 : 1);
a3b680e6 869 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 870 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 871 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
872 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
873 cl_l, strend)
874 : strend);
6eb5f6b9
JH
875
876 t = s;
9926ca43 877 cache_re(prog);
06b5626a 878 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
879 if (!s) {
880#ifdef DEBUGGING
e1ec3a88 881 const char *what = 0;
6eb5f6b9
JH
882#endif
883 if (endpos == strend) {
a3621e74 884 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
885 "Could not match STCLASS...\n") );
886 goto fail;
887 }
a3621e74 888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 889 "This position contradicts STCLASS...\n") );
653099ff
GS
890 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
891 goto fail;
6eb5f6b9 892 /* Contradict one of substrings */
33b8afdf
JH
893 if (prog->anchored_substr || prog->anchored_utf8) {
894 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 895 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 896 hop_and_restart:
1aa99e6b 897 s = HOP3c(t, 1, strend);
66e933ab
GS
898 if (s + start_shift + end_shift > strend) {
899 /* XXXX Should be taken into account earlier? */
a3621e74 900 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
901 "Could not match STCLASS...\n") );
902 goto fail;
903 }
5e39e1e5
HS
904 if (!check)
905 goto giveup;
a3621e74 906 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 907 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
908 what, (long)(s + start_shift - i_strpos)) );
909 goto restart;
910 }
66e933ab 911 /* Have both, check_string is floating */
6eb5f6b9
JH
912 if (t + start_shift >= check_at) /* Contradicts floating=check */
913 goto retry_floating_check;
914 /* Recheck anchored substring, but not floating... */
9041c2e3 915 s = check_at;
5e39e1e5
HS
916 if (!check)
917 goto giveup;
a3621e74 918 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 919 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
920 (long)(other_last - i_strpos)) );
921 goto do_other_anchored;
922 }
60e71179
GS
923 /* Another way we could have checked stclass at the
924 current position only: */
925 if (ml_anch) {
926 s = t = t + 1;
5e39e1e5
HS
927 if (!check)
928 goto giveup;
a3621e74 929 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 930 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 931 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 932 goto try_at_offset;
66e933ab 933 }
33b8afdf 934 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 935 goto fail;
6eb5f6b9
JH
936 /* Check is floating subtring. */
937 retry_floating_check:
938 t = check_at - start_shift;
a3621e74 939 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
940 goto hop_and_restart;
941 }
b7953727 942 if (t != s) {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 944 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
945 (long)(t - i_strpos), (long)(s - i_strpos))
946 );
947 }
948 else {
a3621e74 949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
950 "Does not contradict STCLASS...\n");
951 );
952 }
6eb5f6b9 953 }
5e39e1e5 954 giveup:
a3621e74 955 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
956 PL_colors[4], (check ? "Guessed" : "Giving up"),
957 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 958 return s;
2c2d71f5
JH
959
960 fail_finish: /* Substring not found */
33b8afdf
JH
961 if (prog->check_substr || prog->check_utf8) /* could be removed already */
962 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 963 fail:
a3621e74 964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 965 PL_colors[4], PL_colors[5]));
cad2e5aa
JH
966 return Nullch;
967}
9661b544 968
6eb5f6b9 969/* We know what class REx starts with. Try to find this position... */
3c3eec57 970STATIC char *
a3b680e6 971S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 972{
27da23d5 973 dVAR;
1df70142 974 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 975 char *m;
d8093b23 976 STRLEN ln;
5dab1207 977 STRLEN lnc;
078c425b 978 register STRLEN uskip;
d8093b23
G
979 unsigned int c1;
980 unsigned int c2;
6eb5f6b9
JH
981 char *e;
982 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 983 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 984
6eb5f6b9
JH
985 /* We know what class it must start with. */
986 switch (OP(c)) {
6eb5f6b9 987 case ANYOF:
388cc4de 988 if (do_utf8) {
078c425b 989 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
990 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
991 !UTF8_IS_INVARIANT((U8)s[0]) ?
992 reginclass(c, (U8*)s, 0, do_utf8) :
993 REGINCLASS(c, (U8*)s)) {
994 if (tmp && (norun || regtry(prog, s)))
995 goto got_it;
996 else
997 tmp = doevery;
998 }
999 else
1000 tmp = 1;
078c425b 1001 s += uskip;
388cc4de
HS
1002 }
1003 }
1004 else {
1005 while (s < strend) {
1006 STRLEN skip = 1;
1007
1008 if (REGINCLASS(c, (U8*)s) ||
1009 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1010 /* The assignment of 2 is intentional:
1011 * for the folded sharp s, the skip is 2. */
1012 (skip = SHARP_S_SKIP))) {
1013 if (tmp && (norun || regtry(prog, s)))
1014 goto got_it;
1015 else
1016 tmp = doevery;
1017 }
1018 else
1019 tmp = 1;
1020 s += skip;
1021 }
a0d0e21e 1022 }
6eb5f6b9 1023 break;
f33976b4
DB
1024 case CANY:
1025 while (s < strend) {
1026 if (tmp && (norun || regtry(prog, s)))
1027 goto got_it;
1028 else
1029 tmp = doevery;
1030 s++;
1031 }
1032 break;
6eb5f6b9 1033 case EXACTF:
5dab1207
NIS
1034 m = STRING(c);
1035 ln = STR_LEN(c); /* length to match in octets/bytes */
1036 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1037 if (UTF) {
a2a2844f 1038 STRLEN ulen1, ulen2;
5dab1207 1039 U8 *sm = (U8 *) m;
89ebb4a3
JH
1040 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1041 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
1042
1043 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1044 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1045
89ebb4a3 1046 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
872c91ae 1047 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
89ebb4a3 1048 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
872c91ae 1049 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1050 lnc = 0;
1051 while (sm < ((U8 *) m + ln)) {
1052 lnc++;
1053 sm += UTF8SKIP(sm);
1054 }
1aa99e6b
IH
1055 }
1056 else {
1057 c1 = *(U8*)m;
1058 c2 = PL_fold[c1];
1059 }
6eb5f6b9
JH
1060 goto do_exactf;
1061 case EXACTFL:
5dab1207
NIS
1062 m = STRING(c);
1063 ln = STR_LEN(c);
1064 lnc = (I32) ln;
d8093b23 1065 c1 = *(U8*)m;
6eb5f6b9
JH
1066 c2 = PL_fold_locale[c1];
1067 do_exactf:
db12adc6 1068 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1069
6eb5f6b9
JH
1070 if (norun && e < s)
1071 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1072
60a8b682
JH
1073 /* The idea in the EXACTF* cases is to first find the
1074 * first character of the EXACTF* node and then, if
1075 * necessary, case-insensitively compare the full
1076 * text of the node. The c1 and c2 are the first
1077 * characters (though in Unicode it gets a bit
1078 * more complicated because there are more cases
7f16dd3d
JH
1079 * than just upper and lower: one needs to use
1080 * the so-called folding case for case-insensitive
1081 * matching (called "loose matching" in Unicode).
1082 * ibcmp_utf8() will do just that. */
60a8b682 1083
1aa99e6b 1084 if (do_utf8) {
575cac57 1085 UV c, f;
89ebb4a3 1086 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1087 STRLEN len, foldlen;
d7f013c8 1088
09091399 1089 if (c1 == c2) {
5dab1207
NIS
1090 /* Upper and lower of 1st char are equal -
1091 * probably not a "letter". */
1aa99e6b 1092 while (s <= e) {
89ebb4a3 1093 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1094 ckWARN(WARN_UTF8) ?
1095 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1096 if ( c == c1
1097 && (ln == len ||
66423254 1098 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1099 m, (char **)0, ln, (bool)UTF))
55da9344 1100 && (norun || regtry(prog, s)) )
1aa99e6b 1101 goto got_it;
80aecb99 1102 else {
1df70142 1103 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1104 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1105 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1106 if ( f != c
1107 && (f == c1 || f == c2)
1108 && (ln == foldlen ||
66423254
JH
1109 !ibcmp_utf8((char *) foldbuf,
1110 (char **)0, foldlen, do_utf8,
d07ddd77 1111 m,
eb160463 1112 (char **)0, ln, (bool)UTF))
80aecb99
JH
1113 && (norun || regtry(prog, s)) )
1114 goto got_it;
1115 }
1aa99e6b
IH
1116 s += len;
1117 }
09091399
JH
1118 }
1119 else {
1aa99e6b 1120 while (s <= e) {
89ebb4a3 1121 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1122 ckWARN(WARN_UTF8) ?
1123 0 : UTF8_ALLOW_ANY);
80aecb99 1124
60a8b682 1125 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1126 * Note that not all the possible combinations
1127 * are handled here: some of them are handled
1128 * by the standard folding rules, and some of
1129 * them (the character class or ANYOF cases)
1130 * are handled during compiletime in
1131 * regexec.c:S_regclass(). */
880bd946
JH
1132 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1133 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1134 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1135
1136 if ( (c == c1 || c == c2)
1137 && (ln == len ||
66423254 1138 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1139 m, (char **)0, ln, (bool)UTF))
55da9344 1140 && (norun || regtry(prog, s)) )
1aa99e6b 1141 goto got_it;
80aecb99 1142 else {
1df70142 1143 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1144 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1145 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1146 if ( f != c
1147 && (f == c1 || f == c2)
1148 && (ln == foldlen ||
a6872d42 1149 !ibcmp_utf8((char *) foldbuf,
66423254 1150 (char **)0, foldlen, do_utf8,
d07ddd77 1151 m,
eb160463 1152 (char **)0, ln, (bool)UTF))
80aecb99
JH
1153 && (norun || regtry(prog, s)) )
1154 goto got_it;
1155 }
1aa99e6b
IH
1156 s += len;
1157 }
09091399 1158 }
1aa99e6b
IH
1159 }
1160 else {
1161 if (c1 == c2)
1162 while (s <= e) {
1163 if ( *(U8*)s == c1
1164 && (ln == 1 || !(OP(c) == EXACTF
1165 ? ibcmp(s, m, ln)
1166 : ibcmp_locale(s, m, ln)))
1167 && (norun || regtry(prog, s)) )
1168 goto got_it;
1169 s++;
1170 }
1171 else
1172 while (s <= e) {
1173 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1174 && (ln == 1 || !(OP(c) == EXACTF
1175 ? ibcmp(s, m, ln)
1176 : ibcmp_locale(s, m, ln)))
1177 && (norun || regtry(prog, s)) )
1178 goto got_it;
1179 s++;
1180 }
b3c9acc1
IZ
1181 }
1182 break;
bbce6d69 1183 case BOUNDL:
3280af22 1184 PL_reg_flags |= RF_tainted;
bbce6d69 1185 /* FALL THROUGH */
a0d0e21e 1186 case BOUND:
ffc61ed2 1187 if (do_utf8) {
12d33761 1188 if (s == PL_bostr)
ffc61ed2
JH
1189 tmp = '\n';
1190 else {
b4f7163a 1191 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1192
b4f7163a 1193 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1194 }
1195 tmp = ((OP(c) == BOUND ?
9041c2e3 1196 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1197 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1198 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1199 if (tmp == !(OP(c) == BOUND ?
3568d838 1200 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1201 isALNUM_LC_utf8((U8*)s)))
1202 {
1203 tmp = !tmp;
1204 if ((norun || regtry(prog, s)))
1205 goto got_it;
1206 }
078c425b 1207 s += uskip;
a687059c 1208 }
a0d0e21e 1209 }
667bb95a 1210 else {
12d33761 1211 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1212 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1213 while (s < strend) {
1214 if (tmp ==
1215 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1216 tmp = !tmp;
1217 if ((norun || regtry(prog, s)))
1218 goto got_it;
1219 }
1220 s++;
a0ed51b3 1221 }
a0ed51b3 1222 }
6eb5f6b9 1223 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1224 goto got_it;
1225 break;
bbce6d69 1226 case NBOUNDL:
3280af22 1227 PL_reg_flags |= RF_tainted;
bbce6d69 1228 /* FALL THROUGH */
a0d0e21e 1229 case NBOUND:
ffc61ed2 1230 if (do_utf8) {
12d33761 1231 if (s == PL_bostr)
ffc61ed2
JH
1232 tmp = '\n';
1233 else {
b4f7163a 1234 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1235
b4f7163a 1236 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1237 }
1238 tmp = ((OP(c) == NBOUND ?
9041c2e3 1239 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1240 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1241 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1242 if (tmp == !(OP(c) == NBOUND ?
3568d838 1243 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1244 isALNUM_LC_utf8((U8*)s)))
1245 tmp = !tmp;
1246 else if ((norun || regtry(prog, s)))
1247 goto got_it;
078c425b 1248 s += uskip;
ffc61ed2 1249 }
a0d0e21e 1250 }
667bb95a 1251 else {
12d33761 1252 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1253 tmp = ((OP(c) == NBOUND ?
1254 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1255 while (s < strend) {
1256 if (tmp ==
1257 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1258 tmp = !tmp;
1259 else if ((norun || regtry(prog, s)))
1260 goto got_it;
1261 s++;
1262 }
a0ed51b3 1263 }
6eb5f6b9 1264 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1265 goto got_it;
1266 break;
a0d0e21e 1267 case ALNUM:
ffc61ed2 1268 if (do_utf8) {
1a4fad37 1269 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1270 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1271 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1272 if (tmp && (norun || regtry(prog, s)))
1273 goto got_it;
1274 else
1275 tmp = doevery;
1276 }
bbce6d69 1277 else
ffc61ed2 1278 tmp = 1;
078c425b 1279 s += uskip;
bbce6d69 1280 }
bbce6d69 1281 }
ffc61ed2
JH
1282 else {
1283 while (s < strend) {
1284 if (isALNUM(*s)) {
1285 if (tmp && (norun || regtry(prog, s)))
1286 goto got_it;
1287 else
1288 tmp = doevery;
1289 }
a0ed51b3 1290 else
ffc61ed2
JH
1291 tmp = 1;
1292 s++;
a0ed51b3 1293 }
a0ed51b3
LW
1294 }
1295 break;
bbce6d69 1296 case ALNUML:
3280af22 1297 PL_reg_flags |= RF_tainted;
ffc61ed2 1298 if (do_utf8) {
078c425b 1299 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1300 if (isALNUM_LC_utf8((U8*)s)) {
1301 if (tmp && (norun || regtry(prog, s)))
1302 goto got_it;
1303 else
1304 tmp = doevery;
1305 }
a687059c 1306 else
ffc61ed2 1307 tmp = 1;
078c425b 1308 s += uskip;
a0d0e21e 1309 }
a0d0e21e 1310 }
ffc61ed2
JH
1311 else {
1312 while (s < strend) {
1313 if (isALNUM_LC(*s)) {
1314 if (tmp && (norun || regtry(prog, s)))
1315 goto got_it;
1316 else
1317 tmp = doevery;
1318 }
a0ed51b3 1319 else
ffc61ed2
JH
1320 tmp = 1;
1321 s++;
a0ed51b3 1322 }
a0ed51b3
LW
1323 }
1324 break;
a0d0e21e 1325 case NALNUM:
ffc61ed2 1326 if (do_utf8) {
1a4fad37 1327 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1328 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1329 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1330 if (tmp && (norun || regtry(prog, s)))
1331 goto got_it;
1332 else
1333 tmp = doevery;
1334 }
bbce6d69 1335 else
ffc61ed2 1336 tmp = 1;
078c425b 1337 s += uskip;
bbce6d69 1338 }
bbce6d69 1339 }
ffc61ed2
JH
1340 else {
1341 while (s < strend) {
1342 if (!isALNUM(*s)) {
1343 if (tmp && (norun || regtry(prog, s)))
1344 goto got_it;
1345 else
1346 tmp = doevery;
1347 }
a0ed51b3 1348 else
ffc61ed2
JH
1349 tmp = 1;
1350 s++;
a0ed51b3 1351 }
a0ed51b3
LW
1352 }
1353 break;
bbce6d69 1354 case NALNUML:
3280af22 1355 PL_reg_flags |= RF_tainted;
ffc61ed2 1356 if (do_utf8) {
078c425b 1357 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1358 if (!isALNUM_LC_utf8((U8*)s)) {
1359 if (tmp && (norun || regtry(prog, s)))
1360 goto got_it;
1361 else
1362 tmp = doevery;
1363 }
a687059c 1364 else
ffc61ed2 1365 tmp = 1;
078c425b 1366 s += uskip;
a687059c 1367 }
a0d0e21e 1368 }
ffc61ed2
JH
1369 else {
1370 while (s < strend) {
1371 if (!isALNUM_LC(*s)) {
1372 if (tmp && (norun || regtry(prog, s)))
1373 goto got_it;
1374 else
1375 tmp = doevery;
1376 }
a0ed51b3 1377 else
ffc61ed2
JH
1378 tmp = 1;
1379 s++;
a0ed51b3 1380 }
a0ed51b3
LW
1381 }
1382 break;
a0d0e21e 1383 case SPACE:
ffc61ed2 1384 if (do_utf8) {
1a4fad37 1385 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1386 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1387 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1388 if (tmp && (norun || regtry(prog, s)))
1389 goto got_it;
1390 else
1391 tmp = doevery;
1392 }
a0d0e21e 1393 else
ffc61ed2 1394 tmp = 1;
078c425b 1395 s += uskip;
2304df62 1396 }
a0d0e21e 1397 }
ffc61ed2
JH
1398 else {
1399 while (s < strend) {
1400 if (isSPACE(*s)) {
1401 if (tmp && (norun || regtry(prog, s)))
1402 goto got_it;
1403 else
1404 tmp = doevery;
1405 }
a0ed51b3 1406 else
ffc61ed2
JH
1407 tmp = 1;
1408 s++;
a0ed51b3 1409 }
a0ed51b3
LW
1410 }
1411 break;
bbce6d69 1412 case SPACEL:
3280af22 1413 PL_reg_flags |= RF_tainted;
ffc61ed2 1414 if (do_utf8) {
078c425b 1415 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1416 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1417 if (tmp && (norun || regtry(prog, s)))
1418 goto got_it;
1419 else
1420 tmp = doevery;
1421 }
bbce6d69 1422 else
ffc61ed2 1423 tmp = 1;
078c425b 1424 s += uskip;
bbce6d69 1425 }
bbce6d69 1426 }
ffc61ed2
JH
1427 else {
1428 while (s < strend) {
1429 if (isSPACE_LC(*s)) {
1430 if (tmp && (norun || regtry(prog, s)))
1431 goto got_it;
1432 else
1433 tmp = doevery;
1434 }
a0ed51b3 1435 else
ffc61ed2
JH
1436 tmp = 1;
1437 s++;
a0ed51b3 1438 }
a0ed51b3
LW
1439 }
1440 break;
a0d0e21e 1441 case NSPACE:
ffc61ed2 1442 if (do_utf8) {
1a4fad37 1443 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1444 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1445 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1446 if (tmp && (norun || regtry(prog, s)))
1447 goto got_it;
1448 else
1449 tmp = doevery;
1450 }
a0d0e21e 1451 else
ffc61ed2 1452 tmp = 1;
078c425b 1453 s += uskip;
a687059c 1454 }
a0d0e21e 1455 }
ffc61ed2
JH
1456 else {
1457 while (s < strend) {
1458 if (!isSPACE(*s)) {
1459 if (tmp && (norun || regtry(prog, s)))
1460 goto got_it;
1461 else
1462 tmp = doevery;
1463 }
a0ed51b3 1464 else
ffc61ed2
JH
1465 tmp = 1;
1466 s++;
a0ed51b3 1467 }
a0ed51b3
LW
1468 }
1469 break;
bbce6d69 1470 case NSPACEL:
3280af22 1471 PL_reg_flags |= RF_tainted;
ffc61ed2 1472 if (do_utf8) {
078c425b 1473 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1474 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1475 if (tmp && (norun || regtry(prog, s)))
1476 goto got_it;
1477 else
1478 tmp = doevery;
1479 }
bbce6d69 1480 else
ffc61ed2 1481 tmp = 1;
078c425b 1482 s += uskip;
bbce6d69 1483 }
bbce6d69 1484 }
ffc61ed2
JH
1485 else {
1486 while (s < strend) {
1487 if (!isSPACE_LC(*s)) {
1488 if (tmp && (norun || regtry(prog, s)))
1489 goto got_it;
1490 else
1491 tmp = doevery;
1492 }
a0ed51b3 1493 else
ffc61ed2
JH
1494 tmp = 1;
1495 s++;
a0ed51b3 1496 }
a0ed51b3
LW
1497 }
1498 break;
a0d0e21e 1499 case DIGIT:
ffc61ed2 1500 if (do_utf8) {
1a4fad37 1501 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1502 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1503 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1504 if (tmp && (norun || regtry(prog, s)))
1505 goto got_it;
1506 else
1507 tmp = doevery;
1508 }
a0d0e21e 1509 else
ffc61ed2 1510 tmp = 1;
078c425b 1511 s += uskip;
2b69d0c2 1512 }
a0d0e21e 1513 }
ffc61ed2
JH
1514 else {
1515 while (s < strend) {
1516 if (isDIGIT(*s)) {
1517 if (tmp && (norun || regtry(prog, s)))
1518 goto got_it;
1519 else
1520 tmp = doevery;
1521 }
a0ed51b3 1522 else
ffc61ed2
JH
1523 tmp = 1;
1524 s++;
a0ed51b3 1525 }
a0ed51b3
LW
1526 }
1527 break;
b8c5462f
JH
1528 case DIGITL:
1529 PL_reg_flags |= RF_tainted;
ffc61ed2 1530 if (do_utf8) {
078c425b 1531 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1532 if (isDIGIT_LC_utf8((U8*)s)) {
1533 if (tmp && (norun || regtry(prog, s)))
1534 goto got_it;
1535 else
1536 tmp = doevery;
1537 }
b8c5462f 1538 else
ffc61ed2 1539 tmp = 1;
078c425b 1540 s += uskip;
b8c5462f 1541 }
b8c5462f 1542 }
ffc61ed2
JH
1543 else {
1544 while (s < strend) {
1545 if (isDIGIT_LC(*s)) {
1546 if (tmp && (norun || regtry(prog, s)))
1547 goto got_it;
1548 else
1549 tmp = doevery;
1550 }
b8c5462f 1551 else
ffc61ed2
JH
1552 tmp = 1;
1553 s++;
b8c5462f 1554 }
b8c5462f
JH
1555 }
1556 break;
a0d0e21e 1557 case NDIGIT:
ffc61ed2 1558 if (do_utf8) {
1a4fad37 1559 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1560 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1561 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1562 if (tmp && (norun || regtry(prog, s)))
1563 goto got_it;
1564 else
1565 tmp = doevery;
1566 }
a0d0e21e 1567 else
ffc61ed2 1568 tmp = 1;
078c425b 1569 s += uskip;
a687059c 1570 }
a0d0e21e 1571 }
ffc61ed2
JH
1572 else {
1573 while (s < strend) {
1574 if (!isDIGIT(*s)) {
1575 if (tmp && (norun || regtry(prog, s)))
1576 goto got_it;
1577 else
1578 tmp = doevery;
1579 }
a0ed51b3 1580 else
ffc61ed2
JH
1581 tmp = 1;
1582 s++;
a0ed51b3 1583 }
a0ed51b3
LW
1584 }
1585 break;
b8c5462f
JH
1586 case NDIGITL:
1587 PL_reg_flags |= RF_tainted;
ffc61ed2 1588 if (do_utf8) {
078c425b 1589 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1590 if (!isDIGIT_LC_utf8((U8*)s)) {
1591 if (tmp && (norun || regtry(prog, s)))
1592 goto got_it;
1593 else
1594 tmp = doevery;
1595 }
b8c5462f 1596 else
ffc61ed2 1597 tmp = 1;
078c425b 1598 s += uskip;
b8c5462f 1599 }
a0ed51b3 1600 }
ffc61ed2
JH
1601 else {
1602 while (s < strend) {
1603 if (!isDIGIT_LC(*s)) {
1604 if (tmp && (norun || regtry(prog, s)))
1605 goto got_it;
1606 else
1607 tmp = doevery;
1608 }
cf93c79d 1609 else
ffc61ed2
JH
1610 tmp = 1;
1611 s++;
b8c5462f 1612 }
b8c5462f
JH
1613 }
1614 break;
b3c9acc1 1615 default:
3c3eec57
GS
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1617 break;
d6a28714 1618 }
6eb5f6b9
JH
1619 return 0;
1620 got_it:
1621 return s;
1622}
1623
1624/*
1625 - regexec_flags - match a regexp against a string
1626 */
1627I32
1628Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630/* strend: pointer to null at end of string */
1631/* strbeg: real beginning of string */
1632/* minend: end of match must be >=minend after stringarg. */
1633/* data: May be used for some additional optimizations. */
1634/* nosave: For optimizations. */
1635{
6eb5f6b9
JH
1636 register char *s;
1637 register regnode *c;
1638 register char *startpos = stringarg;
6eb5f6b9
JH
1639 I32 minlen; /* must match at least this many chars */
1640 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1641 I32 end_shift = 0; /* Same for the end. */ /* CC */
1642 I32 scream_pos = -1; /* Internal iterator of scream. */
1643 char *scream_olds;
1644 SV* oreplsv = GvSV(PL_replgv);
1df70142 1645 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1646 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1647#ifdef DEBUGGING
9e55ce06
JH
1648 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1649 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1650#endif
a3621e74
YO
1651
1652 GET_RE_DEBUG_FLAGS_DECL;
1653
9d4ba2ae 1654 PERL_UNUSED_ARG(data);
a30b2f1f 1655 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1656
1657 PL_regcc = 0;
1658
1659 cache_re(prog);
1660#ifdef DEBUGGING
aea4f609 1661 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1662#endif
1663
1664 /* Be paranoid... */
1665 if (prog == NULL || startpos == NULL) {
1666 Perl_croak(aTHX_ "NULL regexp parameter");
1667 return 0;
1668 }
1669
1670 minlen = prog->minlen;
61a36c01 1671 if (strend - startpos < minlen) {
a3621e74 1672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1673 "String too short [regexec_flags]...\n"));
1674 goto phooey;
1aa99e6b 1675 }
6eb5f6b9 1676
6eb5f6b9
JH
1677 /* Check validity of program. */
1678 if (UCHARAT(prog->program) != REG_MAGIC) {
1679 Perl_croak(aTHX_ "corrupted regexp program");
1680 }
1681
1682 PL_reg_flags = 0;
1683 PL_reg_eval_set = 0;
1684 PL_reg_maxiter = 0;
1685
1686 if (prog->reganch & ROPT_UTF8)
1687 PL_reg_flags |= RF_utf8;
1688
1689 /* Mark beginning of line for ^ and lookbehind. */
1690 PL_regbol = startpos;
1691 PL_bostr = strbeg;
1692 PL_reg_sv = sv;
1693
1694 /* Mark end of line for $ (and such) */
1695 PL_regeol = strend;
1696
1697 /* see how far we have to get to not match where we matched before */
1698 PL_regtill = startpos+minend;
1699
1700 /* We start without call_cc context. */
1701 PL_reg_call_cc = 0;
1702
1703 /* If there is a "must appear" string, look for it. */
1704 s = startpos;
1705
1706 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1707 MAGIC *mg;
1708
1709 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1710 PL_reg_ganch = startpos;
1711 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && SvMAGIC(sv)
14befaf4
DM
1713 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1714 && mg->mg_len >= 0) {
6eb5f6b9
JH
1715 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1716 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1717 if (s > PL_reg_ganch)
6eb5f6b9
JH
1718 goto phooey;
1719 s = PL_reg_ganch;
1720 }
1721 }
1722 else /* pos() not defined */
1723 PL_reg_ganch = strbeg;
1724 }
1725
33b8afdf 1726 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1727 re_scream_pos_data d;
1728
1729 d.scream_olds = &scream_olds;
1730 d.scream_pos = &scream_pos;
1731 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1732 if (!s) {
a3621e74 1733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1734 goto phooey; /* not present */
3fa9c3d7 1735 }
6eb5f6b9
JH
1736 }
1737
a3621e74 1738 DEBUG_EXECUTE_r({
1df70142
AL
1739 const char * const s0 = UTF
1740 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1741 UNI_DISPLAY_REGEX)
1742 : prog->precomp;
1743 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1744 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1745 UNI_DISPLAY_REGEX) : startpos;
1df70142 1746 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1747 if (!PL_colorset)
1748 reginitcolors();
1749 PerlIO_printf(Perl_debug_log,
a0288114 1750 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1751 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1752 len0, len0, s0,
2a782b5b 1753 PL_colors[1],
9e55ce06 1754 len0 > 60 ? "..." : "",
2a782b5b 1755 PL_colors[0],
9e55ce06
JH
1756 (int)(len1 > 60 ? 60 : len1),
1757 s1, PL_colors[1],
1758 (len1 > 60 ? "..." : "")
2a782b5b
JH
1759 );
1760 });
6eb5f6b9
JH
1761
1762 /* Simplest case: anchored match need be tried only once. */
1763 /* [unless only anchor is BOL and multiline is set] */
1764 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1765 if (s == startpos && regtry(prog, startpos))
1766 goto got_it;
7fba1cd6 1767 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1768 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1769 {
1770 char *end;
1771
1772 if (minlen)
1773 dontbother = minlen - 1;
1aa99e6b 1774 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1775 /* for multiline we only have to try after newlines */
33b8afdf 1776 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1777 if (s == startpos)
1778 goto after_try;
1779 while (1) {
1780 if (regtry(prog, s))
1781 goto got_it;
1782 after_try:
1783 if (s >= end)
1784 goto phooey;
1785 if (prog->reganch & RE_USE_INTUIT) {
1786 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1787 if (!s)
1788 goto phooey;
1789 }
1790 else
1791 s++;
1792 }
1793 } else {
1794 if (s > startpos)
1795 s--;
1796 while (s < end) {
1797 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1798 if (regtry(prog, s))
1799 goto got_it;
1800 }
1801 }
1802 }
1803 }
1804 goto phooey;
1805 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1806 if (regtry(prog, PL_reg_ganch))
1807 goto got_it;
1808 goto phooey;
1809 }
1810
1811 /* Messy cases: unanchored match. */
33b8afdf 1812 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1813 /* we have /x+whatever/ */
1814 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1815 char ch;
bf93d4cc
GS
1816#ifdef DEBUGGING
1817 int did_match = 0;
1818#endif
33b8afdf
JH
1819 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1820 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1821 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1822
1aa99e6b 1823 if (do_utf8) {
6eb5f6b9
JH
1824 while (s < strend) {
1825 if (*s == ch) {
a3621e74 1826 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1827 if (regtry(prog, s)) goto got_it;
1828 s += UTF8SKIP(s);
1829 while (s < strend && *s == ch)
1830 s += UTF8SKIP(s);
1831 }
1832 s += UTF8SKIP(s);
1833 }
1834 }
1835 else {
1836 while (s < strend) {
1837 if (*s == ch) {
a3621e74 1838 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1839 if (regtry(prog, s)) goto got_it;
1840 s++;
1841 while (s < strend && *s == ch)
1842 s++;
1843 }
1844 s++;
1845 }
1846 }
a3621e74 1847 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1848 PerlIO_printf(Perl_debug_log,
b7953727
JH
1849 "Did not find anchored character...\n")
1850 );
6eb5f6b9 1851 }
33b8afdf
JH
1852 else if (prog->anchored_substr != Nullsv
1853 || prog->anchored_utf8 != Nullsv
1854 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1855 && prog->float_max_offset < strend - s)) {
1856 SV *must;
1857 I32 back_max;
1858 I32 back_min;
1859 char *last;
6eb5f6b9 1860 char *last1; /* Last position checked before */
bf93d4cc
GS
1861#ifdef DEBUGGING
1862 int did_match = 0;
1863#endif
33b8afdf
JH
1864 if (prog->anchored_substr || prog->anchored_utf8) {
1865 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1868 back_max = back_min = prog->anchored_offset;
1869 } else {
1870 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1871 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1872 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1873 back_max = prog->float_max_offset;
1874 back_min = prog->float_min_offset;
1875 }
1876 if (must == &PL_sv_undef)
1877 /* could not downgrade utf8 check substring, so must fail */
1878 goto phooey;
1879
1880 last = HOP3c(strend, /* Cannot start after this */
1881 -(I32)(CHR_SVLEN(must)
1882 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1883
1884 if (s > PL_bostr)
1885 last1 = HOPc(s, -1);
1886 else
1887 last1 = s - 1; /* bogus */
1888
a0288114 1889 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1890 check_substr==must. */
1891 scream_pos = -1;
1892 dontbother = end_shift;
1893 strend = HOPc(strend, -dontbother);
1894 while ( (s <= last) &&
9041c2e3 1895 ((flags & REXEC_SCREAM)
1aa99e6b 1896 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1897 end_shift, &scream_pos, 0))
1aa99e6b 1898 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1899 (unsigned char*)strend, must,
7fba1cd6 1900 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1901 /* we may be pointing at the wrong string */
1902 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1903 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1904 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1905 if (HOPc(s, -back_max) > last1) {
1906 last1 = HOPc(s, -back_min);
1907 s = HOPc(s, -back_max);
1908 }
1909 else {
1910 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1911
1912 last1 = HOPc(s, -back_min);
1913 s = t;
1914 }
1aa99e6b 1915 if (do_utf8) {
6eb5f6b9
JH
1916 while (s <= last1) {
1917 if (regtry(prog, s))
1918 goto got_it;
1919 s += UTF8SKIP(s);
1920 }
1921 }
1922 else {
1923 while (s <= last1) {
1924 if (regtry(prog, s))
1925 goto got_it;
1926 s++;
1927 }
1928 }
1929 }
a3621e74 1930 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1931 PerlIO_printf(Perl_debug_log,
a0288114 1932 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1933 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1934 ? "anchored" : "floating"),
1935 PL_colors[0],
1936 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1937 SvPVX_const(must),
b7953727
JH
1938 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1939 );
6eb5f6b9
JH
1940 goto phooey;
1941 }
155aba94 1942 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1943 if (minlen) {
1944 I32 op = (U8)OP(prog->regstclass);
66e933ab 1945 /* don't bother with what can't match */
f14c76ed
RGS
1946 if (PL_regkind[op] != EXACT && op != CANY)
1947 strend = HOPc(strend, -(minlen - 1));
1948 }
a3621e74 1949 DEBUG_EXECUTE_r({
ffc61ed2 1950 SV *prop = sv_newmortal();
cfd0369c
NC
1951 const char *s0;
1952 const char *s1;
9e55ce06
JH
1953 int len0;
1954 int len1;
1955
ffc61ed2 1956 regprop(prop, c);
9e55ce06 1957 s0 = UTF ?
3f7c398e 1958 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1959 UNI_DISPLAY_REGEX) :
cfd0369c 1960 SvPVX_const(prop);
9e55ce06
JH
1961 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1962 s1 = UTF ?
c728cb41 1963 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1964 len1 = UTF ? SvCUR(dsv1) : strend - s;
1965 PerlIO_printf(Perl_debug_log,
a0288114 1966 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1967 len0, len0, s0,
1968 len1, len1, s1);
ffc61ed2 1969 });
06b5626a 1970 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1971 goto got_it;
a3621e74 1972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1973 }
1974 else {
1975 dontbother = 0;
33b8afdf
JH
1976 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1977 /* Trim the end. */
d6a28714 1978 char *last;
33b8afdf
JH
1979 SV* float_real;
1980
1981 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1982 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1983 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1984
1985 if (flags & REXEC_SCREAM) {
33b8afdf 1986 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1987 end_shift, &scream_pos, 1); /* last one */
1988 if (!last)
ffc61ed2 1989 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1990 /* we may be pointing at the wrong string */
1991 else if (RX_MATCH_COPIED(prog))
3f7c398e 1992 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1993 }
d6a28714
JH
1994 else {
1995 STRLEN len;
cfd0369c 1996 const char * const little = SvPV_const(float_real, len);
d6a28714 1997
33b8afdf 1998 if (SvTAIL(float_real)) {
d6a28714
JH
1999 if (memEQ(strend - len + 1, little, len - 1))
2000 last = strend - len + 1;
7fba1cd6 2001 else if (!multiline)
9041c2e3 2002 last = memEQ(strend - len, little, len)
d6a28714 2003 ? strend - len : Nullch;
b8c5462f 2004 else
d6a28714
JH
2005 goto find_last;
2006 } else {
2007 find_last:
9041c2e3 2008 if (len)
d6a28714 2009 last = rninstr(s, strend, little, little + len);
b8c5462f 2010 else
a0288114 2011 last = strend; /* matching "$" */
b8c5462f 2012 }
b8c5462f 2013 }
bf93d4cc 2014 if (last == NULL) {
a3621e74 2015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2016 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2017 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2018 goto phooey; /* Should not happen! */
2019 }
d6a28714
JH
2020 dontbother = strend - last + prog->float_min_offset;
2021 }
2022 if (minlen && (dontbother < minlen))
2023 dontbother = minlen - 1;
2024 strend -= dontbother; /* this one's always in bytes! */
2025 /* We don't know much -- general case. */
1aa99e6b 2026 if (do_utf8) {
d6a28714
JH
2027 for (;;) {
2028 if (regtry(prog, s))
2029 goto got_it;
2030 if (s >= strend)
2031 break;
b8c5462f 2032 s += UTF8SKIP(s);
d6a28714
JH
2033 };
2034 }
2035 else {
2036 do {
2037 if (regtry(prog, s))
2038 goto got_it;
2039 } while (s++ < strend);
2040 }
2041 }
2042
2043 /* Failure. */
2044 goto phooey;
2045
2046got_it:
2047 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2048
2049 if (PL_reg_eval_set) {
2050 /* Preserve the current value of $^R */
2051 if (oreplsv != GvSV(PL_replgv))
2052 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2053 restored, the value remains
2054 the same. */
acfe0abc 2055 restore_pos(aTHX_ 0);
d6a28714
JH
2056 }
2057
2058 /* make sure $`, $&, $', and $digit will work later */
2059 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2060 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2061 if (flags & REXEC_COPY_STR) {
2062 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2063#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2064 if ((SvIsCOW(sv)
2065 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2066 if (DEBUG_C_TEST) {
2067 PerlIO_printf(Perl_debug_log,
2068 "Copy on write: regexp capture, type %d\n",
2069 (int) SvTYPE(sv));
2070 }
2071 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2072 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2073 assert (SvPOKp(prog->saved_copy));
2074 } else
2075#endif
2076 {
2077 RX_MATCH_COPIED_on(prog);
2078 s = savepvn(strbeg, i);
2079 prog->subbeg = s;
2080 }
d6a28714 2081 prog->sublen = i;
d6a28714
JH
2082 }
2083 else {
2084 prog->subbeg = strbeg;
2085 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2086 }
2087 }
9041c2e3 2088
d6a28714
JH
2089 return 1;
2090
2091phooey:
a3621e74 2092 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2093 PL_colors[4], PL_colors[5]));
d6a28714 2094 if (PL_reg_eval_set)
acfe0abc 2095 restore_pos(aTHX_ 0);
d6a28714
JH
2096 return 0;
2097}
2098
2099/*
2100 - regtry - try match at specific point
2101 */
2102STATIC I32 /* 0 failure, 1 success */
2103S_regtry(pTHX_ regexp *prog, char *startpos)
2104{
d6a28714
JH
2105 register I32 i;
2106 register I32 *sp;
2107 register I32 *ep;
2108 CHECKPOINT lastcp;
a3621e74 2109 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2110
02db2b7b
IZ
2111#ifdef DEBUGGING
2112 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2113#endif
d6a28714
JH
2114 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2115 MAGIC *mg;
2116
2117 PL_reg_eval_set = RS_init;
a3621e74 2118 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2119 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2120 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2121 ));
e8347627 2122 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2123 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2124 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2125 SAVETMPS;
2126 /* Apparently this is not needed, judging by wantarray. */
e8347627 2127 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2128 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2129
2130 if (PL_reg_sv) {
2131 /* Make $_ available to executed code. */
2132 if (PL_reg_sv != DEFSV) {
59f00321 2133 SAVE_DEFSV;
d6a28714 2134 DEFSV = PL_reg_sv;
b8c5462f 2135 }
d6a28714 2136
9041c2e3 2137 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2138 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2139 /* prepare for quick setting of pos */
14befaf4
DM
2140 sv_magic(PL_reg_sv, (SV*)0,
2141 PERL_MAGIC_regex_global, Nullch, 0);
2142 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2143 mg->mg_len = -1;
b8c5462f 2144 }
d6a28714
JH
2145 PL_reg_magic = mg;
2146 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2147 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2148 }
09687e5a 2149 if (!PL_reg_curpm) {
a02a5408 2150 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2151#ifdef USE_ITHREADS
2152 {
2153 SV* repointer = newSViv(0);
577e12cc 2154 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2155 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2156 av_push(PL_regex_padav,repointer);
2157 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2158 PL_regex_pad = AvARRAY(PL_regex_padav);
2159 }
2160#endif
2161 }
aaa362c4 2162 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2163 PL_reg_oldcurpm = PL_curpm;
2164 PL_curpm = PL_reg_curpm;
2165 if (RX_MATCH_COPIED(prog)) {
2166 /* Here is a serious problem: we cannot rewrite subbeg,
2167 since it may be needed if this match fails. Thus
2168 $` inside (?{}) could fail... */
2169 PL_reg_oldsaved = prog->subbeg;
2170 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2171#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2172 PL_nrs = prog->saved_copy;
2173#endif
d6a28714
JH
2174 RX_MATCH_COPIED_off(prog);
2175 }
2176 else
2177 PL_reg_oldsaved = Nullch;
2178 prog->subbeg = PL_bostr;
2179 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2180 }
2181 prog->startp[0] = startpos - PL_bostr;
2182 PL_reginput = startpos;
2183 PL_regstartp = prog->startp;
2184 PL_regendp = prog->endp;
2185 PL_reglastparen = &prog->lastparen;
a01268b5 2186 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2187 prog->lastparen = 0;
03994de8 2188 prog->lastcloseparen = 0;
d6a28714 2189 PL_regsize = 0;
a3621e74 2190 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2191 if (PL_reg_start_tmpl <= prog->nparens) {
2192 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2193 if(PL_reg_start_tmp)
2194 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2195 else
a02a5408 2196 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2197 }
2198
2199 /* XXXX What this code is doing here?!!! There should be no need
2200 to do this again and again, PL_reglastparen should take care of
3dd2943c 2201 this! --ilya*/
dafc8851
JH
2202
2203 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2204 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2205 * PL_reglastparen), is not needed at all by the test suite
2206 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2207 * enough, for building DynaLoader, or otherwise this
2208 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2209 * will happen. Meanwhile, this code *is* needed for the
2210 * above-mentioned test suite tests to succeed. The common theme
2211 * on those tests seems to be returning null fields from matches.
2212 * --jhi */
dafc8851 2213#if 1
d6a28714
JH
2214 sp = prog->startp;
2215 ep = prog->endp;
2216 if (prog->nparens) {
eb160463 2217 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2218 *++sp = -1;
2219 *++ep = -1;
2220 }
2221 }
dafc8851 2222#endif
02db2b7b 2223 REGCP_SET(lastcp);
d6a28714
JH
2224 if (regmatch(prog->program + 1)) {
2225 prog->endp[0] = PL_reginput - PL_bostr;
2226 return 1;
2227 }
02db2b7b 2228 REGCP_UNWIND(lastcp);
d6a28714
JH
2229 return 0;
2230}
2231
02db2b7b
IZ
2232#define RE_UNWIND_BRANCH 1
2233#define RE_UNWIND_BRANCHJ 2
2234
2235union re_unwind_t;
2236
2237typedef struct { /* XX: makes sense to enlarge it... */
2238 I32 type;
2239 I32 prev;
2240 CHECKPOINT lastcp;
2241} re_unwind_generic_t;
2242
2243typedef struct {
2244 I32 type;
2245 I32 prev;
2246 CHECKPOINT lastcp;
2247 I32 lastparen;
2248 regnode *next;
2249 char *locinput;
2250 I32 nextchr;
2251#ifdef DEBUGGING
2252 int regindent;
2253#endif
2254} re_unwind_branch_t;
2255
2256typedef union re_unwind_t {
2257 I32 type;
2258 re_unwind_generic_t generic;
2259 re_unwind_branch_t branch;
2260} re_unwind_t;
2261
8ba1375e
MJD
2262#define sayYES goto yes
2263#define sayNO goto no
e0f9d4a8 2264#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2265#define sayYES_FINAL goto yes_final
2266#define sayYES_LOUD goto yes_loud
2267#define sayNO_FINAL goto no_final
2268#define sayNO_SILENT goto do_no
2269#define saySAME(x) if (x) goto yes; else goto no
2270
3ab3c9b4
HS
2271#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2272#define POSCACHE_SEEN 1 /* we know what we're caching */
2273#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2274#define CACHEsayYES STMT_START { \
2275 if (cache_offset | cache_bit) { \
2276 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2277 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2278 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2279 /* cache records failure, but this is success */ \
2280 DEBUG_r( \
2281 PerlIO_printf(Perl_debug_log, \
2282 "%*s (remove success from failure cache)\n", \
2283 REPORT_CODE_OFF+PL_regindent*2, "") \
2284 ); \
2285 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2286 } \
2287 } \
2288 sayYES; \
2289} STMT_END
2290#define CACHEsayNO STMT_START { \
2291 if (cache_offset | cache_bit) { \
2292 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2293 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2294 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2295 /* cache records success, but this is failure */ \
2296 DEBUG_r( \
2297 PerlIO_printf(Perl_debug_log, \
2298 "%*s (remove failure from success cache)\n", \
2299 REPORT_CODE_OFF+PL_regindent*2, "") \
2300 ); \
2301 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2302 } \
2303 } \
2304 sayNO; \
2305} STMT_END
2306
a3621e74
YO
2307/* this is used to determine how far from the left messages like
2308 'failed...' are printed. Currently 29 makes these messages line
2309 up with the opcode they refer to. Earlier perls used 25 which
2310 left these messages outdented making reviewing a debug output
2311 quite difficult.
2312*/
2313#define REPORT_CODE_OFF 29
2314
2315
2316/* Make sure there is a test for this +1 options in re_tests */
2317#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2318
2319#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2320 if ( trie->states[ state ].wordnum ) { \
2321 if ( !accepted ) { \
2322 ENTER; \
2323 SAVETMPS; \
2324 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2325 sv_accept_buff=NEWSV( 1234, \
2326 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2327 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2328 SvPOK_on( sv_accept_buff ); \
2329 sv_2mortal( sv_accept_buff ); \
2330 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2331 } else { \
2332 if ( accepted >= bufflen ) { \
2333 bufflen *= 2; \
2334 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2335 bufflen * sizeof(reg_trie_accepted) ); \
2336 } \
2337 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2338 + sizeof( reg_trie_accepted ) ); \
2339 } \
2340 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2341 accept_buff[ accepted ].endpos = uc; \
2342 ++accepted; \
2343 } } STMT_END
2344
2345#define TRIE_HANDLE_CHAR STMT_START { \
2346 if ( uvc < 256 ) { \
2347 charid = trie->charmap[ uvc ]; \
2348 } else { \
2349 charid = 0; \
2350 if( trie->widecharmap ) { \
2351 SV** svpp = (SV**)NULL; \
2352 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2353 sizeof( UV ), 0 ); \
2354 if ( svpp ) { \
2355 charid = (U16)SvIV( *svpp ); \
2356 } \
2357 } \
2358 } \
2359 if ( charid && \
cc601c31
YO
2360 ( base + charid > trie->uniquecharcount ) && \
2361 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2362 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2363 { \
2364 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2365 } else { \
2366 state = 0; \
2367 } \
2368 uc += len; \
2369 } STMT_END
8ba1375e 2370
d6a28714
JH
2371/*
2372 - regmatch - main matching routine
2373 *
2374 * Conceptually the strategy is simple: check to see whether the current
2375 * node matches, call self recursively to see whether the rest matches,
2376 * and then act accordingly. In practice we make some effort to avoid
2377 * recursion, in particular by going through "ordinary" nodes (that don't
2378 * need to know whether the rest of the match failed) by a loop instead of
2379 * by recursion.
2380 */
2381/* [lwall] I've hoisted the register declarations to the outer block in order to
2382 * maybe save a little bit of pushing and popping on the stack. It also takes
2383 * advantage of machines that use a register save mask on subroutine entry.
2384 */
2385STATIC I32 /* 0 failure, 1 success */
2386S_regmatch(pTHX_ regnode *prog)
2387{
27da23d5 2388 dVAR;
d6a28714
JH
2389 register regnode *scan; /* Current node. */
2390 regnode *next; /* Next node. */
2391 regnode *inner; /* Next node in internal branch. */
2392 register I32 nextchr; /* renamed nextchr - nextchar colides with
2393 function of same name */
2394 register I32 n; /* no or next */
b7953727
JH
2395 register I32 ln = 0; /* len or last */
2396 register char *s = Nullch; /* operand or save */
d6a28714 2397 register char *locinput = PL_reginput;
b7953727 2398 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2399 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2400 I32 unwind = 0;
a3621e74
YO
2401
2402 /* used by the trie code */
ab74612d
NC
2403 SV *sv_accept_buff = 0; /* accepting states we have traversed */
2404 reg_trie_accepted *accept_buff = 0; /* "" */
2405 reg_trie_data *trie; /* what trie are we using right now */
2406 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2407
b7953727 2408#if 0
02db2b7b 2409 I32 firstcp = PL_savestack_ix;
b7953727 2410#endif
a3b680e6 2411 const register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2412#ifdef DEBUGGING
ce333219
JH
2413 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2414 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2415 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2416
ab74612d 2417 SV *re_debug_flags = NULL;
2a782b5b 2418#endif
02db2b7b 2419
a3621e74
YO
2420 GET_RE_DEBUG_FLAGS;
2421
d6a28714
JH
2422#ifdef DEBUGGING
2423 PL_regindent++;
2424#endif
2425
a3621e74 2426
d6a28714
JH
2427 /* Note that nextchr is a byte even in UTF */
2428 nextchr = UCHARAT(locinput);
2429 scan = prog;
2430 while (scan != NULL) {
8ba1375e 2431
a3621e74 2432 DEBUG_EXECUTE_r( {
d6a28714 2433 SV *prop = sv_newmortal();
1df70142
AL
2434 const int docolor = *PL_colors[0];
2435 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2436 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2437 /* The part of the string before starttry has one color
2438 (pref0_len chars), between starttry and current
2439 position another one (pref_len - pref0_len chars),
2440 after the current position the third one.
2441 We assume that pref0_len <= pref_len, otherwise we
2442 decrease pref0_len. */
9041c2e3 2443 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2444 ? (5 + taill) - l : locinput - PL_bostr;
2445 int pref0_len;
d6a28714 2446
df1ffd02 2447 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2448 pref_len++;
2449 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2450 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2451 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2452 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2453 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2454 l--;
d6a28714
JH
2455 if (pref0_len < 0)
2456 pref0_len = 0;
2457 if (pref0_len > pref_len)
2458 pref0_len = pref_len;
2459 regprop(prop, scan);
2a782b5b 2460 {
1df70142 2461 const char * const s0 =
f14c76ed 2462 do_utf8 && OP(scan) != CANY ?
2a782b5b 2463 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2464 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2465 locinput - pref_len;
1df70142
AL
2466 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2467 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2468 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2469 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2470 locinput - pref_len + pref0_len;
1df70142
AL
2471 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2472 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2473 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2474 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2475 locinput;
1df70142 2476 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2477 PerlIO_printf(Perl_debug_log,
2478 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2479 (IV)(locinput - PL_bostr),
2480 PL_colors[4],
2481 len0, s0,
2482 PL_colors[5],
2483 PL_colors[2],
2484 len1, s1,
2485 PL_colors[3],
2486 (docolor ? "" : "> <"),
2487 PL_colors[0],
2488 len2, s2,
2489 PL_colors[1],
2490 15 - l - pref_len + 1,
2491 "",
2492 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2493 SvPVX_const(prop));
2a782b5b
JH
2494 }
2495 });
d6a28714
JH
2496
2497 next = scan + NEXT_OFF(scan);
2498 if (next == scan)
2499 next = NULL;
2500
2501 switch (OP(scan)) {
2502 case BOL:
7fba1cd6 2503 if (locinput == PL_bostr)
d6a28714
JH
2504 {
2505 /* regtill = regbol; */
b8c5462f
JH
2506 break;
2507 }
d6a28714
JH
2508 sayNO;
2509 case MBOL:
12d33761
HS
2510 if (locinput == PL_bostr ||
2511 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2512 {
b8c5462f
JH
2513 break;
2514 }
d6a28714
JH
2515 sayNO;
2516 case SBOL:
c2a73568 2517 if (locinput == PL_bostr)
b8c5462f 2518 break;
d6a28714
JH
2519 sayNO;
2520 case GPOS:
2521 if (locinput == PL_reg_ganch)
2522 break;
2523 sayNO;
2524 case EOL:
d6a28714
JH
2525 goto seol;
2526 case MEOL:
d6a28714 2527 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2528 sayNO;
b8c5462f 2529 break;
d6a28714
JH
2530 case SEOL:
2531 seol:
2532 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2533 sayNO;
d6a28714 2534 if (PL_regeol - locinput > 1)
b8c5462f 2535 sayNO;
b8c5462f 2536 break;
d6a28714
JH
2537 case EOS:
2538 if (PL_regeol != locinput)
b8c5462f 2539 sayNO;
d6a28714 2540 break;
ffc61ed2 2541 case SANY:
d6a28714 2542 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2543 sayNO;
f33976b4
DB
2544 if (do_utf8) {
2545 locinput += PL_utf8skip[nextchr];
2546 if (locinput > PL_regeol)
2547 sayNO;
2548 nextchr = UCHARAT(locinput);
2549 }
2550 else
2551 nextchr = UCHARAT(++locinput);
2552 break;
2553 case CANY:
2554 if (!nextchr && locinput >= PL_regeol)
2555 sayNO;
b8c5462f 2556 nextchr = UCHARAT(++locinput);
a0d0e21e 2557 break;
ffc61ed2 2558 case REG_ANY:
1aa99e6b
IH
2559 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2560 sayNO;
2561 if (do_utf8) {
b8c5462f 2562 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2563 if (locinput > PL_regeol)
2564 sayNO;
a0ed51b3 2565 nextchr = UCHARAT(locinput);
a0ed51b3 2566 }
1aa99e6b
IH
2567 else
2568 nextchr = UCHARAT(++locinput);
a0ed51b3 2569 break;
a3621e74
YO
2570
2571
2572
2573 /*
2574 traverse the TRIE keeping track of all accepting states
2575 we transition through until we get to a failing node.
2576
2577 we use two slightly different pieces of code to handle
2578 the traversal depending on whether its case sensitive or
2579 not. we reuse the accept code however. (this should probably
2580 be turned into a macro.)
2581
2582 */
2583 case TRIEF:
2584 case TRIEFL:
2585 {
2586
a3b680e6 2587 const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
2588 U8 *uc = ( U8* )locinput;
2589 U32 state = 1;
2590 U16 charid = 0;
2591 U32 base = 0;
2592 UV uvc = 0;
2593 STRLEN len = 0;
2594 STRLEN foldlen = 0;
a3621e74
YO
2595 U8 *uscan = (U8*)NULL;
2596 STRLEN bufflen=0;
2597 accepted = 0;
2598
2599 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2600
2601 while ( state && uc <= (U8*)PL_regeol ) {
2602
2603 TRIE_CHECK_STATE_IS_ACCEPTING;
2604
2605 base = trie->states[ state ].trans.base;
2606
2607 DEBUG_TRIE_EXECUTE_r(
2608 PerlIO_printf( Perl_debug_log,
e4584336 2609 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2610 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2611 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2612 );
2613
2614 if ( base ) {
2615
108bb1ad 2616 if ( do_utf8 ) {
a3621e74
YO
2617 if ( foldlen>0 ) {
2618 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2619 foldlen -= len;
2620 uscan += len;
2621 len=0;
2622 } else {
1df70142 2623 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2624 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2625 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2626 foldlen -= UNISKIP( uvc );
2627 uscan = foldbuf + UNISKIP( uvc );
2628 }
2629 } else {
e4584336 2630 uvc = (UV)*uc;
a3621e74
YO
2631 len = 1;
2632 }
2633
2634 TRIE_HANDLE_CHAR;
2635
2636 } else {
2637 state = 0;
2638 }
2639 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2640 PerlIO_printf( Perl_debug_log,
2641 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2642 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2643 );
2644 }
2645 if ( !accepted ) {
2646 sayNO;
2647 } else {
2648 goto TrieAccept;
2649 }
2650 }
2651 /* unreached codepoint: we jump into the middle of the next case
2652 from previous if blocks */
2653 case TRIE:
2654 {
a3b680e6 2655 const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
2656 U8 *uc = (U8*)locinput;
2657 U32 state = 1;
2658 U16 charid = 0;
2659 U32 base = 0;
2660 UV uvc = 0;
2661 STRLEN len = 0;
2662 STRLEN bufflen = 0;
2663 accepted = 0;
2664
2665 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2666
2667 while ( state && uc <= (U8*)PL_regeol ) {
2668
2669 TRIE_CHECK_STATE_IS_ACCEPTING;
2670
2671 base = trie->states[ state ].trans.base;
2672
2673 DEBUG_TRIE_EXECUTE_r(
2674 PerlIO_printf( Perl_debug_log,
e4584336 2675 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2676 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2677 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2678 );
2679
2680 if ( base ) {
2681
108bb1ad 2682 if ( do_utf8 ) {
a3621e74
YO
2683 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2684 } else {
2685 uvc = (U32)*uc;
2686 len = 1;
2687 }
2688
2689 TRIE_HANDLE_CHAR;
2690
2691 } else {
2692 state = 0;
2693 }
2694 DEBUG_TRIE_EXECUTE_r(
2695 PerlIO_printf( Perl_debug_log,
e4584336
RB
2696 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2697 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2698 );
2699 }
2700 if ( !accepted ) {
2701 sayNO;
2702 }
2703 }
2704
2705
2706 /*
2707 There was at least one accepting state that we
2708 transitioned through. Presumably the number of accepting
2709 states is going to be low, typically one or two. So we
2710 simply scan through to find the one with lowest wordnum.
2711 Once we find it, we swap the last state into its place
2712 and decrement the size. We then try to match the rest of
2713 the pattern at the point where the word ends, if we
2714 succeed then we end the loop, otherwise the loop
2715 eventually terminates once all of the accepting states
2716 have been tried.
2717 */
2718 TrieAccept:
2719 {
2720 int gotit = 0;
2721
2722 if ( accepted == 1 ) {
2723 DEBUG_EXECUTE_r({
2724 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2725 PerlIO_printf( Perl_debug_log,
2726 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2727 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2728 accept_buff[ 0 ].wordnum,
cfd0369c 2729 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2730 PL_colors[5] );
2731 });
cc601c31 2732 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2733 /* in this case we free tmps/leave before we call regmatch
2734 as we wont be using accept_buff again. */
2735 FREETMPS;
2736 LEAVE;
2737 gotit = regmatch( scan + NEXT_OFF( scan ) );
2738 } else {
2739 DEBUG_EXECUTE_r(
e4584336
RB
2740 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2741 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2742 PL_colors[5] );
2743 );
2744 while ( !gotit && accepted-- ) {
2745 U32 best = 0;
2746 U32 cur;
2747 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2748 DEBUG_TRIE_EXECUTE_r(
2749 PerlIO_printf( Perl_debug_log,
2750 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2751 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2752 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2753 accept_buff[ cur ].wordnum, PL_colors[5] );
2754 );
a3621e74
YO
2755
2756 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2757 best = cur;
2758 }
2759 DEBUG_EXECUTE_r({
2760 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2761 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2762 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2763 accept_buff[best].wordnum,
cfd0369c 2764 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2765 PL_colors[5] );
2766 });
2767 if ( best<accepted ) {
2768 reg_trie_accepted tmp = accept_buff[ best ];
2769 accept_buff[ best ] = accept_buff[ accepted ];
2770 accept_buff[ accepted ] = tmp;
2771 best = accepted;
2772 }
cc601c31 2773 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2774
2775 /*
2776 as far as I can tell we only need the SAVETMPS/FREETMPS
2777 for re's with EVAL in them but I'm leaving them in for
2778 all until I can be sure.
2779 */
2780 SAVETMPS;
2781 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2782 FREETMPS;
2783 }
2784 FREETMPS;
2785 LEAVE;
2786 }
2787
2788 if ( gotit ) {
2789 sayYES;
2790 } else {
2791 sayNO;
2792 }
2793 }
2794 /* unreached codepoint */
d6a28714 2795 case EXACT:
cd439c50
IZ
2796 s = STRING(scan);
2797 ln = STR_LEN(scan);
eb160463 2798 if (do_utf8 != UTF) {
bc517b45 2799 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2800 char *l = locinput;
a3b680e6 2801 const char *e = s + ln;
a72c7584 2802
5ff6fc6d
JH
2803 if (do_utf8) {
2804 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2805 while (s < e) {
a3b680e6 2806 STRLEN ulen;
1aa99e6b 2807 if (l >= PL_regeol)
5ff6fc6d
JH
2808 sayNO;
2809 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2810 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2811 ckWARN(WARN_UTF8) ?
2812 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2813 sayNO;
bc517b45 2814 l += ulen;
5ff6fc6d 2815 s ++;
1aa99e6b 2816 }
5ff6fc6d
JH
2817 }
2818 else {
2819 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2820 while (s < e) {
a3b680e6 2821 STRLEN ulen;
1aa99e6b
IH
2822 if (l >= PL_regeol)
2823 sayNO;
5ff6fc6d 2824 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2825 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2826 ckWARN(WARN_UTF8) ?
2827 0 : UTF8_ALLOW_ANY))
1aa99e6b 2828 sayNO;
bc517b45 2829 s += ulen;
a72c7584 2830 l ++;
1aa99e6b 2831 }
5ff6fc6d 2832 }
1aa99e6b
IH
2833 locinput = l;
2834 nextchr = UCHARAT(locinput);
2835 break;
2836 }
bc517b45 2837 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2838 /* Inline the first character, for speed. */
2839 if (UCHARAT(s) != nextchr)
2840 sayNO;
2841 if (PL_regeol - locinput < ln)
2842 sayNO;
2843 if (ln > 1 && memNE(s, locinput, ln))
2844 sayNO;
2845 locinput += ln;
2846 nextchr = UCHARAT(locinput);
2847 break;
2848 case EXACTFL:
b8c5462f
JH
2849 PL_reg_flags |= RF_tainted;
2850 /* FALL THROUGH */
d6a28714 2851 case EXACTF:
cd439c50
IZ
2852 s = STRING(scan);
2853 ln = STR_LEN(scan);
d6a28714 2854
d07ddd77
JH
2855 if (do_utf8 || UTF) {
2856 /* Either target or the pattern are utf8. */
d6a28714 2857 char *l = locinput;
d07ddd77 2858 char *e = PL_regeol;
bc517b45 2859
eb160463 2860 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2861 l, &e, 0, do_utf8)) {
5486206c
JH
2862 /* One more case for the sharp s:
2863 * pack("U0U*", 0xDF) =~ /ss/i,
2864 * the 0xC3 0x9F are the UTF-8
2865 * byte sequence for the U+00DF. */
2866 if (!(do_utf8 &&
2867 toLOWER(s[0]) == 's' &&
2868 ln >= 2 &&
2869 toLOWER(s[1]) == 's' &&
2870 (U8)l[0] == 0xC3 &&
2871 e - l >= 2 &&
2872 (U8)l[1] == 0x9F))
2873 sayNO;
2874 }
d07ddd77
JH
2875 locinput = e;
2876 nextchr = UCHARAT(locinput);
2877 break;
a0ed51b3 2878 }
d6a28714 2879
bc517b45
JH
2880 /* Neither the target and the pattern are utf8. */
2881
d6a28714
JH
2882 /* Inline the first character, for speed. */
2883 if (UCHARAT(s) != nextchr &&
2884 UCHARAT(s) != ((OP(scan) == EXACTF)
2885 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2886 sayNO;
d6a28714 2887 if (PL_regeol - locinput < ln)
b8c5462f 2888 sayNO;
d6a28714
JH
2889 if (ln > 1 && (OP(scan) == EXACTF
2890 ? ibcmp(s, locinput, ln)
2891 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2892 sayNO;
d6a28714
JH
2893 locinput += ln;
2894 nextchr = UCHARAT(locinput);
a0d0e21e 2895 break;
d6a28714 2896 case ANYOF:
ffc61ed2 2897 if (do_utf8) {
9e55ce06
JH
2898 STRLEN inclasslen = PL_regeol - locinput;
2899
ba7b4546 2900 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2901 sayNO_ANYOF;
ffc61ed2
JH
2902 if (locinput >= PL_regeol)
2903 sayNO;
0f0076b4 2904 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2905 nextchr = UCHARAT(locinput);
e0f9d4a8 2906 break;
ffc61ed2
JH
2907 }
2908 else {
2909 if (nextchr < 0)
2910 nextchr = UCHARAT(locinput);
7d3e948e 2911 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2912 sayNO_ANYOF;
ffc61ed2
JH
2913 if (!nextchr && locinput >= PL_regeol)
2914 sayNO;
2915 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2916 break;
2917 }
2918 no_anyof:
2919 /* If we might have the case of the German sharp s
2920 * in a casefolding Unicode character class. */
2921
ebc501f0
JH
2922 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2923 locinput += SHARP_S_SKIP;
e0f9d4a8 2924 nextchr = UCHARAT(locinput);
ffc61ed2 2925 }
e0f9d4a8
JH
2926 else
2927 sayNO;
b8c5462f 2928 break;
d6a28714 2929 case ALNUML:
b8c5462f
JH
2930 PL_reg_flags |= RF_tainted;
2931 /* FALL THROUGH */
d6a28714 2932 case ALNUM:
b8c5462f 2933 if (!nextchr)
4633a7c4 2934 sayNO;
ffc61ed2 2935 if (do_utf8) {
1a4fad37 2936 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2937 if (!(OP(scan) == ALNUM
3568d838 2938 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2939 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2940 {
2941 sayNO;
a0ed51b3 2942 }
b8c5462f 2943 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2944 nextchr = UCHARAT(locinput);
2945 break;
2946 }
ffc61ed2 2947 if (!(OP(scan) == ALNUM
d6a28714 2948 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2949 sayNO;
b8c5462f 2950 nextchr = UCHARAT(++locinput);
a0d0e21e 2951 break;
d6a28714 2952 case NALNUML:
b8c5462f
JH
2953 PL_reg_flags |= RF_tainted;
2954 /* FALL THROUGH */
d6a28714
JH
2955 case NALNUM:
2956 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2957 sayNO;
ffc61ed2 2958 if (do_utf8) {
1a4fad37 2959 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2960 if (OP(scan) == NALNUM
3568d838 2961 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2962 : isALNUM_LC_utf8((U8*)locinput))
2963 {
b8c5462f 2964 sayNO;
d6a28714 2965 }
b8c5462f
JH
2966 locinput += PL_utf8skip[nextchr];
2967 nextchr = UCHARAT(locinput);
2968 break;
2969 }
ffc61ed2 2970 if (OP(scan) == NALNUM
d6a28714 2971 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2972 sayNO;
76e3520e 2973 nextchr = UCHARAT(++locinput);
a0d0e21e 2974 break;
d6a28714
JH
2975 case BOUNDL:
2976 case NBOUNDL:
3280af22 2977 PL_reg_flags |= RF_tainted;
bbce6d69 2978 /* FALL THROUGH */
d6a28714
JH
2979 case BOUND:
2980 case NBOUND:
2981 /* was last char in word? */
ffc61ed2 2982 if (do_utf8) {
12d33761
HS
2983 if (locinput == PL_bostr)
2984 ln = '\n';
ffc61ed2 2985 else {
a3b680e6 2986 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2987
b4f7163a 2988 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2989 }
2990 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2991 ln = isALNUM_uni(ln);
1a4fad37 2992 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2993 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2994 }
2995 else {
9041c2e3 2996 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2997 n = isALNUM_LC_utf8((U8*)locinput);
2998 }
a0ed51b3 2999 }
d6a28714 3000 else {
12d33761
HS
3001 ln = (locinput != PL_bostr) ?
3002 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
3003 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3004 ln = isALNUM(ln);
3005 n = isALNUM(nextchr);
3006 }
3007 else {
3008 ln = isALNUM_LC(ln);
3009 n = isALNUM_LC(nextchr);
3010 }
d6a28714 3011 }
ffc61ed2
JH
3012 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3013 OP(scan) == BOUNDL))
3014 sayNO;
a0ed51b3 3015 break;
d6a28714 3016 case SPACEL:
3280af22 3017 PL_reg_flags |= RF_tainted;
bbce6d69 3018 /* FALL THROUGH */
d6a28714 3019 case SPACE:
9442cb0e 3020 if (!nextchr)
4633a7c4 3021 sayNO;
1aa99e6b 3022 if (do_utf8) {
fd400ab9 3023 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3024 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3025 if (!(OP(scan) == SPACE
3568d838 3026 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3027 : isSPACE_LC_utf8((U8*)locinput)))
3028 {
3029 sayNO;
3030 }
3031 locinput += PL_utf8skip[nextchr];
3032 nextchr = UCHARAT(locinput);
3033 break;
d6a28714 3034 }
ffc61ed2
JH
3035 if (!(OP(scan) == SPACE
3036 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3037 sayNO;
3038 nextchr = UCHARAT(++locinput);
3039 }
3040 else {
3041 if (!(OP(scan) == SPACE
3042 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3043 sayNO;
3044 nextchr = UCHARAT(++locinput);
a0ed51b3 3045 }
a0ed51b3 3046 break;
d6a28714 3047 case NSPACEL:
3280af22 3048 PL_reg_flags |= RF_tainted;
bbce6d69 3049 /* FALL THROUGH */
d6a28714 3050 case NSPACE:
9442cb0e 3051 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3052 sayNO;
1aa99e6b 3053 if (do_utf8) {
1a4fad37 3054 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3055 if (OP(scan) == NSPACE
3568d838 3056 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3057 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3058 {
3059 sayNO;
3060 }
3061 locinput += PL_utf8skip[nextchr];
3062 nextchr = UCHARAT(locinput);
3063 break;
a0ed51b3 3064 }
ffc61ed2 3065 if (OP(scan) == NSPACE
d6a28714 3066 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3067 sayNO;
76e3520e 3068 nextchr = UCHARAT(++locinput);
a0d0e21e 3069 break;
d6a28714 3070 case DIGITL:
a0ed51b3
LW
3071 PL_reg_flags |= RF_tainted;
3072 /* FALL THROUGH */
d6a28714 3073 case DIGIT:
9442cb0e 3074 if (!nextchr)
a0ed51b3 3075 sayNO;
1aa99e6b 3076 if (do_utf8) {
1a4fad37 3077 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3078 if (!(OP(scan) == DIGIT
3568d838 3079 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3080 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3081 {
a0ed51b3 3082 sayNO;
dfe13c55 3083 }
6f06b55f 3084 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3085 nextchr = UCHARAT(locinput);
3086 break;
3087 }
ffc61ed2 3088 if (!(OP(scan) == DIGIT
9442cb0e 3089 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3090 sayNO;
3091 nextchr = UCHARAT(++locinput);
3092 break;
d6a28714 3093 case NDIGITL:
b8c5462f
JH
3094 PL_reg_flags |= RF_tainted;
3095 /* FALL THROUGH */
d6a28714 3096 case NDIGIT:
9442cb0e 3097 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3098 sayNO;
1aa99e6b 3099 if (do_utf8) {
1a4fad37 3100 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3101 if (OP(scan) == NDIGIT
3568d838 3102 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3103 : isDIGIT_LC_utf8((U8*)locinput))
3104 {
a0ed51b3 3105 sayNO;
9442cb0e 3106 }
6f06b55f 3107 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3108 nextchr = UCHARAT(locinput);
3109 break;
3110 }
ffc61ed2 3111 if (OP(scan) == NDIGIT
9442cb0e 3112 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3113 sayNO;
3114 nextchr = UCHARAT(++locinput);
3115 break;
3116 case CLUMP:
b7c83a7e 3117 if (locinput >= PL_regeol)
a0ed51b3 3118 sayNO;
b7c83a7e 3119 if (do_utf8) {
1a4fad37 3120 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3121 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3122 sayNO;
3123 locinput += PL_utf8skip[nextchr];
3124 while (locinput < PL_regeol &&
3125 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3126 locinput += UTF8SKIP(locinput);
3127 if (locinput > PL_regeol)
3128 sayNO;
eb08e2da
JH
3129 }
3130 else
3131 locinput++;
a0ed51b3
LW
3132 nextchr = UCHARAT(locinput);
3133 break;
c8756f30 3134 case REFFL:
3280af22 3135 PL_reg_flags |= RF_tainted;
c8756f30 3136 /* FALL THROUGH */
c277df42 3137 case REF:
c8756f30 3138 case REFF:
c277df42 3139 n = ARG(scan); /* which paren pair */
cf93c79d 3140 ln = PL_regstartp[n];
2c2d71f5 3141 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3142 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3143 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3144 if (ln == PL_regendp[n])
a0d0e21e 3145 break;
a0ed51b3 3146
cf93c79d 3147 s = PL_bostr + ln;
1aa99e6b 3148 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3149 char *l = locinput;
a3b680e6 3150 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3151 /*
3152 * Note that we can't do the "other character" lookup trick as
3153 * in the 8-bit case (no pun intended) because in Unicode we
3154 * have to map both upper and title case to lower case.
3155 */
3156 if (OP(scan) == REFF) {
3157 while (s < e) {
a3b680e6
AL
3158 STRLEN ulen1, ulen2;
3159 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3160 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3161
a0ed51b3
LW
3162 if (l >= PL_regeol)
3163 sayNO;
a2a2844f
JH
3164 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3165 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3166 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3167 sayNO;
a2a2844f
JH
3168 s += ulen1;
3169 l += ulen2;
a0ed51b3
LW
3170 }
3171 }
3172 locinput = l;
3173 nextchr = UCHARAT(locinput);
3174 break;
3175 }
3176
a0d0e21e 3177 /* Inline the first character, for speed. */
76e3520e 3178 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3179 (OP(scan) == REF ||
3180 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3181 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3182 sayNO;
cf93c79d 3183 ln = PL_regendp[n] - ln;
3280af22 3184 if (locinput + ln > PL_regeol)
4633a7c4 3185 sayNO;
c8756f30
AK
3186 if (ln > 1 && (OP(scan) == REF
3187 ? memNE(s, locinput, ln)
3188 : (OP(scan) == REFF
3189 ? ibcmp(s, locinput, ln)
3190 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3191 sayNO;
a0d0e21e 3192 locinput += ln;
76e3520e 3193 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3194 break;
3195
3196 case NOTHING:
c277df42 3197 case TAIL:
a0d0e21e
LW
3198 break;
3199 case BACK:
3200 break;
c277df42
IZ
3201 case EVAL:
3202 {
3203 dSP;
533c011a 3204 OP_4tree *oop = PL_op;
3280af22 3205 COP *ocurcop = PL_curcop;
f3548bdc 3206 PAD *old_comppad;
c277df42 3207 SV *ret;
080c2dec 3208 struct regexp *oreg = PL_reg_re;
9041c2e3 3209
c277df42 3210 n = ARG(scan);
533c011a 3211 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3212 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3213 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3214 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3215
8e5e9ebe
RGS
3216 {
3217 SV **before = SP;
3218 CALLRUNOPS(aTHX); /* Scalar context. */
3219 SPAGAIN;
3220 if (SP == before)
075aa684 3221 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3222 else {
3223 ret = POPs;
3224 PUTBACK;
3225 }
3226 }
3227
0f5d15d6 3228 PL_op = oop;
f3548bdc 3229 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3230 PL_curcop = ocurcop;
c277df42 3231 if (logical) {
0f5d15d6
IZ
3232 if (logical == 2) { /* Postponed subexpression. */
3233 regexp *re;
22c35a8c 3234 MAGIC *mg = Null(MAGIC*);
0f5d15d6 3235 re_cc_state state;
0f5d15d6 3236 CHECKPOINT cp, lastcp;
cb50f42d 3237 int toggleutf;
faf82a0b 3238 register SV *sv;
0f5d15d6 3239
faf82a0b
AE
3240 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3241 mg = mg_find(sv, PERL_MAGIC_qr);
3242 else if (SvSMAGICAL(ret)) {
3243 if (SvGMAGICAL(ret))
3244 sv_unmagic(ret, PERL_MAGIC_qr);
3245 else
3246 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3247 }
faf82a0b 3248
0f5d15d6
IZ
3249 if (mg) {
3250 re = (regexp *)mg->mg_obj;
df0003d4 3251 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3252 }
3253 else {
3254 STRLEN len;
83003860 3255 const char *t = SvPV_const(ret, len);
0f5d15d6 3256 PMOP pm;
a3b680e6
AL
3257 char * const oprecomp = PL_regprecomp;
3258 const I32 osize = PL_regsize;
3259 const I32 onpar = PL_regnpar;
0f5d15d6 3260
5fcd1c1b 3261 Zero(&pm, 1, PMOP);
cb50f42d 3262 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3263 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3264 if (!(SvFLAGS(ret)
faf82a0b
AE
3265 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3266 | SVs_GMG)))
14befaf4
DM
3267 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3268 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3269 PL_regprecomp = oprecomp;
3270 PL_regsize = osize;
3271 PL_regnpar = onpar;
3272 }
a3621e74 3273 DEBUG_EXECUTE_r(
9041c2e3 3274 PerlIO_printf(Perl_debug_log,
a0288114 3275 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3276 PL_colors[0],
3277 re->precomp,
3278 PL_colors[1],
3279 (strlen(re->precomp) > 60 ? "..." : ""))
3280 );
3281 state.node = next;
3282 state.prev = PL_reg_call_cc;
3283 state.cc = PL_regcc;
3284 state.re = PL_reg_re;
3285
2ab05381 3286 PL_regcc = 0;
9041c2e3 3287
0f5d15d6 3288 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3289 REGCP_SET(lastcp);
0f5d15d6
IZ
3290 cache_re(re);
3291 state.ss = PL_savestack_ix;
3292 *PL_reglastparen = 0;
a01268b5 3293 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3294 PL_reg_call_cc = &state;
3295 PL_reginput = locinput;
cb50f42d
YST
3296 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3297 ((re->reganch & ROPT_UTF8) != 0);
3298 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3299
3300 /* XXXX This is too dramatic a measure... */
3301 PL_reg_maxiter = 0;
3302
0f5d15d6 3303 if (regmatch(re->program + 1)) {
2c914db6
IZ
3304 /* Even though we succeeded, we need to restore
3305 global variables, since we may be wrapped inside
3306 SUSPEND, thus the match may be not finished yet. */
3307
3308 /* XXXX Do this only if SUSPENDed? */
3309 PL_reg_call_cc = state.prev;
3310 PL_regcc = state.cc;
3311 PL_reg_re = state.re;
3312 cache_re(PL_reg_re);
cb50f42d 3313 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3314
3315 /* XXXX This is too dramatic a measure... */
3316 PL_reg_maxiter = 0;
3317
3318 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3319 ReREFCNT_dec(re);
3320 regcpblow(cp);
3321 sayYES;
3322 }
0f5d15d6 3323 ReREFCNT_dec(re);
02db2b7b 3324 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3325 regcppop();
3326 PL_reg_call_cc = state.prev;
3327 PL_regcc = state.cc;
3328 PL_reg_re = state.re;
d3790889 3329 cache_re(PL_reg_re);
cb50f42d 3330 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3331
3332 /* XXXX This is too dramatic a measure... */
3333 PL_reg_maxiter = 0;
3334
8e514ae6 3335 logical = 0;
0f5d15d6
IZ
3336 sayNO;
3337 }
c277df42 3338 sw = SvTRUE(ret);
0f5d15d6 3339 logical = 0;
a0ed51b3 3340 }
080c2dec 3341 else {
3280af22 3342 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3343 cache_re(oreg);
3344 }
c277df42
IZ
3345 break;
3346 }
a0d0e21e 3347 case OPEN:
c277df42 3348 n = ARG(scan); /* which paren pair */
3280af22
NIS
3349 PL_reg_start_tmp[n] = locinput;
3350 if (n > PL_regsize)
3351 PL_regsize = n;
a0d0e21e
LW
3352 break;
3353 case CLOSE:
c277df42 3354 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3355 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3356 PL_regendp[n] = locinput - PL_bostr;
eb160463 3357 if (n > (I32)*PL_reglastparen)
3280af22 3358 *PL_reglastparen = n;
a01268b5 3359 *PL_reglastcloseparen = n;
a0d0e21e 3360 break;
c277df42
IZ
3361 case GROUPP:
3362 n = ARG(scan); /* which paren pair */
eb160463 3363 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3364 break;
3365 case IFTHEN:
2c2d71f5 3366 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3367 if (sw)
3368 next = NEXTOPER(NEXTOPER(scan));
3369 else {
3370 next = scan + ARG(scan);
3371 if (OP(next) == IFTHEN) /* Fake one. */
3372 next = NEXTOPER(NEXTOPER(next));
3373 }
3374 break;
3375 case LOGICAL:
0f5d15d6 3376 logical = scan->flags;
c277df42 3377 break;
2ab05381
IZ
3378/*******************************************************************
3379 PL_regcc contains infoblock about the innermost (...)* loop, and
3380 a pointer to the next outer infoblock.
3381
3382 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3383
3384 1) After matching X, regnode for CURLYX is processed;
3385
9041c2e3 3386 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3387 regmatch() recursively with the starting point at WHILEM node;
3388
3389 3) Each hit of WHILEM node tries to match A and Z (in the order
3390 depending on the current iteration, min/max of {min,max} and
3391 greediness). The information about where are nodes for "A"
3392 and "Z" is read from the infoblock, as is info on how many times "A"
3393 was already matched, and greediness.
3394
3395 4) After A matches, the same WHILEM node is hit again.
3396
3397 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3398 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3399 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3400 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3401 of the external loop.
3402
3403 Currently present infoblocks form a tree with a stem formed by PL_curcc
3404 and whatever it mentions via ->next, and additional attached trees
3405 corresponding to temporarily unset infoblocks as in "5" above.
3406
9041c2e3 3407 In the following picture infoblocks for outer loop of
2ab05381
IZ
3408 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3409 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3410 infoblocks are drawn below the "reset" infoblock.
3411
3412 In fact in the picture below we do not show failed matches for Z and T
3413 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3414 more obvious *why* one needs to *temporary* unset infoblocks.]
3415
3416 Matched REx position InfoBlocks Comment
3417 (Y(A)*?Z)*?T x
3418 Y(A)*?Z)*?T x <- O
3419 Y (A)*?Z)*?T x <- O
3420 Y A)*?Z)*?T x <- O <- I
3421 YA )*?Z)*?T x <- O <- I
3422 YA A)*?Z)*?T x <- O <- I
3423 YAA )*?Z)*?T x <- O <- I
3424 YAA Z)*?T x <- O # Temporary unset I
3425 I
3426
3427 YAAZ Y(A)*?Z)*?T x <- O
3428 I
3429
3430 YAAZY (A)*?Z)*?T x <- O
3431 I
3432
3433 YAAZY A)*?Z)*?T x <- O <- I
3434 I
3435
3436 YAAZYA )*?Z)*?T x <- O <- I
3437 I
3438
3439 YAAZYA Z)*?T x <- O # Temporary unset I
3440 I,I
3441
3442 YAAZYAZ )*?T x <- O
3443 I,I
3444
3445 YAAZYAZ T x # Temporary unset O
3446 O
3447 I,I
3448
3449 YAAZYAZT x
3450 O
3451 I,I
3452 *******************************************************************/
a0d0e21e
LW
3453 case CURLYX: {
3454 CURCUR cc;
3280af22 3455 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3456 /* No need to save/restore up to this paren */
3457 I32 parenfloor = scan->flags;
c277df42
IZ
3458
3459 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3460 next += ARG(next);
3280af22
NIS
3461 cc.oldcc = PL_regcc;
3462 PL_regcc = &cc;
cb434fcc
IZ
3463 /* XXXX Probably it is better to teach regpush to support
3464 parenfloor > PL_regsize... */
eb160463 3465 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3466 parenfloor = *PL_reglastparen; /* Pessimization... */
3467 cc.parenfloor = parenfloor;
a0d0e21e
LW
3468 cc.cur = -1;
3469 cc.min = ARG1(scan);
3470 cc.max = ARG2(scan);
c277df42 3471 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3472 cc.next = next;
3473 cc.minmod = minmod;
3474 cc.lastloc = 0;
3280af22 3475 PL_reginput = locinput;
a0d0e21e
LW
3476 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3477 regcpblow(cp);
3280af22 3478 PL_regcc = cc.oldcc;
4633a7c4 3479 saySAME(n);
a0d0e21e
LW
3480 }
3481 /* NOT REACHED */
3482 case WHILEM: {
3483 /*
3484 * This is really hard to understand, because after we match
3485 * what we're trying to match, we must make sure the rest of
2c2d71f5 3486 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3487 * to go back UP the parse tree by recursing ever deeper. And
3488 * if it fails, we have to reset our parent's current state
3489 * that we can try again after backing off.
3490 */
3491
c277df42 3492 CHECKPOINT cp, lastcp;
3280af22 3493 CURCUR* cc = PL_regcc;
c277df42 3494 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3495 I32 cache_offset = 0, cache_bit = 0;
c277df42 3496
4633a7c4 3497 n = cc->cur + 1; /* how many we know we matched */
3280af22 3498 PL_reginput = locinput;
a0d0e21e 3499
a3621e74 3500 DEBUG_EXECUTE_r(
9041c2e3 3501 PerlIO_printf(Perl_debug_log,
91f3b821 3502 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3503 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3504 (long)n, (long)cc->min,
2797576d 3505 (long)cc->max, PTR2UV(cc))
c277df42 3506 );
4633a7c4 3507
a0d0e21e
LW
3508 /* If degenerate scan matches "", assume scan done. */
3509
579cf2c3 3510 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3511 PL_regcc = cc->oldcc;
2ab05381
IZ
3512 if (PL_regcc)
3513 ln = PL_regcc->cur;
a3621e74 3514 DEBUG_EXECUTE_r(
c3464db5
DD
3515 PerlIO_printf(Perl_debug_log,
3516 "%*s empty match detected, try continuation...\n",
3280af22 3517 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3518 );
a0d0e21e 3519 if (regmatch(cc->next))
4633a7c4 3520 sayYES;
2ab05381
IZ
3521 if (PL_regcc)
3522 PL_regcc->cur = ln;
3280af22 3523 PL_regcc = cc;
4633a7c4 3524 sayNO;
a0d0e21e
LW
3525 }
3526
3527 /* First just match a string of min scans. */
3528
3529 if (n < cc->min) {
3530 cc->cur = n;
3531 cc->lastloc = locinput;
4633a7c4
LW
3532 if (regmatch(cc->scan))
3533 sayYES;
3534 cc->cur = n - 1;
c277df42 3535 cc->lastloc = lastloc;
4633a7c4 3536 sayNO;
a0d0e21e
LW
3537 }
3538
2c2d71f5
JH
3539 if (scan->flags) {
3540 /* Check whether we already were at this position.
3541 Postpone detection until we know the match is not
3542 *that* much linear. */
3543 if (!PL_reg_maxiter) {
3544 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3545 PL_reg_leftiter = PL_reg_maxiter;
3546 }
3547 if (PL_reg_leftiter-- == 0) {
a3b680e6 3548 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3549 if (PL_reg_poscache) {
eb160463 3550 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3551 Renew(PL_reg_poscache, size, char);
3552 PL_reg_poscache_size = size;
3553 }
3554 Zero(PL_reg_poscache, size, char);
3555 }
3556 else {
3557 PL_reg_poscache_size = size;
a02a5408 3558 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3559 }
a3621e74 3560 DEBUG_EXECUTE_r(
2c2d71f5
JH
3561 PerlIO_printf(Perl_debug_log,
3562 "%sDetected a super-linear match, switching on caching%s...\n",
3563 PL_colors[4], PL_colors[5])
3564 );
3565 }
3566 if (PL_reg_leftiter < 0) {
3ab3c9b4 3567 cache_offset = locinput - PL_bostr;
2c2d71f5 3568
3ab3c9b4
HS
3569 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3570 + cache_offset * (scan->flags>>4);
3571 cache_bit = cache_offset % 8;
3572 cache_offset /= 8;
3573 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
a3621e74 3574 DEBUG_EXECUTE_r(
2c2d71f5
JH
3575 PerlIO_printf(Perl_debug_log,
3576 "%*s already tried at this position...\n",
3577 REPORT_CODE_OFF+PL_regindent*2, "")
3578 );
3ab3c9b4
HS
3579 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3580 /* cache records success */
c2b0868c
HS
3581 sayYES;
3582 else
3ab3c9b4 3583 /* cache records failure */
c2b0868c 3584 sayNO_SILENT;
2c2d71f5 3585 }
3ab3c9b4 3586 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
2c2d71f5
JH
3587 }
3588 }
3589
a0d0e21e
LW
3590 /* Prefer next over scan for minimal matching. */
3591
3592 if (cc->minmod) {
3280af22 3593 PL_regcc = cc->oldcc;
2ab05381
IZ
3594 if (PL_regcc)
3595 ln = PL_regcc->cur;
5f05dabc 3596 cp = regcppush(cc->parenfloor);
02db2b7b 3597 REGCP_SET(lastcp);
5f05dabc 3598 if (regmatch(cc->next)) {
c277df42 3599 regcpblow(cp);
3ab3c9b4 3600 CACHEsayYES; /* All done. */
5f05dabc 3601 }
02db2b7b 3602 REGCP_UNWIND(lastcp);
5f05dabc 3603 regcppop();
2ab05381
IZ
3604 if (PL_regcc)
3605 PL_regcc->cur = ln;
3280af22 3606 PL_regcc = cc;
a0d0e21e 3607
c277df42 3608 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3609 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3610 && !(PL_reg_flags & RF_warned)) {
3611 PL_reg_flags |= RF_warned;
9014280d 3612 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3613 "Complex regular subexpression recursion",
3614 REG_INFTY - 1);
c277df42 3615 }
3ab3c9b4 3616 CACHEsayNO;
c277df42 3617 }
a687059c 3618
a3621e74 3619 DEBUG_EXECUTE_r(
c3464db5
DD
3620 PerlIO_printf(Perl_debug_log,
3621 "%*s trying longer...\n",
3280af22 3622 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3623 );
a0d0e21e 3624 /* Try scanning more and see if it helps. */
3280af22 3625 PL_reginput = locinput;
a0d0e21e
LW
3626 cc->cur = n;
3627 cc->lastloc = locinput;
5f05dabc 3628 cp = regcppush(cc->parenfloor);
02db2b7b 3629 REGCP_SET(lastcp);
5f05dabc 3630 if (regmatch(cc->scan)) {
c277df42 3631 regcpblow(cp);
3ab3c9b4 3632 CACHEsayYES;
5f05dabc 3633 }
02db2b7b 3634 REGCP_UNWIND(lastcp);
5f05dabc 3635 regcppop();
4633a7c4 3636 cc->cur = n - 1;
c277df42 3637 cc->lastloc = lastloc;
3ab3c9b4 3638 CACHEsayNO;
a0d0e21e
LW
3639 }
3640
3641 /* Prefer scan over next for maximal matching. */
3642
3643 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3644 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3645 cc->cur = n;
3646 cc->lastloc = locinput;
02db2b7b 3647 REGCP_SET(lastcp);
5f05dabc 3648 if (regmatch(cc->scan)) {
c277df42 3649 regcpblow(cp);
3ab3c9b4 3650 CACHEsayYES;
5f05dabc 3651 }
02db2b7b 3652 REGCP_UNWIND(lastcp);
a0d0e21e 3653 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3654 PL_reginput = locinput;
a3621e74 3655 DEBUG_EXECUTE_r(
c3464db5
DD
3656 PerlIO_printf(Perl_debug_log,
3657 "%*s failed, try continuation...\n",
3280af22 3658 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3659 );
3660 }
9041c2e3 3661 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3662 && !(PL_reg_flags & RF_warned)) {
3280af22 3663 PL_reg_flags |= RF_warned;
9014280d 3664 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3665 "Complex regular subexpression recursion",
3666 REG_INFTY - 1);
a0d0e21e
LW
3667 }
3668
3669 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3670 PL_regcc = cc->oldcc;
2ab05381
IZ
3671 if (PL_regcc)
3672 ln = PL_regcc->cur;
a0d0e21e 3673 if (regmatch(cc->next))
3ab3c9b4 3674 CACHEsayYES;
2ab05381
IZ
3675 if (PL_regcc)
3676 PL_regcc->cur = ln;
3280af22 3677 PL_regcc = cc;
4633a7c4 3678 cc->cur = n - 1;
c277df42 3679 cc->lastloc = lastloc;
3ab3c9b4 3680 CACHEsayNO;
a0d0e21e
LW
3681 }
3682 /* NOT REACHED */
9041c2e3 3683 case BRANCHJ:
c277df42
IZ
3684 next = scan + ARG(scan);
3685 if (next == scan)
3686 next = NULL;
3687 inner = NEXTOPER(NEXTOPER(scan));
3688 goto do_branch;
9041c2e3 3689 case BRANCH:
c277df42
IZ
3690 inner = NEXTOPER(scan);
3691 do_branch:
3692 {
c277df42
IZ
3693 c1 = OP(scan);
3694 if (OP(next) != c1) /* No choice. */
3695 next = inner; /* Avoid recursion. */
a0d0e21e 3696 else {
a3b680e6 3697 const I32 lastparen = *PL_reglastparen;
02db2b7b
IZ
3698 I32 unwind1;
3699 re_unwind_branch_t *uw;
3700
3701 /* Put unwinding data on stack */
3702 unwind1 = SSNEWt(1,re_unwind_branch_t);
3703 uw = SSPTRt(unwind1,re_unwind_branch_t);
3704 uw->prev = unwind;
3705 unwind = unwind1;
3706 uw->type = ((c1 == BRANCH)
3707 ? RE_UNWIND_BRANCH
3708 : RE_UNWIND_BRANCHJ);
3709 uw->lastparen = lastparen;
3710 uw->next = next;
3711 uw->locinput = locinput;
3712 uw->nextchr = nextchr;
3713#ifdef DEBUGGING
3714 uw->regindent = ++PL_regindent;
3715#endif
c277df42 3716
02db2b7b
IZ
3717 REGCP_SET(uw->lastcp);
3718
3719 /* Now go into the first branch */
3720 next = inner;
a687059c 3721 }
a0d0e21e
LW
3722 }
3723 break;
3724 case MINMOD:
3725 minmod = 1;
3726 break;
c277df42
IZ
3727 case CURLYM:
3728 {
00db4c45 3729 I32 l = 0;
c277df42 3730 CHECKPOINT lastcp;
9041c2e3 3731
c277df42 3732 /* We suppose that the next guy does not need
0e788c72 3733 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3734 and has no parenths to influence future backrefs. */
3735 ln = ARG1(scan); /* min to match */
3736 n = ARG2(scan); /* max to match */
c277df42
IZ
3737 paren = scan->flags;
3738 if (paren) {
3280af22
NIS
3739 if (paren > PL_regsize)
3740 PL_regsize = paren;
eb160463 3741 if (paren > (I32)*PL_reglastparen)
3280af22 3742 *PL_reglastparen = paren;
c277df42 3743 }
dc45a647 3744 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3745 if (paren)
3746 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3747 PL_reginput = locinput;
c277df42
IZ
3748 if (minmod) {
3749 minmod = 0;
3750 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3751 sayNO;
3280af22 3752 locinput = PL_reginput;
cca55fe3 3753 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3754 regnode *text_node = next;
3755
cca55fe3 3756 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3757
cca55fe3 3758 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3759 else {
cca55fe3 3760 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3761 c1 = c2 = -1000;
3762 goto assume_ok_MM;
cca55fe3
JP
3763 }
3764 else { c1 = (U8)*STRING(text_node); }
af5decee 3765 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3766 c2 = PL_fold[c1];
af5decee 3767 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3768 c2 = PL_fold_locale[c1];
3769 else
3770 c2 = c1;
3771 }
a0ed51b3
LW
3772 }
3773 else
c277df42 3774 c1 = c2 = -1000;
cca55fe3 3775 assume_ok_MM:
02db2b7b 3776 REGCP_SET(lastcp);
0e788c72 3777 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3778 /* If it could work, try it. */
3779 if (c1 == -1000 ||
3280af22
NIS
3780 UCHARAT(PL_reginput) == c1 ||
3781 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3782 {
3783 if (paren) {
f31a99c8 3784 if (ln) {
cf93c79d
IZ
3785 PL_regstartp[paren] =
3786 HOPc(PL_reginput, -l) - PL_bostr;
3787 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3788 }
3789 else
cf93c79d 3790 PL_regendp[paren] = -1;
c277df42
IZ
3791 }
3792 if (regmatch(next))
3793 sayYES;
02db2b7b 3794 REGCP_UNWIND(lastcp);
c277df42
IZ
3795 }
3796 /* Couldn't or didn't -- move forward. */
3280af22 3797 PL_reginput = locinput;
c277df42
IZ
3798 if (regrepeat_hard(scan, 1, &l)) {
3799 ln++;
3280af22 3800 locinput = PL_reginput;
c277df42
IZ
3801 }
3802 else
3803 sayNO;
3804 }
a0ed51b3
LW
3805 }
3806 else {
c277df42 3807 n = regrepeat_hard(scan, n, &l);
3280af22 3808 locinput = PL_reginput;
a3621e74 3809 DEBUG_EXECUTE_r(
5c0ca799 3810 PerlIO_printf(Perl_debug_log,
faccc32b 3811 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3812 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3813 (IV) n, (IV)l)
c277df42
IZ
3814 );
3815 if (n >= ln) {
cca55fe3 3816 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3817 regnode *text_node = next;
3818
cca55fe3 3819 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3820
cca55fe3 3821 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3822 else {
cca55fe3 3823 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3824 c1 = c2 = -1000;
3825 goto assume_ok_REG;
cca55fe3
JP
3826 }
3827 else { c1 = (U8)*STRING(text_node); }
3828
af5decee 3829 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3830 c2 = PL_fold[c1];
af5decee 3831 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3832 c2 = PL_fold_locale[c1];
3833 else
3834 c2 = c1;
3835 }
a0ed51b3
LW
3836 }
3837 else
c277df42
IZ
3838 c1 = c2 = -1000;
3839 }
cca55fe3 3840 assume_ok_REG:
02db2b7b 3841 REGCP_SET(lastcp);
c277df42
IZ
3842 while (n >= ln) {
3843 /* If it could work, try it. */
3844 if (c1 == -1000 ||
3280af22
NIS
3845 UCHARAT(PL_reginput) == c1 ||
3846 UCHARAT(PL_reginput) == c2)
a0ed51b3 3847 {
a3621e74 3848 DEBUG_EXECUTE_r(
c3464db5 3849 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3850 "%*s trying tail with n=%"IVdf"...\n",
3851 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3852 );
3853 if (paren) {
3854 if (n) {
cf93c79d
IZ
3855 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3856 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3857 }
a0ed51b3 3858 else
cf93c79d 3859 PL_regendp[paren] = -1;
c277df42 3860 }
a0ed51b3
LW
3861 if (regmatch(next))
3862 sayYES;
02db2b7b 3863 REGCP_UNWIND(lastcp);
a0ed51b3 3864 }
c277df42
IZ
3865 /* Couldn't or didn't -- back up. */
3866 n--;
dfe13c55 3867 locinput = HOPc(locinput, -l);
3280af22 3868 PL_reginput = locinput;
c277df42
IZ
3869 }
3870 }
3871 sayNO;
3872 break;
3873 }
3874 case CURLYN:
3875 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3876 if (paren > PL_regsize)
3877 PL_regsize = paren;
eb160463 3878 if (paren > (I32)*PL_reglastparen)
3280af22 3879 *PL_reglastparen = paren;
c277df42
IZ
3880 ln = ARG1(scan); /* min to match */
3881 n = ARG2(scan); /* max to match */
dc45a647 3882 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3883 goto repeat;
a0d0e21e 3884 case CURLY:
c277df42 3885 paren = 0;
a0d0e21e
LW
3886 ln = ARG1(scan); /* min to match */
3887 n = ARG2(scan); /* max to match */
dc45a647 3888 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3889 goto repeat;
3890 case STAR:
3891 ln = 0;
c277df42 3892 n = REG_INFTY;
a0d0e21e 3893 scan = NEXTOPER(scan);
c277df42 3894 paren = 0;
a0d0e21e
LW
3895 goto repeat;
3896 case PLUS:
c277df42
IZ
3897 ln = 1;
3898 n = REG_INFTY;
3899 scan = NEXTOPER(scan);
3900 paren = 0;
3901 repeat:
a0d0e21e
LW
3902 /*
3903 * Lookahead to avoid useless match attempts
3904 * when we know what character comes next.
3905 */
5f80c4cf
JP
3906
3907 /*
3908 * Used to only do .*x and .*?x, but now it allows
3909 * for )'s, ('s and (?{ ... })'s to be in the way
3910 * of the quantifier and the EXACT-like node. -- japhy
3911 */
3912
cca55fe3 3913 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3914 U8 *s;
3915 regnode *text_node = next;
3916
cca55fe3 3917 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3918
cca55fe3 3919 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3920 else {
cca55fe3 3921 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3922 c1 = c2 = -1000;
3923 goto assume_ok_easy;
cca55fe3
JP
3924 }
3925 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3926
3927 if (!UTF) {
3928 c2 = c1 = *s;
f65d3ee7 3929 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3930 c2 = PL_fold[c1];
f65d3ee7 3931 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3932 c2 = PL_fold_locale[c1];
1aa99e6b 3933 }
5f80c4cf 3934 else { /* UTF */
f65d3ee7 3935 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3936 STRLEN ulen1, ulen2;
89ebb4a3
JH
3937 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3938 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3939
3940 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3941 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3942
89ebb4a3 3943 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
872c91ae
JH
3944 ckWARN(WARN_UTF8) ?
3945 0 : UTF8_ALLOW_ANY);
89ebb4a3 3946 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
872c91ae
JH
3947 ckWARN(WARN_UTF8) ?
3948 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3949 }
3950 else {
89ebb4a3 3951 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
872c91ae
JH
3952 ckWARN(WARN_UTF8) ?
3953 0 : UTF8_ALLOW_ANY);
5f80c4cf 3954 }
1aa99e6b
IH
3955 }
3956 }
bbce6d69 3957 }
a0d0e21e 3958 else
bbce6d69 3959 c1 = c2 = -1000;
cca55fe3 3960 assume_ok_easy:
3280af22 3961 PL_reginput = locinput;
a0d0e21e 3962 if (minmod) {
c277df42 3963 CHECKPOINT lastcp;
a0d0e21e
LW
3964 minmod = 0;
3965 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3966 sayNO;
a0ed51b3 3967 locinput = PL_reginput;
02db2b7b 3968 REGCP_SET(lastcp);
0fe9bf95 3969 if (c1 != -1000) {
1aa99e6b 3970 char *e; /* Should not check after this */
0fe9bf95 3971 char *old = locinput;
b2f2f093 3972 int count = 0;
0fe9bf95 3973
1aa99e6b 3974 if (n == REG_INFTY) {
0fe9bf95 3975 e = PL_regeol - 1;
1aa99e6b
IH
3976 if (do_utf8)
3977 while (UTF8_IS_CONTINUATION(*(U8*)e))
3978 e--;
3979 }
3980 else if (do_utf8) {
3981 int m = n - ln;
3982 for (e = locinput;
3983 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3984 e += UTF8SKIP(e);
3985 }
3986 else {
3987 e = locinput + n - ln;
3988 if (e >= PL_regeol)
3989 e = PL_regeol - 1;
3990 }
0fe9bf95
IZ
3991 while (1) {
3992 /* Find place 'next' could work */
1aa99e6b
IH
3993 if (!do_utf8) {
3994 if (c1 == c2) {
a8e8ab15
JH
3995 while (locinput <= e &&
3996 UCHARAT(locinput) != c1)
1aa99e6b
IH
3997 locinput++;
3998 } else {
9041c2e3 3999 while (locinput <= e
a8e8ab15
JH
4000 && UCHARAT(locinput) != c1
4001 && UCHARAT(locinput) != c2)
1aa99e6b
IH
4002 locinput++;
4003 }
4004 count = locinput - old;
4005 }
4006 else {
1aa99e6b 4007 if (c1 == c2) {
a3b680e6 4008 STRLEN len;
872c91ae
JH
4009 /* count initialised to
4010 * utf8_distance(old, locinput) */
b2f2f093 4011 while (locinput <= e &&
872c91ae 4012 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4013 UTF8_MAXBYTES, &len,
872c91ae 4014 ckWARN(WARN_UTF8) ?
eb160463 4015 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 4016 locinput += len;
b2f2f093
JH
4017 count++;
4018 }
1aa99e6b 4019 } else {
a3b680e6 4020 STRLEN len;
872c91ae
JH
4021 /* count initialised to
4022 * utf8_distance(old, locinput) */
b2f2f093 4023 while (locinput <= e) {
872c91ae 4024 UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4025 UTF8_MAXBYTES, &len,
872c91ae
JH
4026 ckWARN(WARN_UTF8) ?
4027 0 : UTF8_ALLOW_ANY);
eb160463 4028 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 4029 break;
b2f2f093
JH
4030 locinput += len;
4031 count++;
1aa99e6b
IH
4032 }
4033 }
0fe9bf95 4034 }
9041c2e3 4035 if (locinput > e)
0fe9bf95
IZ
4036 sayNO;
4037 /* PL_reginput == old now */
4038 if (locinput != old) {
4039 ln = 1; /* Did some */
1aa99e6b 4040 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
4041 sayNO;
4042 }
4043 /* PL_reginput == locinput now */
29d1e993 4044 TRYPAREN(paren, ln, locinput);
0fe9bf95 4045 PL_reginput = locinput; /* Could be reset... */
02db2b7b 4046 REGCP_UNWIND(lastcp);
0fe9bf95 4047 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
4048 old = locinput;
4049 if (do_utf8)
4050 locinput += UTF8SKIP(locinput);
4051 else
4052 locinput++;
b2f2f093 4053 count = 1;
0fe9bf95
IZ
4054 }
4055 }
4056 else
c277df42 4057 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
4058 UV c;
4059 if (c1 != -1000) {
4060 if (do_utf8)
872c91ae 4061 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4062 UTF8_MAXBYTES, 0,
872c91ae
JH
4063 ckWARN(WARN_UTF8) ?
4064 0 : UTF8_ALLOW_ANY);
1aa99e6b 4065 else
9041c2e3 4066 c = UCHARAT(PL_reginput);
2390ecbc 4067 /* If it could work, try it. */
eb160463 4068 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 4069 {
ecc99935 4070 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
4071 REGCP_UNWIND(lastcp);
4072 }
1aa99e6b 4073 }
a0d0e21e 4074 /* If it could work, try it. */
2390ecbc 4075 else if (c1 == -1000)
bbce6d69 4076 {
ecc99935 4077 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 4078 REGCP_UNWIND(lastcp);
bbce6d69 4079 }
c277df42 4080 /* Couldn't or didn't -- move forward. */
a0ed51b3 4081 PL_reginput = locinput;
a0d0e21e
LW
4082 if (regrepeat(scan, 1)) {
4083 ln++;
a0ed51b3
LW
4084 locinput = PL_reginput;
4085 }
4086 else
4633a7c4 4087 sayNO;
a0d0e21e
LW
4088 }
4089 }
4090 else {
c277df42 4091 CHECKPOINT lastcp;
a0d0e21e 4092 n = regrepeat(scan, n);
a0ed51b3 4093 locinput = PL_reginput;
22c35a8c 4094 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
7fba1cd6 4095 (OP(next) != MEOL ||
15272685
HS
4096 OP(next) == SEOL || OP(next) == EOS))
4097 {
a0d0e21e 4098 ln = n; /* why back off? */
1aeab75a
GS
4099 /* ...because $ and \Z can match before *and* after
4100 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4101 We should back off by one in this case. */
4102 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4103 ln--;
4104 }
02db2b7b 4105 REGCP_SET(lastcp);
c277df42 4106 if (paren) {
8fa7f367 4107 UV c = 0;
c277df42 4108 while (n >= ln) {
1aa99e6b
IH
4109 if (c1 != -1000) {
4110 if (do_utf8)
872c91ae 4111 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4112 UTF8_MAXBYTES, 0,
872c91ae
JH
4113 ckWARN(WARN_UTF8) ?
4114 0 : UTF8_ALLOW_ANY);
1aa99e6b 4115 else
9041c2e3 4116 c = UCHARAT(PL_reginput);
1aa99e6b 4117 }
c277df42 4118 /* If it could work, try it. */
eb160463 4119 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4120 {
29d1e993 4121 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4122 REGCP_UNWIND(lastcp);
c277df42
IZ
4123 }
4124 /* Couldn't or didn't -- back up. */
4125 n--;
dfe13c55 4126 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 4127 }
a0ed51b3
LW
4128 }
4129 else {
8fa7f367 4130 UV c = 0;
c277df42 4131 while (n >= ln) {
1aa99e6b
IH
4132 if (c1 != -1000) {
4133 if (do_utf8)
872c91ae 4134 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4135 UTF8_MAXBYTES, 0,
872c91ae
JH
4136 ckWARN(WARN_UTF8) ?
4137 0 : UTF8_ALLOW_ANY);
1aa99e6b 4138 else
9041c2e3 4139 c = UCHARAT(PL_reginput);
1aa99e6b 4140 }
c277df42 4141 /* If it could work, try it. */
eb160463 4142 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4143 {
29d1e993 4144 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4145 REGCP_UNWIND(lastcp);
c277df42
IZ
4146 }
4147 /* Couldn't or didn't -- back up. */
4148 n--;
dfe13c55 4149 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4150 }
a0d0e21e
LW
4151 }
4152 }
4633a7c4 4153 sayNO;
c277df42 4154 break;
a0d0e21e 4155 case END:
0f5d15d6
IZ
4156 if (PL_reg_call_cc) {
4157 re_cc_state *cur_call_cc = PL_reg_call_cc;
4158 CURCUR *cctmp = PL_regcc;
4159 regexp *re = PL_reg_re;
4160 CHECKPOINT cp, lastcp;
4161
4162 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 4163 REGCP_SET(lastcp);
0f5d15d6
IZ
4164 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
4165 the caller. */
4166 PL_reginput = locinput; /* Make position available to
4167 the callcc. */
4168 cache_re(PL_reg_call_cc->re);
4169 PL_regcc = PL_reg_call_cc->cc;
4170 PL_reg_call_cc = PL_reg_call_cc->prev;
4171 if (regmatch(cur_call_cc->node)) {
4172 PL_reg_call_cc = cur_call_cc;
4173 regcpblow(cp);
4174 sayYES;
4175 }
02db2b7b 4176 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
4177 regcppop();
4178 PL_reg_call_cc = cur_call_cc;
4179 PL_regcc = cctmp;
4180 PL_reg_re = re;
4181 cache_re(re);
4182
a3621e74 4183 DEBUG_EXECUTE_r(
0f5d15d6
IZ
4184 PerlIO_printf(Perl_debug_log,
4185 "%*s continuation failed...\n",
4186 REPORT_CODE_OFF+PL_regindent*2, "")
4187 );
7821416a 4188 sayNO_SILENT;
0f5d15d6 4189 }
7821416a 4190 if (locinput < PL_regtill) {
a3621e74 4191 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4192 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4193 PL_colors[4],
4194 (long)(locinput - PL_reg_starttry),
4195 (long)(PL_regtill - PL_reg_starttry),
4196 PL_colors[5]));
4197 sayNO_FINAL; /* Cannot match: too short. */
4198 }
4199 PL_reginput = locinput; /* put where regtry can find it */
4200 sayYES_FINAL; /* Success! */
7e5428c5 4201 case SUCCEED:
3280af22 4202 PL_reginput = locinput; /* put where regtry can find it */
7821416a 4203 sayYES_LOUD; /* Success! */
c277df42
IZ
4204 case SUSPEND:
4205 n = 1;
9fe1d20c 4206 PL_reginput = locinput;
9041c2e3 4207 goto do_ifmatch;
a0d0e21e 4208 case UNLESSM:
c277df42 4209 n = 0;
a0ed51b3 4210 if (scan->flags) {
efb30f32
HS
4211 s = HOPBACKc(locinput, scan->flags);
4212 if (!s)
4213 goto say_yes;
4214 PL_reginput = s;
a0ed51b3
LW
4215 }
4216 else
4217 PL_reginput = locinput;
c277df42
IZ
4218 goto do_ifmatch;
4219 case IFMATCH:
4220 n = 1;
a0ed51b3 4221 if (scan->flags) {
efb30f32
HS
4222 s = HOPBACKc(locinput, scan->flags);
4223 if (!s)
4224 goto say_no;
4225 PL_reginput = s;
a0ed51b3
LW
4226 }
4227 else
4228 PL_reginput = locinput;
4229
c277df42 4230 do_ifmatch:
c277df42
IZ
4231 inner = NEXTOPER(NEXTOPER(scan));
4232 if (regmatch(inner) != n) {
4233 say_no:
4234 if (logical) {
4235 logical = 0;
4236 sw = 0;
4237 goto do_longjump;
a0ed51b3
LW
4238 }
4239 else
c277df42
IZ
4240 sayNO;
4241 }
4242 say_yes:
4243 if (logical) {
4244 logical = 0;
4245 sw = 1;
4246 }
fe44a5e8 4247 if (OP(scan) == SUSPEND) {
3280af22 4248 locinput = PL_reginput;
565764a8 4249 nextchr = UCHARAT(locinput);
fe44a5e8 4250 }
c277df42
IZ
4251 /* FALL THROUGH. */
4252 case LONGJMP:
4253 do_longjump:
4254 next = scan + ARG(scan);
4255 if (next == scan)
4256 next = NULL;
a0d0e21e
LW
4257 break;
4258 default:
b900a521 4259 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4260 PTR2UV(scan), OP(scan));
cea2e8a9 4261 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4262 }
02db2b7b 4263 reenter:
a0d0e21e
LW
4264 scan = next;
4265 }
a687059c 4266
a0d0e21e
LW
4267 /*
4268 * We get here only if there's trouble -- normally "case END" is
4269 * the terminating point.
4270 */
cea2e8a9 4271 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4272 /*NOTREACHED*/
4633a7c4
LW
4273 sayNO;
4274
7821416a 4275yes_loud:
a3621e74 4276 DEBUG_EXECUTE_r(
7821416a
IZ
4277 PerlIO_printf(Perl_debug_log,
4278 "%*s %scould match...%s\n",
e4584336 4279 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4280 );
4281 goto yes;
4282yes_final:
a3621e74 4283 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4284 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4285yes:
4286#ifdef DEBUGGING
3280af22 4287 PL_regindent--;
4633a7c4 4288#endif
02db2b7b
IZ
4289
4290#if 0 /* Breaks $^R */
4291 if (unwind)
4292 regcpblow(firstcp);
4293#endif
4633a7c4
LW
4294 return 1;
4295
4296no:
a3621e74 4297 DEBUG_EXECUTE_r(
7821416a
IZ
4298 PerlIO_printf(Perl_debug_log,
4299 "%*s %sfailed...%s\n",
e4584336 4300 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4301 );
4302 goto do_no;
4303no_final:
4304do_no:
02db2b7b
IZ
4305 if (unwind) {
4306 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
4307
4308 switch (uw->type) {
4309 case RE_UNWIND_BRANCH:
4310 case RE_UNWIND_BRANCHJ:
4311 {
4312 re_unwind_branch_t *uwb = &(uw->branch);
a3b680e6 4313 const I32 lastparen = uwb->lastparen;
9041c2e3 4314
02db2b7b
IZ
4315 REGCP_UNWIND(uwb->lastcp);
4316 for (n = *PL_reglastparen; n > lastparen; n--)
4317 PL_regendp[n] = -1;
4318 *PL_reglastparen = n;
4319 scan = next = uwb->next;
9041c2e3
NIS
4320 if ( !scan ||
4321 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
4322 ? BRANCH : BRANCHJ) ) { /* Failure */
4323 unwind = uwb->prev;
4324#ifdef DEBUGGING
4325 PL_regindent--;
4326#endif
4327 goto do_no;
4328 }
4329 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4330 if ((n = (uwb->type == RE_UNWIND_BRANCH
4331 ? NEXT_OFF(next) : ARG(next))))
4332 next += n;
4333 else
4334 next = NULL; /* XXXX Needn't unwinding in this case... */
4335 uwb->next = next;
4336 next = NEXTOPER(scan);
4337 if (uwb->type == RE_UNWIND_BRANCHJ)
4338 next = NEXTOPER(next);
4339 locinput = uwb->locinput;
4340 nextchr = uwb->nextchr;
4341#ifdef DEBUGGING
4342 PL_regindent = uwb->regindent;
4343#endif
4344
4345 goto reenter;
4346 }
4347 /* NOT REACHED */
4348 default:
4349 Perl_croak(aTHX_ "regexp unwind memory corruption");
4350 }
4351 /* NOT REACHED */
4352 }
4633a7c4 4353#ifdef DEBUGGING
3280af22 4354 PL_regindent--;
4633a7c4 4355#endif
a0d0e21e 4356 return 0;
a687059c
LW
4357}
4358
4359/*
4360 - regrepeat - repeatedly match something simple, report how many
4361 */
4362/*
4363 * [This routine now assumes that it will only match on things of length 1.
4364 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4365 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4366 */
76e3520e 4367STATIC I32
a3b680e6 4368S_regrepeat(pTHX_ const regnode *p, I32 max)
a687059c 4369{
27da23d5 4370 dVAR;
a0d0e21e 4371 register char *scan;
a0d0e21e 4372 register I32 c;
3280af22 4373 register char *loceol = PL_regeol;
a0ed51b3 4374 register I32 hardcount = 0;
53c4c00c 4375 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4376
3280af22 4377 scan = PL_reginput;
faf11cac
HS
4378 if (max == REG_INFTY)
4379 max = I32_MAX;
4380 else if (max < loceol - scan)
a0d0e21e 4381 loceol = scan + max;
a0d0e21e 4382 switch (OP(p)) {
22c35a8c 4383 case REG_ANY:
1aa99e6b 4384 if (do_utf8) {
ffc61ed2 4385 loceol = PL_regeol;
1aa99e6b 4386 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4387 scan += UTF8SKIP(scan);
4388 hardcount++;
4389 }
4390 } else {
4391 while (scan < loceol && *scan != '\n')
4392 scan++;
a0ed51b3
LW
4393 }
4394 break;
ffc61ed2 4395 case SANY:
def8e4ea
JH
4396 if (do_utf8) {
4397 loceol = PL_regeol;
a0804c9e 4398 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4399 scan += UTF8SKIP(scan);
4400 hardcount++;
4401 }
4402 }
4403 else
4404 scan = loceol;
a0ed51b3 4405 break;
f33976b4
DB
4406 case CANY:
4407 scan = loceol;
4408 break;
090f7165
JH
4409 case EXACT: /* length of string is 1 */
4410 c = (U8)*STRING(p);
4411 while (scan < loceol && UCHARAT(scan) == c)
4412 scan++;
bbce6d69 4413 break;
4414 case EXACTF: /* length of string is 1 */
cd439c50 4415 c = (U8)*STRING(p);
bbce6d69 4416 while (scan < loceol &&
22c35a8c 4417 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4418 scan++;
4419 break;
4420 case EXACTFL: /* length of string is 1 */
3280af22 4421 PL_reg_flags |= RF_tainted;
cd439c50 4422 c = (U8)*STRING(p);
bbce6d69 4423 while (scan < loceol &&
22c35a8c 4424 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4425 scan++;
4426 break;
4427 case ANYOF:
ffc61ed2
JH
4428 if (do_utf8) {
4429 loceol = PL_regeol;
cfc92286
JH
4430 while (hardcount < max && scan < loceol &&
4431 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4432 scan += UTF8SKIP(scan);
4433 hardcount++;
4434 }
4435 } else {
7d3e948e 4436 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4437 scan++;
4438 }
a0d0e21e
LW
4439 break;
4440 case ALNUM:
1aa99e6b 4441 if (do_utf8) {
ffc61ed2 4442 loceol = PL_regeol;
1a4fad37 4443 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4444 while (hardcount < max && scan < loceol &&
3568d838 4445 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4446 scan += UTF8SKIP(scan);
4447 hardcount++;
4448 }
4449 } else {
4450 while (scan < loceol && isALNUM(*scan))
4451 scan++;
a0ed51b3
LW
4452 }
4453 break;
bbce6d69 4454 case ALNUML:
3280af22 4455 PL_reg_flags |= RF_tainted;
1aa99e6b 4456 if (do_utf8) {
ffc61ed2 4457 loceol = PL_regeol;
1aa99e6b
IH
4458 while (hardcount < max && scan < loceol &&
4459 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4460 scan += UTF8SKIP(scan);
4461 hardcount++;
4462 }
4463 } else {
4464 while (scan < loceol && isALNUM_LC(*scan))
4465 scan++;
a0ed51b3
LW
4466 }
4467 break;
a0d0e21e 4468 case NALNUM:
1aa99e6b 4469 if (do_utf8) {
ffc61ed2 4470 loceol = PL_regeol;
1a4fad37 4471 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4472 while (hardcount < max && scan < loceol &&
3568d838 4473 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4474 scan += UTF8SKIP(scan);
4475 hardcount++;
4476 }
4477 } else {
4478 while (scan < loceol && !isALNUM(*scan))
4479 scan++;
a0ed51b3
LW
4480 }
4481 break;
bbce6d69 4482 case NALNUML:
3280af22 4483 PL_reg_flags |= RF_tainted;
1aa99e6b 4484 if (do_utf8) {
ffc61ed2 4485 loceol = PL_regeol;
1aa99e6b
IH
4486 while (hardcount < max && scan < loceol &&
4487 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4488 scan += UTF8SKIP(scan);
4489 hardcount++;
4490 }
4491 } else {
4492 while (scan < loceol && !isALNUM_LC(*scan))
4493 scan++;
a0ed51b3
LW
4494 }
4495 break;
a0d0e21e 4496 case SPACE:
1aa99e6b 4497 if (do_utf8) {
ffc61ed2 4498 loceol = PL_regeol;
1a4fad37 4499 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4500 while (hardcount < max && scan < loceol &&
3568d838
JH
4501 (*scan == ' ' ||
4502 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4503 scan += UTF8SKIP(scan);
4504 hardcount++;
4505 }
4506 } else {
4507 while (scan < loceol && isSPACE(*scan))
4508 scan++;
a0ed51b3
LW
4509 }
4510 break;
bbce6d69 4511 case SPACEL:
3280af22 4512 PL_reg_flags |= RF_tainted;
1aa99e6b 4513 if (do_utf8) {
ffc61ed2 4514 loceol = PL_regeol;
1aa99e6b 4515 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4516 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4517 scan += UTF8SKIP(scan);
4518 hardcount++;
4519 }
4520 } else {
4521 while (scan < loceol && isSPACE_LC(*scan))
4522 scan++;
a0ed51b3
LW
4523 }
4524 break;
a0d0e21e 4525 case NSPACE:
1aa99e6b 4526 if (do_utf8) {
ffc61ed2 4527 loceol = PL_regeol;
1a4fad37 4528 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4529 while (hardcount < max && scan < loceol &&
3568d838
JH
4530 !(*scan == ' ' ||
4531 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4532 scan += UTF8SKIP(scan);
4533 hardcount++;
4534 }
4535 } else {
4536 while (scan < loceol && !isSPACE(*scan))
4537 scan++;
4538 break;
a0ed51b3 4539 }
bbce6d69 4540 case NSPACEL:
3280af22 4541 PL_reg_flags |= RF_tainted;
1aa99e6b 4542 if (do_utf8) {
ffc61ed2 4543 loceol = PL_regeol;
1aa99e6b 4544 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4545 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4546 scan += UTF8SKIP(scan);
4547 hardcount++;
4548 }
4549 } else {
4550 while (scan < loceol && !isSPACE_LC(*scan))
4551 scan++;
a0ed51b3
LW
4552 }
4553 break;
a0d0e21e 4554 case DIGIT:
1aa99e6b 4555 if (do_utf8) {
ffc61ed2 4556 loceol = PL_regeol;
1a4fad37 4557 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4558 while (hardcount < max && scan < loceol &&
3568d838 4559 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4560 scan += UTF8SKIP(scan);
4561 hardcount++;
4562 }
4563 } else {
4564 while (scan < loceol && isDIGIT(*scan))
4565 scan++;
a0ed51b3
LW
4566 }
4567 break;
a0d0e21e 4568 case NDIGIT:
1aa99e6b 4569 if (do_utf8) {
ffc61ed2 4570 loceol = PL_regeol;
1a4fad37 4571 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4572 while (hardcount < max && scan < loceol &&
3568d838 4573 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4574 scan += UTF8SKIP(scan);
4575 hardcount++;
4576 }
4577 } else {
4578 while (scan < loceol && !isDIGIT(*scan))
4579 scan++;
a0ed51b3
LW
4580 }
4581 break;
a0d0e21e
LW
4582 default: /* Called on something of 0 width. */
4583 break; /* So match right here or not at all. */
4584 }
a687059c 4585
a0ed51b3
LW
4586 if (hardcount)
4587 c = hardcount;
4588 else
4589 c = scan - PL_reginput;
3280af22 4590 PL_reginput = scan;
a687059c 4591
a3621e74 4592 DEBUG_r({
ab74612d 4593 SV *re_debug_flags = NULL;
c277df42 4594 SV *prop = sv_newmortal();
a3621e74
YO
4595 GET_RE_DEBUG_FLAGS;
4596 DEBUG_EXECUTE_r({
c277df42 4597 regprop(prop, p);
9041c2e3
NIS
4598 PerlIO_printf(Perl_debug_log,
4599 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3f7c398e 4600 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
c277df42 4601 });
a3621e74 4602 });
9041c2e3 4603
a0d0e21e 4604 return(c);
a687059c
LW
4605}
4606
4607/*
c277df42 4608 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4609 *
0e788c72 4610 * The repeater is supposed to have constant non-zero length.
c277df42
IZ
4611 */
4612
76e3520e 4613STATIC I32
cea2e8a9 4614S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4615{
b7953727 4616 register char *scan = Nullch;
c277df42 4617 register char *start;
3280af22 4618 register char *loceol = PL_regeol;
a0ed51b3 4619 I32 l = 0;
708e3b05 4620 I32 count = 0, res = 1;
a0ed51b3
LW
4621
4622 if (!max)
4623 return 0;
c277df42 4624
3280af22 4625 start = PL_reginput;
53c4c00c 4626 if (PL_reg_match_utf8) {
708e3b05 4627 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4628 if (!count++) {
4629 l = 0;
4630 while (start < PL_reginput) {
4631 l++;
4632 start += UTF8SKIP(start);
4633 }
4634 *lp = l;
4635 if (l == 0)
4636 return max;
4637 }
4638 if (count == max)
4639 return count;
4640 }
4641 }
4642 else {
708e3b05 4643 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4644 if (!count++) {
4645 *lp = l = PL_reginput - start;
4646 if (max != REG_INFTY && l*max < loceol - scan)
4647 loceol = scan + l*max;
4648 if (l == 0)
4649 return max;
c277df42
IZ
4650 }
4651 }
4652 }
708e3b05 4653 if (!res)
3280af22 4654 PL_reginput = scan;
9041c2e3 4655
a0ed51b3 4656 return count;
c277df42
IZ
4657}
4658
4659/*
ffc61ed2
JH
4660- regclass_swash - prepare the utf8 swash
4661*/
4662
4663SV *
a3b680e6 4664Perl_regclass_swash(pTHX_ register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4665{
9e55ce06
JH
4666 SV *sw = NULL;
4667 SV *si = NULL;
4668 SV *alt = NULL;
ffc61ed2
JH
4669
4670 if (PL_regdata && PL_regdata->count) {
a3b680e6 4671 const U32 n = ARG(node);
ffc61ed2
JH
4672
4673 if (PL_regdata->what[n] == 's') {
890ce7af
AL
4674 SV * const rv = (SV*)PL_regdata->data[n];
4675 AV * const av = (AV*)SvRV((SV*)rv);
b11f357e 4676 SV **ary = AvARRAY(av);
9e55ce06 4677 SV **a, **b;
9041c2e3 4678
9e55ce06
JH
4679 /* See the end of regcomp.c:S_reglass() for
4680 * documentation of these array elements. */
4681
b11f357e
JH
4682 si = *ary;
4683 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4684 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4685
ffc61ed2
JH
4686 if (a)
4687 sw = *a;
4688 else if (si && doinit) {
4689 sw = swash_init("utf8", "", si, 1, 0);
4690 (void)av_store(av, 1, sw);
4691 }
9e55ce06
JH
4692 if (b)
4693 alt = *b;
ffc61ed2
JH
4694 }
4695 }
4696
9e55ce06
JH
4697 if (listsvp)
4698 *listsvp = si;
4699 if (altsvp)
4700 *altsvp = alt;
ffc61ed2
JH
4701
4702 return sw;
4703}
4704
4705/*
ba7b4546 4706 - reginclass - determine if a character falls into a character class
832705d4
JH
4707
4708 The n is the ANYOF regnode, the p is the target string, lenp
4709 is pointer to the maximum length of how far to go in the p
4710 (if the lenp is zero, UTF8SKIP(p) is used),
4711 do_utf8 tells whether the target string is in UTF-8.
4712
bbce6d69 4713 */
4714
76e3520e 4715STATIC bool
a3b680e6 4716S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4717{
27da23d5 4718 dVAR;
a3b680e6 4719 const char flags = ANYOF_FLAGS(n);
bbce6d69 4720 bool match = FALSE;
cc07378b 4721 UV c = *p;
ae9ddab8 4722 STRLEN len = 0;
9e55ce06 4723 STRLEN plen;
1aa99e6b 4724
ae9ddab8 4725 if (do_utf8 && !UTF8_IS_INVARIANT(c))
89ebb4a3 4726 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
ae9ddab8 4727 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4728
0f0076b4 4729 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4730 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4731 if (lenp)
4732 *lenp = 0;
ffc61ed2 4733 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4734 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4735 match = TRUE;
bbce6d69 4736 }
3568d838 4737 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4738 match = TRUE;
ffc61ed2 4739 if (!match) {
9e55ce06 4740 AV *av;
890ce7af 4741 SV * const sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4742
4743 if (sw) {
3568d838 4744 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4745 match = TRUE;
4746 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4747 if (!match && lenp && av) {
4748 I32 i;
9e55ce06 4749 for (i = 0; i <= av_len(av); i++) {
890ce7af 4750 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4751 STRLEN len;
890ce7af 4752 const char * const s = SvPV_const(sv, len);
9e55ce06 4753
061b10df 4754 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4755 *lenp = len;
4756 match = TRUE;
4757 break;
4758 }
4759 }
4760 }
4761 if (!match) {
89ebb4a3 4762 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4763 STRLEN tmplen;
4764
9e55ce06
JH
4765 to_utf8_fold(p, tmpbuf, &tmplen);
4766 if (swash_fetch(sw, tmpbuf, do_utf8))
4767 match = TRUE;
4768 }
ffc61ed2
JH
4769 }
4770 }
bbce6d69 4771 }
9e55ce06 4772 if (match && lenp && *lenp == 0)
0f0076b4 4773 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4774 }
1aa99e6b 4775 if (!match && c < 256) {
ffc61ed2
JH
4776 if (ANYOF_BITMAP_TEST(n, c))
4777 match = TRUE;
4778 else if (flags & ANYOF_FOLD) {
eb160463 4779 U8 f;
a0ed51b3 4780
ffc61ed2
JH
4781 if (flags & ANYOF_LOCALE) {
4782 PL_reg_flags |= RF_tainted;
4783 f = PL_fold_locale[c];
4784 }
4785 else
4786 f = PL_fold[c];
4787 if (f != c && ANYOF_BITMAP_TEST(n, f))
4788 match = TRUE;
4789 }
4790
4791 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4792 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4793 if (
4794 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4795 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4796 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4797 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4798 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4799 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4800 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4801 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4802 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4803 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4804 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4805 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4806 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4807 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4808 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4809 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4810 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4811 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4812 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4813 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4814 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4815 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4816 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4817 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4818 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4819 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4820 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4821 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4822 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4823 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4824 ) /* How's that for a conditional? */
4825 {
4826 match = TRUE;
4827 }
a0ed51b3 4828 }
a0ed51b3
LW
4829 }
4830
a0ed51b3
LW
4831 return (flags & ANYOF_INVERT) ? !match : match;
4832}
161b471a 4833
dfe13c55 4834STATIC U8 *
cea2e8a9 4835S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4836{
1aa99e6b
IH
4837 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4838}
4839
4840STATIC U8 *
4841S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4842{
a0ed51b3 4843 if (off >= 0) {
1aa99e6b 4844 while (off-- && s < lim) {
ffc61ed2 4845 /* XXX could check well-formedness here */
a0ed51b3 4846 s += UTF8SKIP(s);
ffc61ed2 4847 }
a0ed51b3
LW
4848 }
4849 else {
4850 while (off++) {
1aa99e6b 4851 if (s > lim) {
a0ed51b3 4852 s--;
ffc61ed2 4853 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4854 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4855 s--;
ffc61ed2
JH
4856 }
4857 /* XXX could check well-formedness here */
a0ed51b3
LW
4858 }
4859 }
4860 }
4861 return s;
4862}
161b471a 4863
dfe13c55 4864STATIC U8 *
1aa99e6b 4865S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4866{
1aa99e6b
IH
4867 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4868}
4869
4870STATIC U8 *
4871S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4872{
4873 if (off >= 0) {
1aa99e6b 4874 while (off-- && s < lim) {
ffc61ed2 4875 /* XXX could check well-formedness here */
a0ed51b3 4876 s += UTF8SKIP(s);
ffc61ed2 4877 }
a0ed51b3
LW
4878 if (off >= 0)
4879 return 0;
4880 }
4881 else {
4882 while (off++) {
1aa99e6b 4883 if (s > lim) {
a0ed51b3 4884 s--;
ffc61ed2 4885 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4886 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4887 s--;
ffc61ed2
JH
4888 }
4889 /* XXX could check well-formedness here */
a0ed51b3
LW
4890 }
4891 else
4892 break;
4893 }
4894 if (off <= 0)
4895 return 0;
4896 }
4897 return s;
4898}
51371543 4899
51371543 4900static void
acfe0abc 4901restore_pos(pTHX_ void *arg)
51371543 4902{
9d4ba2ae 4903 PERL_UNUSED_ARG(arg);
51371543
GS
4904 if (PL_reg_eval_set) {
4905 if (PL_reg_oldsaved) {
4906 PL_reg_re->subbeg = PL_reg_oldsaved;
4907 PL_reg_re->sublen = PL_reg_oldsavedlen;
f8c7b90f 4908#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
4909 PL_reg_re->saved_copy = PL_nrs;
4910#endif
51371543
GS
4911 RX_MATCH_COPIED_on(PL_reg_re);
4912 }
4913 PL_reg_magic->mg_len = PL_reg_oldpos;
4914 PL_reg_eval_set = 0;
4915 PL_curpm = PL_reg_oldcurpm;
4916 }
4917}
33b8afdf
JH
4918
4919STATIC void
4920S_to_utf8_substr(pTHX_ register regexp *prog)
4921{
33b8afdf 4922 if (prog->float_substr && !prog->float_utf8) {
9d4ba2ae 4923 SV* sv;
f2b990bf 4924 prog->float_utf8 = sv = newSVsv(prog->float_substr);
33b8afdf
JH
4925 sv_utf8_upgrade(sv);
4926 if (SvTAIL(prog->float_substr))
4927 SvTAIL_on(sv);
4928 if (prog->float_substr == prog->check_substr)
4929 prog->check_utf8 = sv;
4930 }
4931 if (prog->anchored_substr && !prog->anchored_utf8) {
9d4ba2ae 4932 SV* sv;
f2b990bf 4933 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
33b8afdf
JH
4934 sv_utf8_upgrade(sv);
4935 if (SvTAIL(prog->anchored_substr))
4936 SvTAIL_on(sv);
4937 if (prog->anchored_substr == prog->check_substr)
4938 prog->check_utf8 = sv;
4939 }
4940}
4941
4942STATIC void
4943S_to_byte_substr(pTHX_ register regexp *prog)
4944{
33b8afdf 4945 if (prog->float_utf8 && !prog->float_substr) {
9d4ba2ae 4946 SV* sv;
f2b990bf 4947 prog->float_substr = sv = newSVsv(prog->float_utf8);
33b8afdf
JH
4948 if (sv_utf8_downgrade(sv, TRUE)) {
4949 if (SvTAIL(prog->float_utf8))
4950 SvTAIL_on(sv);
4951 } else {
4952 SvREFCNT_dec(sv);
4953 prog->float_substr = sv = &PL_sv_undef;
4954 }
4955 if (prog->float_utf8 == prog->check_utf8)
4956 prog->check_substr = sv;
4957 }
4958 if (prog->anchored_utf8 && !prog->anchored_substr) {
9d4ba2ae 4959 SV* sv;
f2b990bf 4960 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
33b8afdf
JH
4961 if (sv_utf8_downgrade(sv, TRUE)) {
4962 if (SvTAIL(prog->anchored_utf8))
4963 SvTAIL_on(sv);
4964 } else {
4965 SvREFCNT_dec(sv);
4966 prog->anchored_substr = sv = &PL_sv_undef;
4967 }
4968 if (prog->anchored_utf8 == prog->check_utf8)
4969 prog->check_substr = sv;
4970 }
4971}
66610fdd
RGS
4972
4973/*
4974 * Local variables:
4975 * c-indentation-style: bsd
4976 * c-basic-offset: 4
4977 * indent-tabs-mode: t
4978 * End:
4979 *
37442d52
RGS
4980 * ex: set ts=8 sts=4 sw=4 noet:
4981 */