This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [perl #36207] UTF8/Latin 1/i regexp "Malformed character" warning
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
f2b990bf 81 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
1a4fad37
AL
142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
a3b680e6 181 const int retval = PL_savestack_ix;
b1ce53c5 182#define REGCP_PAREN_ELEMS 4
a3b680e6 183 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
184 int p;
185
e49a9654
IH
186 if (paren_elems_to_push < 0)
187 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
188
a01268b5 189#define REGCP_OTHER_ELEMS 6
4b3c1a47 190 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 191 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 192/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
193 SSPUSHINT(PL_regendp[p]);
194 SSPUSHINT(PL_regstartp[p]);
3280af22 195 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
196 SSPUSHINT(p);
197 }
b1ce53c5 198/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
199 SSPUSHINT(PL_regsize);
200 SSPUSHINT(*PL_reglastparen);
a01268b5 201 SSPUSHINT(*PL_reglastcloseparen);
3280af22 202 SSPUSHPTR(PL_reginput);
41123dfd
JH
203#define REGCP_FRAME_ELEMS 2
204/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
205 * are needed for the regexp context stack bookkeeping. */
206 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 207 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 208
a0d0e21e
LW
209 return retval;
210}
211
c277df42 212/* These are needed since we do not localize EVAL nodes: */
a3621e74 213# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 214 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 215 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 216
a3621e74 217# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 218 PerlIO_printf(Perl_debug_log, \
faccc32b 219 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 220 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 221
76e3520e 222STATIC char *
cea2e8a9 223S_regcppop(pTHX)
a0d0e21e 224{
b1ce53c5 225 I32 i;
a0d0e21e
LW
226 U32 paren = 0;
227 char *input;
b1ce53c5 228
a3621e74
YO
229 GET_RE_DEBUG_FLAGS_DECL;
230
b1ce53c5 231 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 232 i = SSPOPINT;
b1ce53c5
JH
233 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
234 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 235 input = (char *) SSPOPPTR;
a01268b5 236 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
237 *PL_reglastparen = SSPOPINT;
238 PL_regsize = SSPOPINT;
b1ce53c5
JH
239
240 /* Now restore the parentheses context. */
41123dfd
JH
241 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
242 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 243 I32 tmps;
a0d0e21e 244 paren = (U32)SSPOPINT;
3280af22 245 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
246 PL_regstartp[paren] = SSPOPINT;
247 tmps = SSPOPINT;
3280af22
NIS
248 if (paren <= *PL_reglastparen)
249 PL_regendp[paren] = tmps;
a3621e74 250 DEBUG_EXECUTE_r(
c3464db5 251 PerlIO_printf(Perl_debug_log,
b900a521 252 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 253 (UV)paren, (IV)PL_regstartp[paren],
b900a521 254 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 255 (IV)PL_regendp[paren],
3280af22 256 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 257 );
a0d0e21e 258 }
a3621e74 259 DEBUG_EXECUTE_r(
eb160463 260 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 261 PerlIO_printf(Perl_debug_log,
faccc32b
JH
262 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
263 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
264 }
265 );
daf18116 266#if 1
dafc8851
JH
267 /* It would seem that the similar code in regtry()
268 * already takes care of this, and in fact it is in
269 * a better location to since this code can #if 0-ed out
270 * but the code in regtry() is needed or otherwise tests
271 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
272 * (as of patchlevel 7877) will fail. Then again,
273 * this code seems to be necessary or otherwise
274 * building DynaLoader will fail:
275 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
276 * --jhi */
eb160463
GS
277 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
278 if ((I32)paren > PL_regsize)
cf93c79d
IZ
279 PL_regstartp[paren] = -1;
280 PL_regendp[paren] = -1;
a0d0e21e 281 }
dafc8851 282#endif
a0d0e21e
LW
283 return input;
284}
285
0f5d15d6 286STATIC char *
cea2e8a9 287S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6 288{
a3b680e6 289 const I32 tmp = PL_savestack_ix;
0f5d15d6
IZ
290
291 PL_savestack_ix = ss;
292 regcppop();
293 PL_savestack_ix = tmp;
942e002e 294 return Nullch;
0f5d15d6
IZ
295}
296
297typedef struct re_cc_state
298{
299 I32 ss;
300 regnode *node;
301 struct re_cc_state *prev;
302 CURCUR *cc;
303 regexp *re;
304} re_cc_state;
305
02db2b7b 306#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 307
29d1e993
HS
308#define TRYPAREN(paren, n, input) { \
309 if (paren) { \
310 if (n) { \
311 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
312 PL_regendp[paren] = input - PL_bostr; \
313 } \
314 else \
315 PL_regendp[paren] = -1; \
316 } \
317 if (regmatch(next)) \
318 sayYES; \
319 if (paren && n) \
320 PL_regendp[paren] = -1; \
321}
322
323
a687059c 324/*
e50aee73 325 * pregexec and friends
a687059c
LW
326 */
327
328/*
c277df42 329 - pregexec - match a regexp against a string
a687059c 330 */
c277df42 331I32
864dbfa3 332Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 333 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
334/* strend: pointer to null at end of string */
335/* strbeg: real beginning of string */
336/* minend: end of match must be >=minend after stringarg. */
337/* nosave: For optimizations. */
338{
339 return
9041c2e3 340 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
341 nosave ? 0 : REXEC_COPY_STR);
342}
0f5d15d6
IZ
343
344STATIC void
cea2e8a9 345S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
346{
347 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
348#ifdef DEBUGGING
349 PL_regprogram = prog->program;
350#endif
351 PL_regnpar = prog->nparens;
9041c2e3
NIS
352 PL_regdata = prog->data;
353 PL_reg_re = prog;
0f5d15d6 354}
22e551b9 355
9041c2e3 356/*
cad2e5aa
JH
357 * Need to implement the following flags for reg_anch:
358 *
359 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
360 * USE_INTUIT_ML
361 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
362 * INTUIT_AUTORITATIVE_ML
363 * INTUIT_ONCE_NOML - Intuit can match in one location only.
364 * INTUIT_ONCE_ML
365 *
366 * Another flag for this function: SECOND_TIME (so that float substrs
367 * with giant delta may be not rechecked).
368 */
369
370/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
371
3f7c398e 372/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
373 Otherwise, only SvCUR(sv) is used to get strbeg. */
374
375/* XXXX We assume that strpos is strbeg unless sv. */
376
6eb5f6b9
JH
377/* XXXX Some places assume that there is a fixed substring.
378 An update may be needed if optimizer marks as "INTUITable"
379 RExen without fixed substrings. Similarly, it is assumed that
380 lengths of all the strings are no more than minlen, thus they
381 cannot come from lookahead.
382 (Or minlen should take into account lookahead.) */
383
2c2d71f5
JH
384/* A failure to find a constant substring means that there is no need to make
385 an expensive call to REx engine, thus we celebrate a failure. Similarly,
386 finding a substring too deep into the string means that less calls to
30944b6d
IZ
387 regtry() should be needed.
388
389 REx compiler's optimizer found 4 possible hints:
390 a) Anchored substring;
391 b) Fixed substring;
392 c) Whether we are anchored (beginning-of-line or \G);
393 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 394 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
395 string which does not contradict any of them.
396 */
2c2d71f5 397
6eb5f6b9
JH
398/* Most of decisions we do here should have been done at compile time.
399 The nodes of the REx which we used for the search should have been
400 deleted from the finite automaton. */
401
cad2e5aa
JH
402char *
403Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
404 char *strend, U32 flags, re_scream_pos_data *data)
405{
b7953727 406 register I32 start_shift = 0;
cad2e5aa 407 /* Should be nonnegative! */
b7953727 408 register I32 end_shift = 0;
2c2d71f5
JH
409 register char *s;
410 register SV *check;
a1933d95 411 char *strbeg;
cad2e5aa 412 char *t;
a3b680e6 413 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 414 I32 ml_anch;
6eb5f6b9 415 register char *other_last = Nullch; /* other substr checked before this */
b7953727 416 char *check_at = Nullch; /* check substr found at this pos */
1df70142 417 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d
IZ
418#ifdef DEBUGGING
419 char *i_strpos = strpos;
ce333219 420 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 421#endif
a3621e74
YO
422
423 GET_RE_DEBUG_FLAGS_DECL;
424
a30b2f1f 425 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 426
b8d68ded 427 if (prog->reganch & ROPT_UTF8) {
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
429 "UTF-8 regex...\n"));
430 PL_reg_flags |= RF_utf8;
431 }
432
a3621e74 433 DEBUG_EXECUTE_r({
1df70142 434 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
435 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
436 strpos;
1df70142 437 const int len = PL_reg_match_utf8 ?
b8d68ded 438 strlen(s) : strend - strpos;
2a782b5b
JH
439 if (!PL_colorset)
440 reginitcolors();
b8d68ded 441 if (PL_reg_match_utf8)
a3621e74 442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 443 "UTF-8 target...\n"));
2a782b5b 444 PerlIO_printf(Perl_debug_log,
a0288114 445 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 446 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
447 prog->precomp,
448 PL_colors[1],
449 (strlen(prog->precomp) > 60 ? "..." : ""),
450 PL_colors[0],
451 (int)(len > 60 ? 60 : len),
452 s, PL_colors[1],
453 (len > 60 ? "..." : "")
454 );
455 });
cad2e5aa 456
c344f387
JH
457 /* CHR_DIST() would be more correct here but it makes things slow. */
458 if (prog->minlen > strend - strpos) {
a3621e74 459 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 460 "String too short... [re_intuit_start]\n"));
cad2e5aa 461 goto fail;
2c2d71f5 462 }
a1933d95 463 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 464 PL_regeol = strend;
33b8afdf
JH
465 if (do_utf8) {
466 if (!prog->check_utf8 && prog->check_substr)
467 to_utf8_substr(prog);
468 check = prog->check_utf8;
469 } else {
470 if (!prog->check_substr && prog->check_utf8)
471 to_byte_substr(prog);
472 check = prog->check_substr;
473 }
474 if (check == &PL_sv_undef) {
a3621e74 475 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
476 "Non-utf string cannot match utf check string\n"));
477 goto fail;
478 }
2c2d71f5 479 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
480 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
481 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 482 && !multiline ) ); /* Check after \n? */
cad2e5aa 483
7e25d62c
JH
484 if (!ml_anch) {
485 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
486 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 487 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
488 && sv && !SvROK(sv)
489 && (strpos != strbeg)) {
a3621e74 490 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
491 goto fail;
492 }
493 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 494 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 495 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
496 I32 slen;
497
1aa99e6b 498 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
499 if (SvTAIL(check)) {
500 slen = SvCUR(check); /* >= 1 */
cad2e5aa 501
9041c2e3 502 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 503 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 504 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 505 goto fail_finish;
cad2e5aa
JH
506 }
507 /* Now should match s[0..slen-2] */
508 slen--;
3f7c398e 509 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 510 || (slen > 1
3f7c398e 511 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 512 report_neq:
a3621e74 513 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
514 goto fail_finish;
515 }
cad2e5aa 516 }
3f7c398e 517 else if (*SvPVX_const(check) != *s
653099ff 518 || ((slen = SvCUR(check)) > 1
3f7c398e 519 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5
JH
520 goto report_neq;
521 goto success_at_start;
7e25d62c 522 }
cad2e5aa 523 }
2c2d71f5 524 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 525 s = strpos;
2c2d71f5 526 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 527 end_shift = prog->minlen - start_shift -
653099ff 528 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 529 if (!ml_anch) {
a3b680e6 530 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 531 - (SvTAIL(check) != 0);
a3b680e6 532 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
533
534 if (end_shift < eshift)
535 end_shift = eshift;
536 }
cad2e5aa 537 }
2c2d71f5 538 else { /* Can match at random position */
cad2e5aa
JH
539 ml_anch = 0;
540 s = strpos;
2c2d71f5
JH
541 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
542 /* Should be nonnegative! */
543 end_shift = prog->minlen - start_shift -
653099ff 544 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
545 }
546
2c2d71f5 547#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 548 if (end_shift < 0)
6bbae5e6 549 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
550#endif
551
2c2d71f5
JH
552 restart:
553 /* Find a possible match in the region s..strend by looking for
554 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 555 if (flags & REXEC_SCREAM) {
cad2e5aa 556 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 557 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 558
2c2d71f5
JH
559 if (PL_screamfirst[BmRARE(check)] >= 0
560 || ( BmRARE(check) == '\n'
561 && (BmPREVIOUS(check) == SvCUR(check) - 1)
562 && SvTAIL(check) ))
9041c2e3 563 s = screaminstr(sv, check,
2c2d71f5 564 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 565 else
2c2d71f5 566 goto fail_finish;
4addbd3b
HS
567 /* we may be pointing at the wrong string */
568 if (s && RX_MATCH_COPIED(prog))
3f7c398e 569 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
570 if (data)
571 *data->scream_olds = s;
572 }
f33976b4 573 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
574 s = fbm_instr((U8*)(s + start_shift),
575 (U8*)(strend - end_shift),
7fba1cd6 576 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 577 else
1aa99e6b
IH
578 s = fbm_instr(HOP3(s, start_shift, strend),
579 HOP3(strend, -end_shift, strbeg),
7fba1cd6 580 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
581
582 /* Update the count-of-usability, remove useless subpatterns,
583 unshift s. */
2c2d71f5 584
a0288114 585 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 586 (s ? "Found" : "Did not find"),
33b8afdf 587 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 588 PL_colors[0],
7b0972df 589 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 590 SvPVX_const(check),
2c2d71f5
JH
591 PL_colors[1], (SvTAIL(check) ? "$" : ""),
592 (s ? " at offset " : "...\n") ) );
593
594 if (!s)
595 goto fail_finish;
596
6eb5f6b9
JH
597 check_at = s;
598
2c2d71f5 599 /* Finish the diagnostic message */
a3621e74 600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
601
602 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
603 Start with the other substr.
604 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 605 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
606 *always* match. Probably should be marked during compile...
607 Probably it is right to do no SCREAM here...
608 */
609
33b8afdf 610 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 611 /* Take into account the "other" substring. */
2c2d71f5
JH
612 /* XXXX May be hopelessly wrong for UTF... */
613 if (!other_last)
6eb5f6b9 614 other_last = strpos;
33b8afdf 615 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
616 do_other_anchored:
617 {
1aa99e6b 618 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 619 char *s1 = s;
33b8afdf 620 SV* must;
2c2d71f5 621
2c2d71f5
JH
622 t = s - prog->check_offset_max;
623 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 624 && (!do_utf8
1aa99e6b 625 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 626 && t > strpos)))
30944b6d 627 /* EMPTY */;
2c2d71f5
JH
628 else
629 t = strpos;
1aa99e6b 630 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
631 if (t < other_last) /* These positions already checked */
632 t = other_last;
1aa99e6b 633 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
634 if (last < last1)
635 last1 = last;
636 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
637 /* On end-of-str: see comment below. */
33b8afdf
JH
638 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
639 if (must == &PL_sv_undef) {
640 s = (char*)NULL;
a3621e74 641 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
642 }
643 else
644 s = fbm_instr(
645 (unsigned char*)t,
646 HOP3(HOP3(last1, prog->anchored_offset, strend)
647 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
648 must,
7fba1cd6 649 multiline ? FBMrf_MULTILINE : 0
33b8afdf 650 );
a3621e74 651 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 652 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
653 (s ? "Found" : "Contradicts"),
654 PL_colors[0],
33b8afdf
JH
655 (int)(SvCUR(must)
656 - (SvTAIL(must)!=0)),
3f7c398e 657 SvPVX_const(must),
33b8afdf 658 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
659 if (!s) {
660 if (last1 >= last2) {
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
662 ", giving up...\n"));
663 goto fail_finish;
664 }
a3621e74 665 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 666 ", trying floating at offset %ld...\n",
1aa99e6b
IH
667 (long)(HOP3c(s1, 1, strend) - i_strpos)));
668 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
669 s = HOP3c(last, 1, strend);
2c2d71f5
JH
670 goto restart;
671 }
672 else {
a3621e74 673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 674 (long)(s - i_strpos)));
1aa99e6b
IH
675 t = HOP3c(s, -prog->anchored_offset, strbeg);
676 other_last = HOP3c(s, 1, strend);
30944b6d 677 s = s1;
2c2d71f5
JH
678 if (t == strpos)
679 goto try_at_start;
2c2d71f5
JH
680 goto try_at_offset;
681 }
30944b6d 682 }
2c2d71f5
JH
683 }
684 else { /* Take into account the floating substring. */
33b8afdf
JH
685 char *last, *last1;
686 char *s1 = s;
687 SV* must;
688
689 t = HOP3c(s, -start_shift, strbeg);
690 last1 = last =
691 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
692 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
693 last = HOP3c(t, prog->float_max_offset, strend);
694 s = HOP3c(t, prog->float_min_offset, strend);
695 if (s < other_last)
696 s = other_last;
2c2d71f5 697 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
698 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
699 /* fbm_instr() takes into account exact value of end-of-str
700 if the check is SvTAIL(ed). Since false positives are OK,
701 and end-of-str is not later than strend we are OK. */
702 if (must == &PL_sv_undef) {
703 s = (char*)NULL;
a3621e74 704 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
705 }
706 else
2c2d71f5 707 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
708 (unsigned char*)last + SvCUR(must)
709 - (SvTAIL(must)!=0),
7fba1cd6 710 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
712 (s ? "Found" : "Contradicts"),
713 PL_colors[0],
714 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 715 SvPVX_const(must),
33b8afdf
JH
716 PL_colors[1], (SvTAIL(must) ? "$" : "")));
717 if (!s) {
718 if (last1 == last) {
a3621e74 719 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
720 ", giving up...\n"));
721 goto fail_finish;
2c2d71f5 722 }
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
724 ", trying anchored starting at offset %ld...\n",
725 (long)(s1 + 1 - i_strpos)));
726 other_last = last;
727 s = HOP3c(t, 1, strend);
728 goto restart;
729 }
730 else {
a3621e74 731 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
732 (long)(s - i_strpos)));
733 other_last = s; /* Fix this later. --Hugo */
734 s = s1;
735 if (t == strpos)
736 goto try_at_start;
737 goto try_at_offset;
738 }
2c2d71f5 739 }
cad2e5aa 740 }
2c2d71f5
JH
741
742 t = s - prog->check_offset_max;
2c2d71f5 743 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 744 && (!do_utf8
1aa99e6b
IH
745 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
746 && t > strpos))) {
2c2d71f5
JH
747 /* Fixed substring is found far enough so that the match
748 cannot start at strpos. */
749 try_at_offset:
cad2e5aa 750 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
751 /* Eventually fbm_*() should handle this, but often
752 anchored_offset is not 0, so this check will not be wasted. */
753 /* XXXX In the code below we prefer to look for "^" even in
754 presence of anchored substrings. And we search even
755 beyond the found float position. These pessimizations
756 are historical artefacts only. */
757 find_anchor:
2c2d71f5 758 while (t < strend - prog->minlen) {
cad2e5aa 759 if (*t == '\n') {
4ee3650e 760 if (t < check_at - prog->check_offset_min) {
33b8afdf 761 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
762 /* Since we moved from the found position,
763 we definitely contradict the found anchored
30944b6d
IZ
764 substr. Due to the above check we do not
765 contradict "check" substr.
766 Thus we can arrive here only if check substr
767 is float. Redo checking for "other"=="fixed".
768 */
9041c2e3 769 strpos = t + 1;
a3621e74 770 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 771 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
772 goto do_other_anchored;
773 }
4ee3650e
GS
774 /* We don't contradict the found floating substring. */
775 /* XXXX Why not check for STCLASS? */
cad2e5aa 776 s = t + 1;
a3621e74 777 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 778 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
779 goto set_useful;
780 }
4ee3650e
GS
781 /* Position contradicts check-string */
782 /* XXXX probably better to look for check-string
783 than for "\n", so one should lower the limit for t? */
a3621e74 784 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 785 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 786 other_last = strpos = s = t + 1;
cad2e5aa
JH
787 goto restart;
788 }
789 t++;
790 }
a3621e74 791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 792 PL_colors[0], PL_colors[1]));
2c2d71f5 793 goto fail_finish;
cad2e5aa 794 }
f5952150 795 else {
a3621e74 796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 797 PL_colors[0], PL_colors[1]));
f5952150 798 }
cad2e5aa
JH
799 s = t;
800 set_useful:
33b8afdf 801 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
802 }
803 else {
f5952150 804 /* The found string does not prohibit matching at strpos,
2c2d71f5 805 - no optimization of calling REx engine can be performed,
f5952150
GS
806 unless it was an MBOL and we are not after MBOL,
807 or a future STCLASS check will fail this. */
2c2d71f5
JH
808 try_at_start:
809 /* Even in this situation we may use MBOL flag if strpos is offset
810 wrt the start of the string. */
05b4157f 811 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 812 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
813 /* May be due to an implicit anchor of m{.*foo} */
814 && !(prog->reganch & ROPT_IMPLICIT))
815 {
cad2e5aa
JH
816 t = strpos;
817 goto find_anchor;
818 }
a3621e74 819 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 820 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 821 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 822 );
2c2d71f5 823 success_at_start:
30944b6d 824 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
825 && (do_utf8 ? (
826 prog->check_utf8 /* Could be deleted already */
827 && --BmUSEFUL(prog->check_utf8) < 0
828 && (prog->check_utf8 == prog->float_utf8)
829 ) : (
830 prog->check_substr /* Could be deleted already */
831 && --BmUSEFUL(prog->check_substr) < 0
832 && (prog->check_substr == prog->float_substr)
833 )))
66e933ab 834 {
cad2e5aa 835 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 836 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
837 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
838 if (do_utf8 ? prog->check_substr : prog->check_utf8)
839 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
840 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
841 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 842 check = Nullsv; /* abort */
cad2e5aa 843 s = strpos;
3cf5c195
IZ
844 /* XXXX This is a remnant of the old implementation. It
845 looks wasteful, since now INTUIT can use many
6eb5f6b9 846 other heuristics. */
cad2e5aa
JH
847 prog->reganch &= ~RE_USE_INTUIT;
848 }
849 else
850 s = strpos;
851 }
852
6eb5f6b9
JH
853 /* Last resort... */
854 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
855 if (prog->regstclass) {
856 /* minlen == 0 is possible if regstclass is \b or \B,
857 and the fixed substr is ''$.
858 Since minlen is already taken into account, s+1 is before strend;
859 accidentally, minlen >= 1 guaranties no false positives at s + 1
860 even for \b or \B. But (minlen? 1 : 0) below assumes that
861 regstclass does not come from lookahead... */
862 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
863 This leaves EXACTF only, which is dealt with in find_byclass(). */
06b5626a
AL
864 const U8* str = (U8*)STRING(prog->regstclass);
865 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 866 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 867 : 1);
a3b680e6 868 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 869 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 870 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
871 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
872 cl_l, strend)
873 : strend);
6eb5f6b9
JH
874
875 t = s;
9926ca43 876 cache_re(prog);
06b5626a 877 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
878 if (!s) {
879#ifdef DEBUGGING
e1ec3a88 880 const char *what = 0;
6eb5f6b9
JH
881#endif
882 if (endpos == strend) {
a3621e74 883 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
884 "Could not match STCLASS...\n") );
885 goto fail;
886 }
a3621e74 887 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 888 "This position contradicts STCLASS...\n") );
653099ff
GS
889 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
890 goto fail;
6eb5f6b9 891 /* Contradict one of substrings */
33b8afdf
JH
892 if (prog->anchored_substr || prog->anchored_utf8) {
893 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 894 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 895 hop_and_restart:
1aa99e6b 896 s = HOP3c(t, 1, strend);
66e933ab
GS
897 if (s + start_shift + end_shift > strend) {
898 /* XXXX Should be taken into account earlier? */
a3621e74 899 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
900 "Could not match STCLASS...\n") );
901 goto fail;
902 }
5e39e1e5
HS
903 if (!check)
904 goto giveup;
a3621e74 905 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 906 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
907 what, (long)(s + start_shift - i_strpos)) );
908 goto restart;
909 }
66e933ab 910 /* Have both, check_string is floating */
6eb5f6b9
JH
911 if (t + start_shift >= check_at) /* Contradicts floating=check */
912 goto retry_floating_check;
913 /* Recheck anchored substring, but not floating... */
9041c2e3 914 s = check_at;
5e39e1e5
HS
915 if (!check)
916 goto giveup;
a3621e74 917 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 918 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
919 (long)(other_last - i_strpos)) );
920 goto do_other_anchored;
921 }
60e71179
GS
922 /* Another way we could have checked stclass at the
923 current position only: */
924 if (ml_anch) {
925 s = t = t + 1;
5e39e1e5
HS
926 if (!check)
927 goto giveup;
a3621e74 928 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 929 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 930 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 931 goto try_at_offset;
66e933ab 932 }
33b8afdf 933 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 934 goto fail;
6eb5f6b9
JH
935 /* Check is floating subtring. */
936 retry_floating_check:
937 t = check_at - start_shift;
a3621e74 938 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
939 goto hop_and_restart;
940 }
b7953727 941 if (t != s) {
a3621e74 942 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 943 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
944 (long)(t - i_strpos), (long)(s - i_strpos))
945 );
946 }
947 else {
a3621e74 948 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
949 "Does not contradict STCLASS...\n");
950 );
951 }
6eb5f6b9 952 }
5e39e1e5 953 giveup:
a3621e74 954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
955 PL_colors[4], (check ? "Guessed" : "Giving up"),
956 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 957 return s;
2c2d71f5
JH
958
959 fail_finish: /* Substring not found */
33b8afdf
JH
960 if (prog->check_substr || prog->check_utf8) /* could be removed already */
961 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 962 fail:
a3621e74 963 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 964 PL_colors[4], PL_colors[5]));
cad2e5aa
JH
965 return Nullch;
966}
9661b544 967
6eb5f6b9 968/* We know what class REx starts with. Try to find this position... */
3c3eec57 969STATIC char *
a3b680e6 970S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 971{
27da23d5 972 dVAR;
1df70142 973 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 974 char *m;
d8093b23 975 STRLEN ln;
5dab1207 976 STRLEN lnc;
078c425b 977 register STRLEN uskip;
d8093b23
G
978 unsigned int c1;
979 unsigned int c2;
6eb5f6b9
JH
980 char *e;
981 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 982 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 983
6eb5f6b9
JH
984 /* We know what class it must start with. */
985 switch (OP(c)) {
6eb5f6b9 986 case ANYOF:
388cc4de 987 if (do_utf8) {
078c425b 988 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
989 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
990 !UTF8_IS_INVARIANT((U8)s[0]) ?
991 reginclass(c, (U8*)s, 0, do_utf8) :
992 REGINCLASS(c, (U8*)s)) {
993 if (tmp && (norun || regtry(prog, s)))
994 goto got_it;
995 else
996 tmp = doevery;
997 }
998 else
999 tmp = 1;
078c425b 1000 s += uskip;
388cc4de
HS
1001 }
1002 }
1003 else {
1004 while (s < strend) {
1005 STRLEN skip = 1;
1006
1007 if (REGINCLASS(c, (U8*)s) ||
1008 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1009 /* The assignment of 2 is intentional:
1010 * for the folded sharp s, the skip is 2. */
1011 (skip = SHARP_S_SKIP))) {
1012 if (tmp && (norun || regtry(prog, s)))
1013 goto got_it;
1014 else
1015 tmp = doevery;
1016 }
1017 else
1018 tmp = 1;
1019 s += skip;
1020 }
a0d0e21e 1021 }
6eb5f6b9 1022 break;
f33976b4
DB
1023 case CANY:
1024 while (s < strend) {
1025 if (tmp && (norun || regtry(prog, s)))
1026 goto got_it;
1027 else
1028 tmp = doevery;
1029 s++;
1030 }
1031 break;
6eb5f6b9 1032 case EXACTF:
5dab1207
NIS
1033 m = STRING(c);
1034 ln = STR_LEN(c); /* length to match in octets/bytes */
1035 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1036 if (UTF) {
a2a2844f 1037 STRLEN ulen1, ulen2;
5dab1207 1038 U8 *sm = (U8 *) m;
89ebb4a3
JH
1039 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1040 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
1041
1042 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1043 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1044
89ebb4a3 1045 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
872c91ae 1046 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
89ebb4a3 1047 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
872c91ae 1048 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1049 lnc = 0;
1050 while (sm < ((U8 *) m + ln)) {
1051 lnc++;
1052 sm += UTF8SKIP(sm);
1053 }
1aa99e6b
IH
1054 }
1055 else {
1056 c1 = *(U8*)m;
1057 c2 = PL_fold[c1];
1058 }
6eb5f6b9
JH
1059 goto do_exactf;
1060 case EXACTFL:
5dab1207
NIS
1061 m = STRING(c);
1062 ln = STR_LEN(c);
1063 lnc = (I32) ln;
d8093b23 1064 c1 = *(U8*)m;
6eb5f6b9
JH
1065 c2 = PL_fold_locale[c1];
1066 do_exactf:
db12adc6 1067 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1068
6eb5f6b9
JH
1069 if (norun && e < s)
1070 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1071
60a8b682
JH
1072 /* The idea in the EXACTF* cases is to first find the
1073 * first character of the EXACTF* node and then, if
1074 * necessary, case-insensitively compare the full
1075 * text of the node. The c1 and c2 are the first
1076 * characters (though in Unicode it gets a bit
1077 * more complicated because there are more cases
7f16dd3d
JH
1078 * than just upper and lower: one needs to use
1079 * the so-called folding case for case-insensitive
1080 * matching (called "loose matching" in Unicode).
1081 * ibcmp_utf8() will do just that. */
60a8b682 1082
1aa99e6b 1083 if (do_utf8) {
575cac57 1084 UV c, f;
89ebb4a3 1085 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1086 STRLEN len, foldlen;
d7f013c8 1087
09091399 1088 if (c1 == c2) {
5dab1207
NIS
1089 /* Upper and lower of 1st char are equal -
1090 * probably not a "letter". */
1aa99e6b 1091 while (s <= e) {
89ebb4a3 1092 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1093 ckWARN(WARN_UTF8) ?
1094 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1095 if ( c == c1
1096 && (ln == len ||
66423254 1097 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1098 m, (char **)0, ln, (bool)UTF))
55da9344 1099 && (norun || regtry(prog, s)) )
1aa99e6b 1100 goto got_it;
80aecb99 1101 else {
1df70142 1102 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1103 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1104 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1105 if ( f != c
1106 && (f == c1 || f == c2)
1107 && (ln == foldlen ||
66423254
JH
1108 !ibcmp_utf8((char *) foldbuf,
1109 (char **)0, foldlen, do_utf8,
d07ddd77 1110 m,
eb160463 1111 (char **)0, ln, (bool)UTF))
80aecb99
JH
1112 && (norun || regtry(prog, s)) )
1113 goto got_it;
1114 }
1aa99e6b
IH
1115 s += len;
1116 }
09091399
JH
1117 }
1118 else {
1aa99e6b 1119 while (s <= e) {
89ebb4a3 1120 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1121 ckWARN(WARN_UTF8) ?
1122 0 : UTF8_ALLOW_ANY);
80aecb99 1123
60a8b682 1124 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1125 * Note that not all the possible combinations
1126 * are handled here: some of them are handled
1127 * by the standard folding rules, and some of
1128 * them (the character class or ANYOF cases)
1129 * are handled during compiletime in
1130 * regexec.c:S_regclass(). */
880bd946
JH
1131 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1132 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1133 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1134
1135 if ( (c == c1 || c == c2)
1136 && (ln == len ||
66423254 1137 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1138 m, (char **)0, ln, (bool)UTF))
55da9344 1139 && (norun || regtry(prog, s)) )
1aa99e6b 1140 goto got_it;
80aecb99 1141 else {
1df70142 1142 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1143 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1144 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1145 if ( f != c
1146 && (f == c1 || f == c2)
1147 && (ln == foldlen ||
a6872d42 1148 !ibcmp_utf8((char *) foldbuf,
66423254 1149 (char **)0, foldlen, do_utf8,
d07ddd77 1150 m,
eb160463 1151 (char **)0, ln, (bool)UTF))
80aecb99
JH
1152 && (norun || regtry(prog, s)) )
1153 goto got_it;
1154 }
1aa99e6b
IH
1155 s += len;
1156 }
09091399 1157 }
1aa99e6b
IH
1158 }
1159 else {
1160 if (c1 == c2)
1161 while (s <= e) {
1162 if ( *(U8*)s == c1
1163 && (ln == 1 || !(OP(c) == EXACTF
1164 ? ibcmp(s, m, ln)
1165 : ibcmp_locale(s, m, ln)))
1166 && (norun || regtry(prog, s)) )
1167 goto got_it;
1168 s++;
1169 }
1170 else
1171 while (s <= e) {
1172 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1173 && (ln == 1 || !(OP(c) == EXACTF
1174 ? ibcmp(s, m, ln)
1175 : ibcmp_locale(s, m, ln)))
1176 && (norun || regtry(prog, s)) )
1177 goto got_it;
1178 s++;
1179 }
b3c9acc1
IZ
1180 }
1181 break;
bbce6d69 1182 case BOUNDL:
3280af22 1183 PL_reg_flags |= RF_tainted;
bbce6d69 1184 /* FALL THROUGH */
a0d0e21e 1185 case BOUND:
ffc61ed2 1186 if (do_utf8) {
12d33761 1187 if (s == PL_bostr)
ffc61ed2
JH
1188 tmp = '\n';
1189 else {
b4f7163a 1190 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1191
b4f7163a 1192 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1193 }
1194 tmp = ((OP(c) == BOUND ?
9041c2e3 1195 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1196 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1197 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1198 if (tmp == !(OP(c) == BOUND ?
3568d838 1199 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1200 isALNUM_LC_utf8((U8*)s)))
1201 {
1202 tmp = !tmp;
1203 if ((norun || regtry(prog, s)))
1204 goto got_it;
1205 }
078c425b 1206 s += uskip;
a687059c 1207 }
a0d0e21e 1208 }
667bb95a 1209 else {
12d33761 1210 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1211 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1212 while (s < strend) {
1213 if (tmp ==
1214 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1215 tmp = !tmp;
1216 if ((norun || regtry(prog, s)))
1217 goto got_it;
1218 }
1219 s++;
a0ed51b3 1220 }
a0ed51b3 1221 }
6eb5f6b9 1222 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1223 goto got_it;
1224 break;
bbce6d69 1225 case NBOUNDL:
3280af22 1226 PL_reg_flags |= RF_tainted;
bbce6d69 1227 /* FALL THROUGH */
a0d0e21e 1228 case NBOUND:
ffc61ed2 1229 if (do_utf8) {
12d33761 1230 if (s == PL_bostr)
ffc61ed2
JH
1231 tmp = '\n';
1232 else {
b4f7163a 1233 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1234
b4f7163a 1235 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1236 }
1237 tmp = ((OP(c) == NBOUND ?
9041c2e3 1238 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1239 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1240 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1241 if (tmp == !(OP(c) == NBOUND ?
3568d838 1242 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1243 isALNUM_LC_utf8((U8*)s)))
1244 tmp = !tmp;
1245 else if ((norun || regtry(prog, s)))
1246 goto got_it;
078c425b 1247 s += uskip;
ffc61ed2 1248 }
a0d0e21e 1249 }
667bb95a 1250 else {
12d33761 1251 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1252 tmp = ((OP(c) == NBOUND ?
1253 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1254 while (s < strend) {
1255 if (tmp ==
1256 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1257 tmp = !tmp;
1258 else if ((norun || regtry(prog, s)))
1259 goto got_it;
1260 s++;
1261 }
a0ed51b3 1262 }
6eb5f6b9 1263 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1264 goto got_it;
1265 break;
a0d0e21e 1266 case ALNUM:
ffc61ed2 1267 if (do_utf8) {
1a4fad37 1268 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1269 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1270 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1271 if (tmp && (norun || regtry(prog, s)))
1272 goto got_it;
1273 else
1274 tmp = doevery;
1275 }
bbce6d69 1276 else
ffc61ed2 1277 tmp = 1;
078c425b 1278 s += uskip;
bbce6d69 1279 }
bbce6d69 1280 }
ffc61ed2
JH
1281 else {
1282 while (s < strend) {
1283 if (isALNUM(*s)) {
1284 if (tmp && (norun || regtry(prog, s)))
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
a0ed51b3 1289 else
ffc61ed2
JH
1290 tmp = 1;
1291 s++;
a0ed51b3 1292 }
a0ed51b3
LW
1293 }
1294 break;
bbce6d69 1295 case ALNUML:
3280af22 1296 PL_reg_flags |= RF_tainted;
ffc61ed2 1297 if (do_utf8) {
078c425b 1298 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1299 if (isALNUM_LC_utf8((U8*)s)) {
1300 if (tmp && (norun || regtry(prog, s)))
1301 goto got_it;
1302 else
1303 tmp = doevery;
1304 }
a687059c 1305 else
ffc61ed2 1306 tmp = 1;
078c425b 1307 s += uskip;
a0d0e21e 1308 }
a0d0e21e 1309 }
ffc61ed2
JH
1310 else {
1311 while (s < strend) {
1312 if (isALNUM_LC(*s)) {
1313 if (tmp && (norun || regtry(prog, s)))
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
a0ed51b3 1318 else
ffc61ed2
JH
1319 tmp = 1;
1320 s++;
a0ed51b3 1321 }
a0ed51b3
LW
1322 }
1323 break;
a0d0e21e 1324 case NALNUM:
ffc61ed2 1325 if (do_utf8) {
1a4fad37 1326 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1327 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1328 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1329 if (tmp && (norun || regtry(prog, s)))
1330 goto got_it;
1331 else
1332 tmp = doevery;
1333 }
bbce6d69 1334 else
ffc61ed2 1335 tmp = 1;
078c425b 1336 s += uskip;
bbce6d69 1337 }
bbce6d69 1338 }
ffc61ed2
JH
1339 else {
1340 while (s < strend) {
1341 if (!isALNUM(*s)) {
1342 if (tmp && (norun || regtry(prog, s)))
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a0ed51b3 1347 else
ffc61ed2
JH
1348 tmp = 1;
1349 s++;
a0ed51b3 1350 }
a0ed51b3
LW
1351 }
1352 break;
bbce6d69 1353 case NALNUML:
3280af22 1354 PL_reg_flags |= RF_tainted;
ffc61ed2 1355 if (do_utf8) {
078c425b 1356 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1357 if (!isALNUM_LC_utf8((U8*)s)) {
1358 if (tmp && (norun || regtry(prog, s)))
1359 goto got_it;
1360 else
1361 tmp = doevery;
1362 }
a687059c 1363 else
ffc61ed2 1364 tmp = 1;
078c425b 1365 s += uskip;
a687059c 1366 }
a0d0e21e 1367 }
ffc61ed2
JH
1368 else {
1369 while (s < strend) {
1370 if (!isALNUM_LC(*s)) {
1371 if (tmp && (norun || regtry(prog, s)))
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
a0ed51b3 1376 else
ffc61ed2
JH
1377 tmp = 1;
1378 s++;
a0ed51b3 1379 }
a0ed51b3
LW
1380 }
1381 break;
a0d0e21e 1382 case SPACE:
ffc61ed2 1383 if (do_utf8) {
1a4fad37 1384 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1385 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1386 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1387 if (tmp && (norun || regtry(prog, s)))
1388 goto got_it;
1389 else
1390 tmp = doevery;
1391 }
a0d0e21e 1392 else
ffc61ed2 1393 tmp = 1;
078c425b 1394 s += uskip;
2304df62 1395 }
a0d0e21e 1396 }
ffc61ed2
JH
1397 else {
1398 while (s < strend) {
1399 if (isSPACE(*s)) {
1400 if (tmp && (norun || regtry(prog, s)))
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
a0ed51b3 1405 else
ffc61ed2
JH
1406 tmp = 1;
1407 s++;
a0ed51b3 1408 }
a0ed51b3
LW
1409 }
1410 break;
bbce6d69 1411 case SPACEL:
3280af22 1412 PL_reg_flags |= RF_tainted;
ffc61ed2 1413 if (do_utf8) {
078c425b 1414 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1415 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1416 if (tmp && (norun || regtry(prog, s)))
1417 goto got_it;
1418 else
1419 tmp = doevery;
1420 }
bbce6d69 1421 else
ffc61ed2 1422 tmp = 1;
078c425b 1423 s += uskip;
bbce6d69 1424 }
bbce6d69 1425 }
ffc61ed2
JH
1426 else {
1427 while (s < strend) {
1428 if (isSPACE_LC(*s)) {
1429 if (tmp && (norun || regtry(prog, s)))
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
a0ed51b3 1434 else
ffc61ed2
JH
1435 tmp = 1;
1436 s++;
a0ed51b3 1437 }
a0ed51b3
LW
1438 }
1439 break;
a0d0e21e 1440 case NSPACE:
ffc61ed2 1441 if (do_utf8) {
1a4fad37 1442 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1443 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1444 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1445 if (tmp && (norun || regtry(prog, s)))
1446 goto got_it;
1447 else
1448 tmp = doevery;
1449 }
a0d0e21e 1450 else
ffc61ed2 1451 tmp = 1;
078c425b 1452 s += uskip;
a687059c 1453 }
a0d0e21e 1454 }
ffc61ed2
JH
1455 else {
1456 while (s < strend) {
1457 if (!isSPACE(*s)) {
1458 if (tmp && (norun || regtry(prog, s)))
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
a0ed51b3 1463 else
ffc61ed2
JH
1464 tmp = 1;
1465 s++;
a0ed51b3 1466 }
a0ed51b3
LW
1467 }
1468 break;
bbce6d69 1469 case NSPACEL:
3280af22 1470 PL_reg_flags |= RF_tainted;
ffc61ed2 1471 if (do_utf8) {
078c425b 1472 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1473 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1474 if (tmp && (norun || regtry(prog, s)))
1475 goto got_it;
1476 else
1477 tmp = doevery;
1478 }
bbce6d69 1479 else
ffc61ed2 1480 tmp = 1;
078c425b 1481 s += uskip;
bbce6d69 1482 }
bbce6d69 1483 }
ffc61ed2
JH
1484 else {
1485 while (s < strend) {
1486 if (!isSPACE_LC(*s)) {
1487 if (tmp && (norun || regtry(prog, s)))
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
a0ed51b3 1492 else
ffc61ed2
JH
1493 tmp = 1;
1494 s++;
a0ed51b3 1495 }
a0ed51b3
LW
1496 }
1497 break;
a0d0e21e 1498 case DIGIT:
ffc61ed2 1499 if (do_utf8) {
1a4fad37 1500 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1501 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1502 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1503 if (tmp && (norun || regtry(prog, s)))
1504 goto got_it;
1505 else
1506 tmp = doevery;
1507 }
a0d0e21e 1508 else
ffc61ed2 1509 tmp = 1;
078c425b 1510 s += uskip;
2b69d0c2 1511 }
a0d0e21e 1512 }
ffc61ed2
JH
1513 else {
1514 while (s < strend) {
1515 if (isDIGIT(*s)) {
1516 if (tmp && (norun || regtry(prog, s)))
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
a0ed51b3 1521 else
ffc61ed2
JH
1522 tmp = 1;
1523 s++;
a0ed51b3 1524 }
a0ed51b3
LW
1525 }
1526 break;
b8c5462f
JH
1527 case DIGITL:
1528 PL_reg_flags |= RF_tainted;
ffc61ed2 1529 if (do_utf8) {
078c425b 1530 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1531 if (isDIGIT_LC_utf8((U8*)s)) {
1532 if (tmp && (norun || regtry(prog, s)))
1533 goto got_it;
1534 else
1535 tmp = doevery;
1536 }
b8c5462f 1537 else
ffc61ed2 1538 tmp = 1;
078c425b 1539 s += uskip;
b8c5462f 1540 }
b8c5462f 1541 }
ffc61ed2
JH
1542 else {
1543 while (s < strend) {
1544 if (isDIGIT_LC(*s)) {
1545 if (tmp && (norun || regtry(prog, s)))
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
b8c5462f 1550 else
ffc61ed2
JH
1551 tmp = 1;
1552 s++;
b8c5462f 1553 }
b8c5462f
JH
1554 }
1555 break;
a0d0e21e 1556 case NDIGIT:
ffc61ed2 1557 if (do_utf8) {
1a4fad37 1558 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1559 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1560 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1561 if (tmp && (norun || regtry(prog, s)))
1562 goto got_it;
1563 else
1564 tmp = doevery;
1565 }
a0d0e21e 1566 else
ffc61ed2 1567 tmp = 1;
078c425b 1568 s += uskip;
a687059c 1569 }
a0d0e21e 1570 }
ffc61ed2
JH
1571 else {
1572 while (s < strend) {
1573 if (!isDIGIT(*s)) {
1574 if (tmp && (norun || regtry(prog, s)))
1575 goto got_it;
1576 else
1577 tmp = doevery;
1578 }
a0ed51b3 1579 else
ffc61ed2
JH
1580 tmp = 1;
1581 s++;
a0ed51b3 1582 }
a0ed51b3
LW
1583 }
1584 break;
b8c5462f
JH
1585 case NDIGITL:
1586 PL_reg_flags |= RF_tainted;
ffc61ed2 1587 if (do_utf8) {
078c425b 1588 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1589 if (!isDIGIT_LC_utf8((U8*)s)) {
1590 if (tmp && (norun || regtry(prog, s)))
1591 goto got_it;
1592 else
1593 tmp = doevery;
1594 }
b8c5462f 1595 else
ffc61ed2 1596 tmp = 1;
078c425b 1597 s += uskip;
b8c5462f 1598 }
a0ed51b3 1599 }
ffc61ed2
JH
1600 else {
1601 while (s < strend) {
1602 if (!isDIGIT_LC(*s)) {
1603 if (tmp && (norun || regtry(prog, s)))
1604 goto got_it;
1605 else
1606 tmp = doevery;
1607 }
cf93c79d 1608 else
ffc61ed2
JH
1609 tmp = 1;
1610 s++;
b8c5462f 1611 }
b8c5462f
JH
1612 }
1613 break;
b3c9acc1 1614 default:
3c3eec57
GS
1615 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1616 break;
d6a28714 1617 }
6eb5f6b9
JH
1618 return 0;
1619 got_it:
1620 return s;
1621}
1622
1623/*
1624 - regexec_flags - match a regexp against a string
1625 */
1626I32
1627Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1628 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1629/* strend: pointer to null at end of string */
1630/* strbeg: real beginning of string */
1631/* minend: end of match must be >=minend after stringarg. */
1632/* data: May be used for some additional optimizations. */
1633/* nosave: For optimizations. */
1634{
6eb5f6b9
JH
1635 register char *s;
1636 register regnode *c;
1637 register char *startpos = stringarg;
6eb5f6b9
JH
1638 I32 minlen; /* must match at least this many chars */
1639 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1640 I32 end_shift = 0; /* Same for the end. */ /* CC */
1641 I32 scream_pos = -1; /* Internal iterator of scream. */
1642 char *scream_olds;
1643 SV* oreplsv = GvSV(PL_replgv);
1df70142 1644 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1645 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1646#ifdef DEBUGGING
9e55ce06
JH
1647 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1648 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1649#endif
a3621e74
YO
1650
1651 GET_RE_DEBUG_FLAGS_DECL;
1652
9d4ba2ae 1653 PERL_UNUSED_ARG(data);
a30b2f1f 1654 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1655
1656 PL_regcc = 0;
1657
1658 cache_re(prog);
1659#ifdef DEBUGGING
aea4f609 1660 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1661#endif
1662
1663 /* Be paranoid... */
1664 if (prog == NULL || startpos == NULL) {
1665 Perl_croak(aTHX_ "NULL regexp parameter");
1666 return 0;
1667 }
1668
1669 minlen = prog->minlen;
61a36c01 1670 if (strend - startpos < minlen) {
a3621e74 1671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1672 "String too short [regexec_flags]...\n"));
1673 goto phooey;
1aa99e6b 1674 }
6eb5f6b9 1675
6eb5f6b9
JH
1676 /* Check validity of program. */
1677 if (UCHARAT(prog->program) != REG_MAGIC) {
1678 Perl_croak(aTHX_ "corrupted regexp program");
1679 }
1680
1681 PL_reg_flags = 0;
1682 PL_reg_eval_set = 0;
1683 PL_reg_maxiter = 0;
1684
1685 if (prog->reganch & ROPT_UTF8)
1686 PL_reg_flags |= RF_utf8;
1687
1688 /* Mark beginning of line for ^ and lookbehind. */
1689 PL_regbol = startpos;
1690 PL_bostr = strbeg;
1691 PL_reg_sv = sv;
1692
1693 /* Mark end of line for $ (and such) */
1694 PL_regeol = strend;
1695
1696 /* see how far we have to get to not match where we matched before */
1697 PL_regtill = startpos+minend;
1698
1699 /* We start without call_cc context. */
1700 PL_reg_call_cc = 0;
1701
1702 /* If there is a "must appear" string, look for it. */
1703 s = startpos;
1704
1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1706 MAGIC *mg;
1707
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1709 PL_reg_ganch = startpos;
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1711 && SvMAGIC(sv)
14befaf4
DM
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
6eb5f6b9
JH
1714 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1715 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1716 if (s > PL_reg_ganch)
6eb5f6b9
JH
1717 goto phooey;
1718 s = PL_reg_ganch;
1719 }
1720 }
1721 else /* pos() not defined */
1722 PL_reg_ganch = strbeg;
1723 }
1724
33b8afdf 1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1726 re_scream_pos_data d;
1727
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1731 if (!s) {
a3621e74 1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1733 goto phooey; /* not present */
3fa9c3d7 1734 }
6eb5f6b9
JH
1735 }
1736
a3621e74 1737 DEBUG_EXECUTE_r({
1df70142
AL
1738 const char * const s0 = UTF
1739 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1740 UNI_DISPLAY_REGEX)
1741 : prog->precomp;
1742 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1743 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1744 UNI_DISPLAY_REGEX) : startpos;
1df70142 1745 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1746 if (!PL_colorset)
1747 reginitcolors();
1748 PerlIO_printf(Perl_debug_log,
a0288114 1749 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1750 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1751 len0, len0, s0,
2a782b5b 1752 PL_colors[1],
9e55ce06 1753 len0 > 60 ? "..." : "",
2a782b5b 1754 PL_colors[0],
9e55ce06
JH
1755 (int)(len1 > 60 ? 60 : len1),
1756 s1, PL_colors[1],
1757 (len1 > 60 ? "..." : "")
2a782b5b
JH
1758 );
1759 });
6eb5f6b9
JH
1760
1761 /* Simplest case: anchored match need be tried only once. */
1762 /* [unless only anchor is BOL and multiline is set] */
1763 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1764 if (s == startpos && regtry(prog, startpos))
1765 goto got_it;
7fba1cd6 1766 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1767 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1768 {
1769 char *end;
1770
1771 if (minlen)
1772 dontbother = minlen - 1;
1aa99e6b 1773 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1774 /* for multiline we only have to try after newlines */
33b8afdf 1775 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1776 if (s == startpos)
1777 goto after_try;
1778 while (1) {
1779 if (regtry(prog, s))
1780 goto got_it;
1781 after_try:
1782 if (s >= end)
1783 goto phooey;
1784 if (prog->reganch & RE_USE_INTUIT) {
1785 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1786 if (!s)
1787 goto phooey;
1788 }
1789 else
1790 s++;
1791 }
1792 } else {
1793 if (s > startpos)
1794 s--;
1795 while (s < end) {
1796 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1797 if (regtry(prog, s))
1798 goto got_it;
1799 }
1800 }
1801 }
1802 }
1803 goto phooey;
1804 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1805 if (regtry(prog, PL_reg_ganch))
1806 goto got_it;
1807 goto phooey;
1808 }
1809
1810 /* Messy cases: unanchored match. */
33b8afdf 1811 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1812 /* we have /x+whatever/ */
1813 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1814 char ch;
bf93d4cc
GS
1815#ifdef DEBUGGING
1816 int did_match = 0;
1817#endif
33b8afdf
JH
1818 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1819 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1820 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1821
1aa99e6b 1822 if (do_utf8) {
6eb5f6b9
JH
1823 while (s < strend) {
1824 if (*s == ch) {
a3621e74 1825 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1826 if (regtry(prog, s)) goto got_it;
1827 s += UTF8SKIP(s);
1828 while (s < strend && *s == ch)
1829 s += UTF8SKIP(s);
1830 }
1831 s += UTF8SKIP(s);
1832 }
1833 }
1834 else {
1835 while (s < strend) {
1836 if (*s == ch) {
a3621e74 1837 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1838 if (regtry(prog, s)) goto got_it;
1839 s++;
1840 while (s < strend && *s == ch)
1841 s++;
1842 }
1843 s++;
1844 }
1845 }
a3621e74 1846 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1847 PerlIO_printf(Perl_debug_log,
b7953727
JH
1848 "Did not find anchored character...\n")
1849 );
6eb5f6b9 1850 }
33b8afdf
JH
1851 else if (prog->anchored_substr != Nullsv
1852 || prog->anchored_utf8 != Nullsv
1853 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1854 && prog->float_max_offset < strend - s)) {
1855 SV *must;
1856 I32 back_max;
1857 I32 back_min;
1858 char *last;
6eb5f6b9 1859 char *last1; /* Last position checked before */
bf93d4cc
GS
1860#ifdef DEBUGGING
1861 int did_match = 0;
1862#endif
33b8afdf
JH
1863 if (prog->anchored_substr || prog->anchored_utf8) {
1864 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1865 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1866 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1867 back_max = back_min = prog->anchored_offset;
1868 } else {
1869 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1870 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1871 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1872 back_max = prog->float_max_offset;
1873 back_min = prog->float_min_offset;
1874 }
1875 if (must == &PL_sv_undef)
1876 /* could not downgrade utf8 check substring, so must fail */
1877 goto phooey;
1878
1879 last = HOP3c(strend, /* Cannot start after this */
1880 -(I32)(CHR_SVLEN(must)
1881 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1882
1883 if (s > PL_bostr)
1884 last1 = HOPc(s, -1);
1885 else
1886 last1 = s - 1; /* bogus */
1887
a0288114 1888 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1889 check_substr==must. */
1890 scream_pos = -1;
1891 dontbother = end_shift;
1892 strend = HOPc(strend, -dontbother);
1893 while ( (s <= last) &&
9041c2e3 1894 ((flags & REXEC_SCREAM)
1aa99e6b 1895 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1896 end_shift, &scream_pos, 0))
1aa99e6b 1897 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1898 (unsigned char*)strend, must,
7fba1cd6 1899 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1900 /* we may be pointing at the wrong string */
1901 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1902 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1903 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1904 if (HOPc(s, -back_max) > last1) {
1905 last1 = HOPc(s, -back_min);
1906 s = HOPc(s, -back_max);
1907 }
1908 else {
1909 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1910
1911 last1 = HOPc(s, -back_min);
1912 s = t;
1913 }
1aa99e6b 1914 if (do_utf8) {
6eb5f6b9
JH
1915 while (s <= last1) {
1916 if (regtry(prog, s))
1917 goto got_it;
1918 s += UTF8SKIP(s);
1919 }
1920 }
1921 else {
1922 while (s <= last1) {
1923 if (regtry(prog, s))
1924 goto got_it;
1925 s++;
1926 }
1927 }
1928 }
a3621e74 1929 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1930 PerlIO_printf(Perl_debug_log,
a0288114 1931 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1932 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1933 ? "anchored" : "floating"),
1934 PL_colors[0],
1935 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1936 SvPVX_const(must),
b7953727
JH
1937 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1938 );
6eb5f6b9
JH
1939 goto phooey;
1940 }
155aba94 1941 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1942 if (minlen) {
1943 I32 op = (U8)OP(prog->regstclass);
66e933ab 1944 /* don't bother with what can't match */
f14c76ed
RGS
1945 if (PL_regkind[op] != EXACT && op != CANY)
1946 strend = HOPc(strend, -(minlen - 1));
1947 }
a3621e74 1948 DEBUG_EXECUTE_r({
ffc61ed2 1949 SV *prop = sv_newmortal();
cfd0369c
NC
1950 const char *s0;
1951 const char *s1;
9e55ce06
JH
1952 int len0;
1953 int len1;
1954
ffc61ed2 1955 regprop(prop, c);
9e55ce06 1956 s0 = UTF ?
3f7c398e 1957 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1958 UNI_DISPLAY_REGEX) :
cfd0369c 1959 SvPVX_const(prop);
9e55ce06
JH
1960 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1961 s1 = UTF ?
c728cb41 1962 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1963 len1 = UTF ? SvCUR(dsv1) : strend - s;
1964 PerlIO_printf(Perl_debug_log,
a0288114 1965 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1966 len0, len0, s0,
1967 len1, len1, s1);
ffc61ed2 1968 });
06b5626a 1969 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1970 goto got_it;
a3621e74 1971 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1972 }
1973 else {
1974 dontbother = 0;
33b8afdf
JH
1975 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1976 /* Trim the end. */
d6a28714 1977 char *last;
33b8afdf
JH
1978 SV* float_real;
1979
1980 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1981 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1982 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1983
1984 if (flags & REXEC_SCREAM) {
33b8afdf 1985 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1986 end_shift, &scream_pos, 1); /* last one */
1987 if (!last)
ffc61ed2 1988 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1989 /* we may be pointing at the wrong string */
1990 else if (RX_MATCH_COPIED(prog))
3f7c398e 1991 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1992 }
d6a28714
JH
1993 else {
1994 STRLEN len;
cfd0369c 1995 const char * const little = SvPV_const(float_real, len);
d6a28714 1996
33b8afdf 1997 if (SvTAIL(float_real)) {
d6a28714
JH
1998 if (memEQ(strend - len + 1, little, len - 1))
1999 last = strend - len + 1;
7fba1cd6 2000 else if (!multiline)
9041c2e3 2001 last = memEQ(strend - len, little, len)
d6a28714 2002 ? strend - len : Nullch;
b8c5462f 2003 else
d6a28714
JH
2004 goto find_last;
2005 } else {
2006 find_last:
9041c2e3 2007 if (len)
d6a28714 2008 last = rninstr(s, strend, little, little + len);
b8c5462f 2009 else
a0288114 2010 last = strend; /* matching "$" */
b8c5462f 2011 }
b8c5462f 2012 }
bf93d4cc 2013 if (last == NULL) {
a3621e74 2014 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2015 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2016 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2017 goto phooey; /* Should not happen! */
2018 }
d6a28714
JH
2019 dontbother = strend - last + prog->float_min_offset;
2020 }
2021 if (minlen && (dontbother < minlen))
2022 dontbother = minlen - 1;
2023 strend -= dontbother; /* this one's always in bytes! */
2024 /* We don't know much -- general case. */
1aa99e6b 2025 if (do_utf8) {
d6a28714
JH
2026 for (;;) {
2027 if (regtry(prog, s))
2028 goto got_it;
2029 if (s >= strend)
2030 break;
b8c5462f 2031 s += UTF8SKIP(s);
d6a28714
JH
2032 };
2033 }
2034 else {
2035 do {
2036 if (regtry(prog, s))
2037 goto got_it;
2038 } while (s++ < strend);
2039 }
2040 }
2041
2042 /* Failure. */
2043 goto phooey;
2044
2045got_it:
2046 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2047
2048 if (PL_reg_eval_set) {
2049 /* Preserve the current value of $^R */
2050 if (oreplsv != GvSV(PL_replgv))
2051 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2052 restored, the value remains
2053 the same. */
acfe0abc 2054 restore_pos(aTHX_ 0);
d6a28714
JH
2055 }
2056
2057 /* make sure $`, $&, $', and $digit will work later */
2058 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2059 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2060 if (flags & REXEC_COPY_STR) {
2061 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2062#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2063 if ((SvIsCOW(sv)
2064 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2065 if (DEBUG_C_TEST) {
2066 PerlIO_printf(Perl_debug_log,
2067 "Copy on write: regexp capture, type %d\n",
2068 (int) SvTYPE(sv));
2069 }
2070 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2071 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2072 assert (SvPOKp(prog->saved_copy));
2073 } else
2074#endif
2075 {
2076 RX_MATCH_COPIED_on(prog);
2077 s = savepvn(strbeg, i);
2078 prog->subbeg = s;
2079 }
d6a28714 2080 prog->sublen = i;
d6a28714
JH
2081 }
2082 else {
2083 prog->subbeg = strbeg;
2084 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2085 }
2086 }
9041c2e3 2087
d6a28714
JH
2088 return 1;
2089
2090phooey:
a3621e74 2091 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2092 PL_colors[4], PL_colors[5]));
d6a28714 2093 if (PL_reg_eval_set)
acfe0abc 2094 restore_pos(aTHX_ 0);
d6a28714
JH
2095 return 0;
2096}
2097
2098/*
2099 - regtry - try match at specific point
2100 */
2101STATIC I32 /* 0 failure, 1 success */
2102S_regtry(pTHX_ regexp *prog, char *startpos)
2103{
d6a28714
JH
2104 register I32 i;
2105 register I32 *sp;
2106 register I32 *ep;
2107 CHECKPOINT lastcp;
a3621e74 2108 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2109
02db2b7b
IZ
2110#ifdef DEBUGGING
2111 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2112#endif
d6a28714
JH
2113 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2114 MAGIC *mg;
2115
2116 PL_reg_eval_set = RS_init;
a3621e74 2117 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2118 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2119 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2120 ));
e8347627 2121 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2122 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2123 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2124 SAVETMPS;
2125 /* Apparently this is not needed, judging by wantarray. */
e8347627 2126 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2127 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2128
2129 if (PL_reg_sv) {
2130 /* Make $_ available to executed code. */
2131 if (PL_reg_sv != DEFSV) {
59f00321 2132 SAVE_DEFSV;
d6a28714 2133 DEFSV = PL_reg_sv;
b8c5462f 2134 }
d6a28714 2135
9041c2e3 2136 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2137 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2138 /* prepare for quick setting of pos */
14befaf4
DM
2139 sv_magic(PL_reg_sv, (SV*)0,
2140 PERL_MAGIC_regex_global, Nullch, 0);
2141 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2142 mg->mg_len = -1;
b8c5462f 2143 }
d6a28714
JH
2144 PL_reg_magic = mg;
2145 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2146 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2147 }
09687e5a 2148 if (!PL_reg_curpm) {
a02a5408 2149 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2150#ifdef USE_ITHREADS
2151 {
2152 SV* repointer = newSViv(0);
577e12cc 2153 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2154 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2155 av_push(PL_regex_padav,repointer);
2156 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2157 PL_regex_pad = AvARRAY(PL_regex_padav);
2158 }
2159#endif
2160 }
aaa362c4 2161 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2162 PL_reg_oldcurpm = PL_curpm;
2163 PL_curpm = PL_reg_curpm;
2164 if (RX_MATCH_COPIED(prog)) {
2165 /* Here is a serious problem: we cannot rewrite subbeg,
2166 since it may be needed if this match fails. Thus
2167 $` inside (?{}) could fail... */
2168 PL_reg_oldsaved = prog->subbeg;
2169 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2170#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2171 PL_nrs = prog->saved_copy;
2172#endif
d6a28714
JH
2173 RX_MATCH_COPIED_off(prog);
2174 }
2175 else
2176 PL_reg_oldsaved = Nullch;
2177 prog->subbeg = PL_bostr;
2178 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2179 }
2180 prog->startp[0] = startpos - PL_bostr;
2181 PL_reginput = startpos;
2182 PL_regstartp = prog->startp;
2183 PL_regendp = prog->endp;
2184 PL_reglastparen = &prog->lastparen;
a01268b5 2185 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2186 prog->lastparen = 0;
03994de8 2187 prog->lastcloseparen = 0;
d6a28714 2188 PL_regsize = 0;
a3621e74 2189 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2190 if (PL_reg_start_tmpl <= prog->nparens) {
2191 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2192 if(PL_reg_start_tmp)
2193 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2194 else
a02a5408 2195 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2196 }
2197
2198 /* XXXX What this code is doing here?!!! There should be no need
2199 to do this again and again, PL_reglastparen should take care of
3dd2943c 2200 this! --ilya*/
dafc8851
JH
2201
2202 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2203 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2204 * PL_reglastparen), is not needed at all by the test suite
2205 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2206 * enough, for building DynaLoader, or otherwise this
2207 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2208 * will happen. Meanwhile, this code *is* needed for the
2209 * above-mentioned test suite tests to succeed. The common theme
2210 * on those tests seems to be returning null fields from matches.
2211 * --jhi */
dafc8851 2212#if 1
d6a28714
JH
2213 sp = prog->startp;
2214 ep = prog->endp;
2215 if (prog->nparens) {
eb160463 2216 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2217 *++sp = -1;
2218 *++ep = -1;
2219 }
2220 }
dafc8851 2221#endif
02db2b7b 2222 REGCP_SET(lastcp);
d6a28714
JH
2223 if (regmatch(prog->program + 1)) {
2224 prog->endp[0] = PL_reginput - PL_bostr;
2225 return 1;
2226 }
02db2b7b 2227 REGCP_UNWIND(lastcp);
d6a28714
JH
2228 return 0;
2229}
2230
02db2b7b
IZ
2231#define RE_UNWIND_BRANCH 1
2232#define RE_UNWIND_BRANCHJ 2
2233
2234union re_unwind_t;
2235
2236typedef struct { /* XX: makes sense to enlarge it... */
2237 I32 type;
2238 I32 prev;
2239 CHECKPOINT lastcp;
2240} re_unwind_generic_t;
2241
2242typedef struct {
2243 I32 type;
2244 I32 prev;
2245 CHECKPOINT lastcp;
2246 I32 lastparen;
2247 regnode *next;
2248 char *locinput;
2249 I32 nextchr;
2250#ifdef DEBUGGING
2251 int regindent;
2252#endif
2253} re_unwind_branch_t;
2254
2255typedef union re_unwind_t {
2256 I32 type;
2257 re_unwind_generic_t generic;
2258 re_unwind_branch_t branch;
2259} re_unwind_t;
2260
8ba1375e
MJD
2261#define sayYES goto yes
2262#define sayNO goto no
e0f9d4a8 2263#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2264#define sayYES_FINAL goto yes_final
2265#define sayYES_LOUD goto yes_loud
2266#define sayNO_FINAL goto no_final
2267#define sayNO_SILENT goto do_no
2268#define saySAME(x) if (x) goto yes; else goto no
2269
3ab3c9b4
HS
2270#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2271#define POSCACHE_SEEN 1 /* we know what we're caching */
2272#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2273#define CACHEsayYES STMT_START { \
2274 if (cache_offset | cache_bit) { \
2275 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2276 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2277 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2278 /* cache records failure, but this is success */ \
2279 DEBUG_r( \
2280 PerlIO_printf(Perl_debug_log, \
2281 "%*s (remove success from failure cache)\n", \
2282 REPORT_CODE_OFF+PL_regindent*2, "") \
2283 ); \
2284 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2285 } \
2286 } \
2287 sayYES; \
2288} STMT_END
2289#define CACHEsayNO STMT_START { \
2290 if (cache_offset | cache_bit) { \
2291 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2292 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2293 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2294 /* cache records success, but this is failure */ \
2295 DEBUG_r( \
2296 PerlIO_printf(Perl_debug_log, \
2297 "%*s (remove failure from success cache)\n", \
2298 REPORT_CODE_OFF+PL_regindent*2, "") \
2299 ); \
2300 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2301 } \
2302 } \
2303 sayNO; \
2304} STMT_END
2305
a3621e74
YO
2306/* this is used to determine how far from the left messages like
2307 'failed...' are printed. Currently 29 makes these messages line
2308 up with the opcode they refer to. Earlier perls used 25 which
2309 left these messages outdented making reviewing a debug output
2310 quite difficult.
2311*/
2312#define REPORT_CODE_OFF 29
2313
2314
2315/* Make sure there is a test for this +1 options in re_tests */
2316#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2317
2318#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2319 if ( trie->states[ state ].wordnum ) { \
2320 if ( !accepted ) { \
2321 ENTER; \
2322 SAVETMPS; \
2323 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2324 sv_accept_buff=NEWSV( 1234, \
2325 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2326 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2327 SvPOK_on( sv_accept_buff ); \
2328 sv_2mortal( sv_accept_buff ); \
2329 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2330 } else { \
2331 if ( accepted >= bufflen ) { \
2332 bufflen *= 2; \
2333 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2334 bufflen * sizeof(reg_trie_accepted) ); \
2335 } \
2336 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2337 + sizeof( reg_trie_accepted ) ); \
2338 } \
2339 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2340 accept_buff[ accepted ].endpos = uc; \
2341 ++accepted; \
2342 } } STMT_END
2343
2344#define TRIE_HANDLE_CHAR STMT_START { \
2345 if ( uvc < 256 ) { \
2346 charid = trie->charmap[ uvc ]; \
2347 } else { \
2348 charid = 0; \
2349 if( trie->widecharmap ) { \
2350 SV** svpp = (SV**)NULL; \
2351 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2352 sizeof( UV ), 0 ); \
2353 if ( svpp ) { \
2354 charid = (U16)SvIV( *svpp ); \
2355 } \
2356 } \
2357 } \
2358 if ( charid && \
cc601c31
YO
2359 ( base + charid > trie->uniquecharcount ) && \
2360 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2361 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2362 { \
2363 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2364 } else { \
2365 state = 0; \
2366 } \
2367 uc += len; \
2368 } STMT_END
8ba1375e 2369
d6a28714
JH
2370/*
2371 - regmatch - main matching routine
2372 *
2373 * Conceptually the strategy is simple: check to see whether the current
2374 * node matches, call self recursively to see whether the rest matches,
2375 * and then act accordingly. In practice we make some effort to avoid
2376 * recursion, in particular by going through "ordinary" nodes (that don't
2377 * need to know whether the rest of the match failed) by a loop instead of
2378 * by recursion.
2379 */
2380/* [lwall] I've hoisted the register declarations to the outer block in order to
2381 * maybe save a little bit of pushing and popping on the stack. It also takes
2382 * advantage of machines that use a register save mask on subroutine entry.
2383 */
2384STATIC I32 /* 0 failure, 1 success */
2385S_regmatch(pTHX_ regnode *prog)
2386{
27da23d5 2387 dVAR;
d6a28714
JH
2388 register regnode *scan; /* Current node. */
2389 regnode *next; /* Next node. */
2390 regnode *inner; /* Next node in internal branch. */
2391 register I32 nextchr; /* renamed nextchr - nextchar colides with
2392 function of same name */
2393 register I32 n; /* no or next */
b7953727
JH
2394 register I32 ln = 0; /* len or last */
2395 register char *s = Nullch; /* operand or save */
d6a28714 2396 register char *locinput = PL_reginput;
b7953727 2397 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2398 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2399 I32 unwind = 0;
a3621e74
YO
2400
2401 /* used by the trie code */
ab74612d
NC
2402 SV *sv_accept_buff = 0; /* accepting states we have traversed */
2403 reg_trie_accepted *accept_buff = 0; /* "" */
2404 reg_trie_data *trie; /* what trie are we using right now */
2405 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2406
b7953727 2407#if 0
02db2b7b 2408 I32 firstcp = PL_savestack_ix;
b7953727 2409#endif
a3b680e6 2410 const register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2411#ifdef DEBUGGING
ce333219
JH
2412 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2413 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2414 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2415
ab74612d 2416 SV *re_debug_flags = NULL;
2a782b5b 2417#endif
02db2b7b 2418
a3621e74
YO
2419 GET_RE_DEBUG_FLAGS;
2420
d6a28714
JH
2421#ifdef DEBUGGING
2422 PL_regindent++;
2423#endif
2424
a3621e74 2425
d6a28714
JH
2426 /* Note that nextchr is a byte even in UTF */
2427 nextchr = UCHARAT(locinput);
2428 scan = prog;
2429 while (scan != NULL) {
8ba1375e 2430
a3621e74 2431 DEBUG_EXECUTE_r( {
d6a28714 2432 SV *prop = sv_newmortal();
1df70142
AL
2433 const int docolor = *PL_colors[0];
2434 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2435 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2436 /* The part of the string before starttry has one color
2437 (pref0_len chars), between starttry and current
2438 position another one (pref_len - pref0_len chars),
2439 after the current position the third one.
2440 We assume that pref0_len <= pref_len, otherwise we
2441 decrease pref0_len. */
9041c2e3 2442 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2443 ? (5 + taill) - l : locinput - PL_bostr;
2444 int pref0_len;
d6a28714 2445
df1ffd02 2446 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2447 pref_len++;
2448 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2449 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2450 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2451 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2452 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2453 l--;
d6a28714
JH
2454 if (pref0_len < 0)
2455 pref0_len = 0;
2456 if (pref0_len > pref_len)
2457 pref0_len = pref_len;
2458 regprop(prop, scan);
2a782b5b 2459 {
1df70142 2460 const char * const s0 =
f14c76ed 2461 do_utf8 && OP(scan) != CANY ?
2a782b5b 2462 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2463 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2464 locinput - pref_len;
1df70142
AL
2465 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2466 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2467 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2468 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2469 locinput - pref_len + pref0_len;
1df70142
AL
2470 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2471 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2472 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2473 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2474 locinput;
1df70142 2475 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2476 PerlIO_printf(Perl_debug_log,
2477 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2478 (IV)(locinput - PL_bostr),
2479 PL_colors[4],
2480 len0, s0,
2481 PL_colors[5],
2482 PL_colors[2],
2483 len1, s1,
2484 PL_colors[3],
2485 (docolor ? "" : "> <"),
2486 PL_colors[0],
2487 len2, s2,
2488 PL_colors[1],
2489 15 - l - pref_len + 1,
2490 "",
2491 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2492 SvPVX_const(prop));
2a782b5b
JH
2493 }
2494 });
d6a28714
JH
2495
2496 next = scan + NEXT_OFF(scan);
2497 if (next == scan)
2498 next = NULL;
2499
2500 switch (OP(scan)) {
2501 case BOL:
7fba1cd6 2502 if (locinput == PL_bostr)
d6a28714
JH
2503 {
2504 /* regtill = regbol; */
b8c5462f
JH
2505 break;
2506 }
d6a28714
JH
2507 sayNO;
2508 case MBOL:
12d33761
HS
2509 if (locinput == PL_bostr ||
2510 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2511 {
b8c5462f
JH
2512 break;
2513 }
d6a28714
JH
2514 sayNO;
2515 case SBOL:
c2a73568 2516 if (locinput == PL_bostr)
b8c5462f 2517 break;
d6a28714
JH
2518 sayNO;
2519 case GPOS:
2520 if (locinput == PL_reg_ganch)
2521 break;
2522 sayNO;
2523 case EOL:
d6a28714
JH
2524 goto seol;
2525 case MEOL:
d6a28714 2526 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2527 sayNO;
b8c5462f 2528 break;
d6a28714
JH
2529 case SEOL:
2530 seol:
2531 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2532 sayNO;
d6a28714 2533 if (PL_regeol - locinput > 1)
b8c5462f 2534 sayNO;
b8c5462f 2535 break;
d6a28714
JH
2536 case EOS:
2537 if (PL_regeol != locinput)
b8c5462f 2538 sayNO;
d6a28714 2539 break;
ffc61ed2 2540 case SANY:
d6a28714 2541 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2542 sayNO;
f33976b4
DB
2543 if (do_utf8) {
2544 locinput += PL_utf8skip[nextchr];
2545 if (locinput > PL_regeol)
2546 sayNO;
2547 nextchr = UCHARAT(locinput);
2548 }
2549 else
2550 nextchr = UCHARAT(++locinput);
2551 break;
2552 case CANY:
2553 if (!nextchr && locinput >= PL_regeol)
2554 sayNO;
b8c5462f 2555 nextchr = UCHARAT(++locinput);
a0d0e21e 2556 break;
ffc61ed2 2557 case REG_ANY:
1aa99e6b
IH
2558 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2559 sayNO;
2560 if (do_utf8) {
b8c5462f 2561 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2562 if (locinput > PL_regeol)
2563 sayNO;
a0ed51b3 2564 nextchr = UCHARAT(locinput);
a0ed51b3 2565 }
1aa99e6b
IH
2566 else
2567 nextchr = UCHARAT(++locinput);
a0ed51b3 2568 break;
a3621e74
YO
2569
2570
2571
2572 /*
2573 traverse the TRIE keeping track of all accepting states
2574 we transition through until we get to a failing node.
2575
2576 we use two slightly different pieces of code to handle
2577 the traversal depending on whether its case sensitive or
2578 not. we reuse the accept code however. (this should probably
2579 be turned into a macro.)
2580
2581 */
2582 case TRIEF:
2583 case TRIEFL:
2584 {
2585
a3b680e6 2586 const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
2587 U8 *uc = ( U8* )locinput;
2588 U32 state = 1;
2589 U16 charid = 0;
2590 U32 base = 0;
2591 UV uvc = 0;
2592 STRLEN len = 0;
2593 STRLEN foldlen = 0;
a3621e74
YO
2594 U8 *uscan = (U8*)NULL;
2595 STRLEN bufflen=0;
2596 accepted = 0;
2597
2598 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2599
2600 while ( state && uc <= (U8*)PL_regeol ) {
2601
2602 TRIE_CHECK_STATE_IS_ACCEPTING;
2603
2604 base = trie->states[ state ].trans.base;
2605
2606 DEBUG_TRIE_EXECUTE_r(
2607 PerlIO_printf( Perl_debug_log,
e4584336 2608 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2609 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2610 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2611 );
2612
2613 if ( base ) {
2614
108bb1ad 2615 if ( do_utf8 ) {
a3621e74
YO
2616 if ( foldlen>0 ) {
2617 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2618 foldlen -= len;
2619 uscan += len;
2620 len=0;
2621 } else {
1df70142 2622 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2623 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2624 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2625 foldlen -= UNISKIP( uvc );
2626 uscan = foldbuf + UNISKIP( uvc );
2627 }
2628 } else {
e4584336 2629 uvc = (UV)*uc;
a3621e74
YO
2630 len = 1;
2631 }
2632
2633 TRIE_HANDLE_CHAR;
2634
2635 } else {
2636 state = 0;
2637 }
2638 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2639 PerlIO_printf( Perl_debug_log,
2640 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2641 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2642 );
2643 }
2644 if ( !accepted ) {
2645 sayNO;
2646 } else {
2647 goto TrieAccept;
2648 }
2649 }
2650 /* unreached codepoint: we jump into the middle of the next case
2651 from previous if blocks */
2652 case TRIE:
2653 {
a3b680e6 2654 const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
2655 U8 *uc = (U8*)locinput;
2656 U32 state = 1;
2657 U16 charid = 0;
2658 U32 base = 0;
2659 UV uvc = 0;
2660 STRLEN len = 0;
2661 STRLEN bufflen = 0;
2662 accepted = 0;
2663
2664 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2665
2666 while ( state && uc <= (U8*)PL_regeol ) {
2667
2668 TRIE_CHECK_STATE_IS_ACCEPTING;
2669
2670 base = trie->states[ state ].trans.base;
2671
2672 DEBUG_TRIE_EXECUTE_r(
2673 PerlIO_printf( Perl_debug_log,
e4584336 2674 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2675 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2676 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2677 );
2678
2679 if ( base ) {
2680
108bb1ad 2681 if ( do_utf8 ) {
a3621e74
YO
2682 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2683 } else {
2684 uvc = (U32)*uc;
2685 len = 1;
2686 }
2687
2688 TRIE_HANDLE_CHAR;
2689
2690 } else {
2691 state = 0;
2692 }
2693 DEBUG_TRIE_EXECUTE_r(
2694 PerlIO_printf( Perl_debug_log,
e4584336
RB
2695 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2696 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2697 );
2698 }
2699 if ( !accepted ) {
2700 sayNO;
2701 }
2702 }
2703
2704
2705 /*
2706 There was at least one accepting state that we
2707 transitioned through. Presumably the number of accepting
2708 states is going to be low, typically one or two. So we
2709 simply scan through to find the one with lowest wordnum.
2710 Once we find it, we swap the last state into its place
2711 and decrement the size. We then try to match the rest of
2712 the pattern at the point where the word ends, if we
2713 succeed then we end the loop, otherwise the loop
2714 eventually terminates once all of the accepting states
2715 have been tried.
2716 */
2717 TrieAccept:
2718 {
2719 int gotit = 0;
2720
2721 if ( accepted == 1 ) {
2722 DEBUG_EXECUTE_r({
2723 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2724 PerlIO_printf( Perl_debug_log,
2725 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2726 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2727 accept_buff[ 0 ].wordnum,
cfd0369c 2728 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2729 PL_colors[5] );
2730 });
cc601c31 2731 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2732 /* in this case we free tmps/leave before we call regmatch
2733 as we wont be using accept_buff again. */
2734 FREETMPS;
2735 LEAVE;
2736 gotit = regmatch( scan + NEXT_OFF( scan ) );
2737 } else {
2738 DEBUG_EXECUTE_r(
e4584336
RB
2739 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2740 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2741 PL_colors[5] );
2742 );
2743 while ( !gotit && accepted-- ) {
2744 U32 best = 0;
2745 U32 cur;
2746 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2747 DEBUG_TRIE_EXECUTE_r(
2748 PerlIO_printf( Perl_debug_log,
2749 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2750 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2751 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2752 accept_buff[ cur ].wordnum, PL_colors[5] );
2753 );
a3621e74
YO
2754
2755 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2756 best = cur;
2757 }
2758 DEBUG_EXECUTE_r({
2759 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2760 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2761 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2762 accept_buff[best].wordnum,
cfd0369c 2763 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2764 PL_colors[5] );
2765 });
2766 if ( best<accepted ) {
2767 reg_trie_accepted tmp = accept_buff[ best ];
2768 accept_buff[ best ] = accept_buff[ accepted ];
2769 accept_buff[ accepted ] = tmp;
2770 best = accepted;
2771 }
cc601c31 2772 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2773
2774 /*
2775 as far as I can tell we only need the SAVETMPS/FREETMPS
2776 for re's with EVAL in them but I'm leaving them in for
2777 all until I can be sure.
2778 */
2779 SAVETMPS;
2780 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2781 FREETMPS;
2782 }
2783 FREETMPS;
2784 LEAVE;
2785 }
2786
2787 if ( gotit ) {
2788 sayYES;
2789 } else {
2790 sayNO;
2791 }
2792 }
2793 /* unreached codepoint */
d6a28714 2794 case EXACT:
cd439c50
IZ
2795 s = STRING(scan);
2796 ln = STR_LEN(scan);
eb160463 2797 if (do_utf8 != UTF) {
bc517b45 2798 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2799 char *l = locinput;
a3b680e6 2800 const char *e = s + ln;
a72c7584 2801
5ff6fc6d
JH
2802 if (do_utf8) {
2803 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2804 while (s < e) {
a3b680e6 2805 STRLEN ulen;
1aa99e6b 2806 if (l >= PL_regeol)
5ff6fc6d
JH
2807 sayNO;
2808 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2809 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2810 ckWARN(WARN_UTF8) ?
2811 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2812 sayNO;
bc517b45 2813 l += ulen;
5ff6fc6d 2814 s ++;
1aa99e6b 2815 }
5ff6fc6d
JH
2816 }
2817 else {
2818 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2819 while (s < e) {
a3b680e6 2820 STRLEN ulen;
1aa99e6b
IH
2821 if (l >= PL_regeol)
2822 sayNO;
5ff6fc6d 2823 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2824 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2825 ckWARN(WARN_UTF8) ?
2826 0 : UTF8_ALLOW_ANY))
1aa99e6b 2827 sayNO;
bc517b45 2828 s += ulen;
a72c7584 2829 l ++;
1aa99e6b 2830 }
5ff6fc6d 2831 }
1aa99e6b
IH
2832 locinput = l;
2833 nextchr = UCHARAT(locinput);
2834 break;
2835 }
bc517b45 2836 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2837 /* Inline the first character, for speed. */
2838 if (UCHARAT(s) != nextchr)
2839 sayNO;
2840 if (PL_regeol - locinput < ln)
2841 sayNO;
2842 if (ln > 1 && memNE(s, locinput, ln))
2843 sayNO;
2844 locinput += ln;
2845 nextchr = UCHARAT(locinput);
2846 break;
2847 case EXACTFL:
b8c5462f
JH
2848 PL_reg_flags |= RF_tainted;
2849 /* FALL THROUGH */
d6a28714 2850 case EXACTF:
cd439c50
IZ
2851 s = STRING(scan);
2852 ln = STR_LEN(scan);
d6a28714 2853
d07ddd77
JH
2854 if (do_utf8 || UTF) {
2855 /* Either target or the pattern are utf8. */
d6a28714 2856 char *l = locinput;
d07ddd77 2857 char *e = PL_regeol;
bc517b45 2858
eb160463 2859 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2860 l, &e, 0, do_utf8)) {
5486206c
JH
2861 /* One more case for the sharp s:
2862 * pack("U0U*", 0xDF) =~ /ss/i,
2863 * the 0xC3 0x9F are the UTF-8
2864 * byte sequence for the U+00DF. */
2865 if (!(do_utf8 &&
2866 toLOWER(s[0]) == 's' &&
2867 ln >= 2 &&
2868 toLOWER(s[1]) == 's' &&
2869 (U8)l[0] == 0xC3 &&
2870 e - l >= 2 &&
2871 (U8)l[1] == 0x9F))
2872 sayNO;
2873 }
d07ddd77
JH
2874 locinput = e;
2875 nextchr = UCHARAT(locinput);
2876 break;
a0ed51b3 2877 }
d6a28714 2878
bc517b45
JH
2879 /* Neither the target and the pattern are utf8. */
2880
d6a28714
JH
2881 /* Inline the first character, for speed. */
2882 if (UCHARAT(s) != nextchr &&
2883 UCHARAT(s) != ((OP(scan) == EXACTF)
2884 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2885 sayNO;
d6a28714 2886 if (PL_regeol - locinput < ln)
b8c5462f 2887 sayNO;
d6a28714
JH
2888 if (ln > 1 && (OP(scan) == EXACTF
2889 ? ibcmp(s, locinput, ln)
2890 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2891 sayNO;
d6a28714
JH
2892 locinput += ln;
2893 nextchr = UCHARAT(locinput);
a0d0e21e 2894 break;
d6a28714 2895 case ANYOF:
ffc61ed2 2896 if (do_utf8) {
9e55ce06
JH
2897 STRLEN inclasslen = PL_regeol - locinput;
2898
ba7b4546 2899 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2900 sayNO_ANYOF;
ffc61ed2
JH
2901 if (locinput >= PL_regeol)
2902 sayNO;
0f0076b4 2903 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2904 nextchr = UCHARAT(locinput);
e0f9d4a8 2905 break;
ffc61ed2
JH
2906 }
2907 else {
2908 if (nextchr < 0)
2909 nextchr = UCHARAT(locinput);
7d3e948e 2910 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2911 sayNO_ANYOF;
ffc61ed2
JH
2912 if (!nextchr && locinput >= PL_regeol)
2913 sayNO;
2914 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2915 break;
2916 }
2917 no_anyof:
2918 /* If we might have the case of the German sharp s
2919 * in a casefolding Unicode character class. */
2920
ebc501f0
JH
2921 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2922 locinput += SHARP_S_SKIP;
e0f9d4a8 2923 nextchr = UCHARAT(locinput);
ffc61ed2 2924 }
e0f9d4a8
JH
2925 else
2926 sayNO;
b8c5462f 2927 break;
d6a28714 2928 case ALNUML:
b8c5462f
JH
2929 PL_reg_flags |= RF_tainted;
2930 /* FALL THROUGH */
d6a28714 2931 case ALNUM:
b8c5462f 2932 if (!nextchr)
4633a7c4 2933 sayNO;
ffc61ed2 2934 if (do_utf8) {
1a4fad37 2935 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2936 if (!(OP(scan) == ALNUM
3568d838 2937 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2938 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2939 {
2940 sayNO;
a0ed51b3 2941 }
b8c5462f 2942 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2943 nextchr = UCHARAT(locinput);
2944 break;
2945 }
ffc61ed2 2946 if (!(OP(scan) == ALNUM
d6a28714 2947 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2948 sayNO;
b8c5462f 2949 nextchr = UCHARAT(++locinput);
a0d0e21e 2950 break;
d6a28714 2951 case NALNUML:
b8c5462f
JH
2952 PL_reg_flags |= RF_tainted;
2953 /* FALL THROUGH */
d6a28714
JH
2954 case NALNUM:
2955 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2956 sayNO;
ffc61ed2 2957 if (do_utf8) {
1a4fad37 2958 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2959 if (OP(scan) == NALNUM
3568d838 2960 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2961 : isALNUM_LC_utf8((U8*)locinput))
2962 {
b8c5462f 2963 sayNO;
d6a28714 2964 }
b8c5462f
JH
2965 locinput += PL_utf8skip[nextchr];
2966 nextchr = UCHARAT(locinput);
2967 break;
2968 }
ffc61ed2 2969 if (OP(scan) == NALNUM
d6a28714 2970 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2971 sayNO;
76e3520e 2972 nextchr = UCHARAT(++locinput);
a0d0e21e 2973 break;
d6a28714
JH
2974 case BOUNDL:
2975 case NBOUNDL:
3280af22 2976 PL_reg_flags |= RF_tainted;
bbce6d69 2977 /* FALL THROUGH */
d6a28714
JH
2978 case BOUND:
2979 case NBOUND:
2980 /* was last char in word? */
ffc61ed2 2981 if (do_utf8) {
12d33761
HS
2982 if (locinput == PL_bostr)
2983 ln = '\n';
ffc61ed2 2984 else {
a3b680e6 2985 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2986
b4f7163a 2987 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2988 }
2989 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2990 ln = isALNUM_uni(ln);
1a4fad37 2991 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2992 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2993 }
2994 else {
9041c2e3 2995 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2996 n = isALNUM_LC_utf8((U8*)locinput);
2997 }
a0ed51b3 2998 }
d6a28714 2999 else {
12d33761
HS
3000 ln = (locinput != PL_bostr) ?
3001 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
3002 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3003 ln = isALNUM(ln);
3004 n = isALNUM(nextchr);
3005 }
3006 else {
3007 ln = isALNUM_LC(ln);
3008 n = isALNUM_LC(nextchr);
3009 }
d6a28714 3010 }
ffc61ed2
JH
3011 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3012 OP(scan) == BOUNDL))
3013 sayNO;
a0ed51b3 3014 break;
d6a28714 3015 case SPACEL:
3280af22 3016 PL_reg_flags |= RF_tainted;
bbce6d69 3017 /* FALL THROUGH */
d6a28714 3018 case SPACE:
9442cb0e 3019 if (!nextchr)
4633a7c4 3020 sayNO;
1aa99e6b 3021 if (do_utf8) {
fd400ab9 3022 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3023 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3024 if (!(OP(scan) == SPACE
3568d838 3025 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3026 : isSPACE_LC_utf8((U8*)locinput)))
3027 {
3028 sayNO;
3029 }
3030 locinput += PL_utf8skip[nextchr];
3031 nextchr = UCHARAT(locinput);
3032 break;
d6a28714 3033 }
ffc61ed2
JH
3034 if (!(OP(scan) == SPACE
3035 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3036 sayNO;
3037 nextchr = UCHARAT(++locinput);
3038 }
3039 else {
3040 if (!(OP(scan) == SPACE
3041 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3042 sayNO;
3043 nextchr = UCHARAT(++locinput);
a0ed51b3 3044 }
a0ed51b3 3045 break;
d6a28714 3046 case NSPACEL:
3280af22 3047 PL_reg_flags |= RF_tainted;
bbce6d69 3048 /* FALL THROUGH */
d6a28714 3049 case NSPACE:
9442cb0e 3050 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3051 sayNO;
1aa99e6b 3052 if (do_utf8) {
1a4fad37 3053 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3054 if (OP(scan) == NSPACE
3568d838 3055 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3056 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3057 {
3058 sayNO;
3059 }
3060 locinput += PL_utf8skip[nextchr];
3061 nextchr = UCHARAT(locinput);
3062 break;
a0ed51b3 3063 }
ffc61ed2 3064 if (OP(scan) == NSPACE
d6a28714 3065 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3066 sayNO;
76e3520e 3067 nextchr = UCHARAT(++locinput);
a0d0e21e 3068 break;
d6a28714 3069 case DIGITL:
a0ed51b3
LW
3070 PL_reg_flags |= RF_tainted;
3071 /* FALL THROUGH */
d6a28714 3072 case DIGIT:
9442cb0e 3073 if (!nextchr)
a0ed51b3 3074 sayNO;
1aa99e6b 3075 if (do_utf8) {
1a4fad37 3076 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3077 if (!(OP(scan) == DIGIT
3568d838 3078 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3079 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3080 {
a0ed51b3 3081 sayNO;
dfe13c55 3082 }
6f06b55f 3083 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3084 nextchr = UCHARAT(locinput);
3085 break;
3086 }
ffc61ed2 3087 if (!(OP(scan) == DIGIT
9442cb0e 3088 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3089 sayNO;
3090 nextchr = UCHARAT(++locinput);
3091 break;
d6a28714 3092 case NDIGITL:
b8c5462f
JH
3093 PL_reg_flags |= RF_tainted;
3094 /* FALL THROUGH */
d6a28714 3095 case NDIGIT:
9442cb0e 3096 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3097 sayNO;
1aa99e6b 3098 if (do_utf8) {
1a4fad37 3099 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3100 if (OP(scan) == NDIGIT
3568d838 3101 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3102 : isDIGIT_LC_utf8((U8*)locinput))
3103 {
a0ed51b3 3104 sayNO;
9442cb0e 3105 }
6f06b55f 3106 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3107 nextchr = UCHARAT(locinput);
3108 break;
3109 }
ffc61ed2 3110 if (OP(scan) == NDIGIT
9442cb0e 3111 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3112 sayNO;
3113 nextchr = UCHARAT(++locinput);
3114 break;
3115 case CLUMP:
b7c83a7e 3116 if (locinput >= PL_regeol)
a0ed51b3 3117 sayNO;
b7c83a7e 3118 if (do_utf8) {
1a4fad37 3119 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3120 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3121 sayNO;
3122 locinput += PL_utf8skip[nextchr];
3123 while (locinput < PL_regeol &&
3124 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3125 locinput += UTF8SKIP(locinput);
3126 if (locinput > PL_regeol)
3127 sayNO;
eb08e2da
JH
3128 }
3129 else
3130 locinput++;
a0ed51b3
LW
3131 nextchr = UCHARAT(locinput);
3132 break;
c8756f30 3133 case REFFL:
3280af22 3134 PL_reg_flags |= RF_tainted;
c8756f30 3135 /* FALL THROUGH */
c277df42 3136 case REF:
c8756f30 3137 case REFF:
c277df42 3138 n = ARG(scan); /* which paren pair */
cf93c79d 3139 ln = PL_regstartp[n];
2c2d71f5 3140 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3141 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3142 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3143 if (ln == PL_regendp[n])
a0d0e21e 3144 break;
a0ed51b3 3145
cf93c79d 3146 s = PL_bostr + ln;
1aa99e6b 3147 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3148 char *l = locinput;
a3b680e6 3149 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3150 /*
3151 * Note that we can't do the "other character" lookup trick as
3152 * in the 8-bit case (no pun intended) because in Unicode we
3153 * have to map both upper and title case to lower case.
3154 */
3155 if (OP(scan) == REFF) {
3156 while (s < e) {
a3b680e6
AL
3157 STRLEN ulen1, ulen2;
3158 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3159 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3160
a0ed51b3
LW
3161 if (l >= PL_regeol)
3162 sayNO;
a2a2844f
JH
3163 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3164 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3165 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3166 sayNO;
a2a2844f
JH
3167 s += ulen1;
3168 l += ulen2;
a0ed51b3
LW
3169 }
3170 }
3171 locinput = l;
3172 nextchr = UCHARAT(locinput);
3173 break;
3174 }
3175
a0d0e21e 3176 /* Inline the first character, for speed. */
76e3520e 3177 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3178 (OP(scan) == REF ||
3179 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3180 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3181 sayNO;
cf93c79d 3182 ln = PL_regendp[n] - ln;
3280af22 3183 if (locinput + ln > PL_regeol)
4633a7c4 3184 sayNO;
c8756f30
AK
3185 if (ln > 1 && (OP(scan) == REF
3186 ? memNE(s, locinput, ln)
3187 : (OP(scan) == REFF
3188 ? ibcmp(s, locinput, ln)
3189 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3190 sayNO;
a0d0e21e 3191 locinput += ln;
76e3520e 3192 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3193 break;
3194
3195 case NOTHING:
c277df42 3196 case TAIL:
a0d0e21e
LW
3197 break;
3198 case BACK:
3199 break;
c277df42
IZ
3200 case EVAL:
3201 {
3202 dSP;
533c011a 3203 OP_4tree *oop = PL_op;
3280af22 3204 COP *ocurcop = PL_curcop;
f3548bdc 3205 PAD *old_comppad;
c277df42 3206 SV *ret;
080c2dec 3207 struct regexp *oreg = PL_reg_re;
9041c2e3 3208
c277df42 3209 n = ARG(scan);
533c011a 3210 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3211 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3212 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3213 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3214
8e5e9ebe
RGS
3215 {
3216 SV **before = SP;
3217 CALLRUNOPS(aTHX); /* Scalar context. */
3218 SPAGAIN;
3219 if (SP == before)
075aa684 3220 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3221 else {
3222 ret = POPs;
3223 PUTBACK;
3224 }
3225 }
3226
0f5d15d6 3227 PL_op = oop;
f3548bdc 3228 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3229 PL_curcop = ocurcop;
c277df42 3230 if (logical) {
0f5d15d6
IZ
3231 if (logical == 2) { /* Postponed subexpression. */
3232 regexp *re;
22c35a8c 3233 MAGIC *mg = Null(MAGIC*);
0f5d15d6 3234 re_cc_state state;
0f5d15d6 3235 CHECKPOINT cp, lastcp;
cb50f42d 3236 int toggleutf;
faf82a0b 3237 register SV *sv;
0f5d15d6 3238
faf82a0b
AE
3239 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3240 mg = mg_find(sv, PERL_MAGIC_qr);
3241 else if (SvSMAGICAL(ret)) {
3242 if (SvGMAGICAL(ret))
3243 sv_unmagic(ret, PERL_MAGIC_qr);
3244 else
3245 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3246 }
faf82a0b 3247
0f5d15d6
IZ
3248 if (mg) {
3249 re = (regexp *)mg->mg_obj;
df0003d4 3250 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3251 }
3252 else {
3253 STRLEN len;
83003860 3254 const char *t = SvPV_const(ret, len);
0f5d15d6 3255 PMOP pm;
a3b680e6
AL
3256 char * const oprecomp = PL_regprecomp;
3257 const I32 osize = PL_regsize;
3258 const I32 onpar = PL_regnpar;
0f5d15d6 3259
5fcd1c1b 3260 Zero(&pm, 1, PMOP);
cb50f42d 3261 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3262 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3263 if (!(SvFLAGS(ret)
faf82a0b
AE
3264 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3265 | SVs_GMG)))
14befaf4
DM
3266 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3267 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3268 PL_regprecomp = oprecomp;
3269 PL_regsize = osize;
3270 PL_regnpar = onpar;
3271 }
a3621e74 3272 DEBUG_EXECUTE_r(
9041c2e3 3273 PerlIO_printf(Perl_debug_log,
a0288114 3274 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3275 PL_colors[0],
3276 re->precomp,
3277 PL_colors[1],
3278 (strlen(re->precomp) > 60 ? "..." : ""))
3279 );
3280 state.node = next;
3281 state.prev = PL_reg_call_cc;
3282 state.cc = PL_regcc;
3283 state.re = PL_reg_re;
3284
2ab05381 3285 PL_regcc = 0;
9041c2e3 3286
0f5d15d6 3287 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3288 REGCP_SET(lastcp);
0f5d15d6
IZ
3289 cache_re(re);
3290 state.ss = PL_savestack_ix;
3291 *PL_reglastparen = 0;
a01268b5 3292 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3293 PL_reg_call_cc = &state;
3294 PL_reginput = locinput;
cb50f42d
YST
3295 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3296 ((re->reganch & ROPT_UTF8) != 0);
3297 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3298
3299 /* XXXX This is too dramatic a measure... */
3300 PL_reg_maxiter = 0;
3301
0f5d15d6 3302 if (regmatch(re->program + 1)) {
2c914db6
IZ
3303 /* Even though we succeeded, we need to restore
3304 global variables, since we may be wrapped inside
3305 SUSPEND, thus the match may be not finished yet. */
3306
3307 /* XXXX Do this only if SUSPENDed? */
3308 PL_reg_call_cc = state.prev;
3309 PL_regcc = state.cc;
3310 PL_reg_re = state.re;
3311 cache_re(PL_reg_re);
cb50f42d 3312 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3313
3314 /* XXXX This is too dramatic a measure... */
3315 PL_reg_maxiter = 0;
3316
3317 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3318 ReREFCNT_dec(re);
3319 regcpblow(cp);
3320 sayYES;
3321 }
0f5d15d6 3322 ReREFCNT_dec(re);
02db2b7b 3323 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3324 regcppop();
3325 PL_reg_call_cc = state.prev;
3326 PL_regcc = state.cc;
3327 PL_reg_re = state.re;
d3790889 3328 cache_re(PL_reg_re);
cb50f42d 3329 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3330
3331 /* XXXX This is too dramatic a measure... */
3332 PL_reg_maxiter = 0;
3333
8e514ae6 3334 logical = 0;
0f5d15d6
IZ
3335 sayNO;
3336 }
c277df42 3337 sw = SvTRUE(ret);
0f5d15d6 3338 logical = 0;
a0ed51b3 3339 }
080c2dec 3340 else {
3280af22 3341 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3342 cache_re(oreg);
3343 }
c277df42
IZ
3344 break;
3345 }
a0d0e21e 3346 case OPEN:
c277df42 3347 n = ARG(scan); /* which paren pair */
3280af22
NIS
3348 PL_reg_start_tmp[n] = locinput;
3349 if (n > PL_regsize)
3350 PL_regsize = n;
a0d0e21e
LW
3351 break;
3352 case CLOSE:
c277df42 3353 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3354 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3355 PL_regendp[n] = locinput - PL_bostr;
eb160463 3356 if (n > (I32)*PL_reglastparen)
3280af22 3357 *PL_reglastparen = n;
a01268b5 3358 *PL_reglastcloseparen = n;
a0d0e21e 3359 break;
c277df42
IZ
3360 case GROUPP:
3361 n = ARG(scan); /* which paren pair */
eb160463 3362 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3363 break;
3364 case IFTHEN:
2c2d71f5 3365 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3366 if (sw)
3367 next = NEXTOPER(NEXTOPER(scan));
3368 else {
3369 next = scan + ARG(scan);
3370 if (OP(next) == IFTHEN) /* Fake one. */
3371 next = NEXTOPER(NEXTOPER(next));
3372 }
3373 break;
3374 case LOGICAL:
0f5d15d6 3375 logical = scan->flags;
c277df42 3376 break;
2ab05381
IZ
3377/*******************************************************************
3378 PL_regcc contains infoblock about the innermost (...)* loop, and
3379 a pointer to the next outer infoblock.
3380
3381 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3382
3383 1) After matching X, regnode for CURLYX is processed;
3384
9041c2e3 3385 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3386 regmatch() recursively with the starting point at WHILEM node;
3387
3388 3) Each hit of WHILEM node tries to match A and Z (in the order
3389 depending on the current iteration, min/max of {min,max} and
3390 greediness). The information about where are nodes for "A"
3391 and "Z" is read from the infoblock, as is info on how many times "A"
3392 was already matched, and greediness.
3393
3394 4) After A matches, the same WHILEM node is hit again.
3395
3396 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3397 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3398 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3399 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3400 of the external loop.
3401
3402 Currently present infoblocks form a tree with a stem formed by PL_curcc
3403 and whatever it mentions via ->next, and additional attached trees
3404 corresponding to temporarily unset infoblocks as in "5" above.
3405
9041c2e3 3406 In the following picture infoblocks for outer loop of
2ab05381
IZ
3407 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3408 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3409 infoblocks are drawn below the "reset" infoblock.
3410
3411 In fact in the picture below we do not show failed matches for Z and T
3412 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3413 more obvious *why* one needs to *temporary* unset infoblocks.]
3414
3415 Matched REx position InfoBlocks Comment
3416 (Y(A)*?Z)*?T x
3417 Y(A)*?Z)*?T x <- O
3418 Y (A)*?Z)*?T x <- O
3419 Y A)*?Z)*?T x <- O <- I
3420 YA )*?Z)*?T x <- O <- I
3421 YA A)*?Z)*?T x <- O <- I
3422 YAA )*?Z)*?T x <- O <- I
3423 YAA Z)*?T x <- O # Temporary unset I
3424 I
3425
3426 YAAZ Y(A)*?Z)*?T x <- O
3427 I
3428
3429 YAAZY (A)*?Z)*?T x <- O
3430 I
3431
3432 YAAZY A)*?Z)*?T x <- O <- I
3433 I
3434
3435 YAAZYA )*?Z)*?T x <- O <- I
3436 I
3437
3438 YAAZYA Z)*?T x <- O # Temporary unset I
3439 I,I
3440
3441 YAAZYAZ )*?T x <- O
3442 I,I
3443
3444 YAAZYAZ T x # Temporary unset O
3445 O
3446 I,I
3447
3448 YAAZYAZT x
3449 O
3450 I,I
3451 *******************************************************************/
a0d0e21e
LW
3452 case CURLYX: {
3453 CURCUR cc;
3280af22 3454 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3455 /* No need to save/restore up to this paren */
3456 I32 parenfloor = scan->flags;
c277df42
IZ
3457
3458 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3459 next += ARG(next);
3280af22
NIS
3460 cc.oldcc = PL_regcc;
3461 PL_regcc = &cc;
cb434fcc
IZ
3462 /* XXXX Probably it is better to teach regpush to support
3463 parenfloor > PL_regsize... */
eb160463 3464 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3465 parenfloor = *PL_reglastparen; /* Pessimization... */
3466 cc.parenfloor = parenfloor;
a0d0e21e
LW
3467 cc.cur = -1;
3468 cc.min = ARG1(scan);
3469 cc.max = ARG2(scan);
c277df42 3470 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3471 cc.next = next;
3472 cc.minmod = minmod;
3473 cc.lastloc = 0;
3280af22 3474 PL_reginput = locinput;
a0d0e21e
LW
3475 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3476 regcpblow(cp);
3280af22 3477 PL_regcc = cc.oldcc;
4633a7c4 3478 saySAME(n);
a0d0e21e
LW
3479 }
3480 /* NOT REACHED */
3481 case WHILEM: {
3482 /*
3483 * This is really hard to understand, because after we match
3484 * what we're trying to match, we must make sure the rest of
2c2d71f5 3485 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3486 * to go back UP the parse tree by recursing ever deeper. And
3487 * if it fails, we have to reset our parent's current state
3488 * that we can try again after backing off.
3489 */
3490
c277df42 3491 CHECKPOINT cp, lastcp;
3280af22 3492 CURCUR* cc = PL_regcc;
c277df42 3493 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3494 I32 cache_offset = 0, cache_bit = 0;
c277df42 3495
4633a7c4 3496 n = cc->cur + 1; /* how many we know we matched */
3280af22 3497 PL_reginput = locinput;
a0d0e21e 3498
a3621e74 3499 DEBUG_EXECUTE_r(
9041c2e3 3500 PerlIO_printf(Perl_debug_log,
91f3b821 3501 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3502 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3503 (long)n, (long)cc->min,
2797576d 3504 (long)cc->max, PTR2UV(cc))
c277df42 3505 );
4633a7c4 3506
a0d0e21e
LW
3507 /* If degenerate scan matches "", assume scan done. */
3508
579cf2c3 3509 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3510 PL_regcc = cc->oldcc;
2ab05381
IZ
3511 if (PL_regcc)
3512 ln = PL_regcc->cur;
a3621e74 3513 DEBUG_EXECUTE_r(
c3464db5
DD
3514 PerlIO_printf(Perl_debug_log,
3515 "%*s empty match detected, try continuation...\n",
3280af22 3516 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3517 );
a0d0e21e 3518 if (regmatch(cc->next))
4633a7c4 3519 sayYES;
2ab05381
IZ
3520 if (PL_regcc)
3521 PL_regcc->cur = ln;
3280af22 3522 PL_regcc = cc;
4633a7c4 3523 sayNO;
a0d0e21e
LW
3524 }
3525
3526 /* First just match a string of min scans. */
3527
3528 if (n < cc->min) {
3529 cc->cur = n;
3530 cc->lastloc = locinput;
4633a7c4
LW
3531 if (regmatch(cc->scan))
3532 sayYES;
3533 cc->cur = n - 1;
c277df42 3534 cc->lastloc = lastloc;
4633a7c4 3535 sayNO;
a0d0e21e
LW
3536 }
3537
2c2d71f5
JH
3538 if (scan->flags) {
3539 /* Check whether we already were at this position.
3540 Postpone detection until we know the match is not
3541 *that* much linear. */
3542 if (!PL_reg_maxiter) {
3543 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3544 PL_reg_leftiter = PL_reg_maxiter;
3545 }
3546 if (PL_reg_leftiter-- == 0) {
a3b680e6 3547 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3548 if (PL_reg_poscache) {
eb160463 3549 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3550 Renew(PL_reg_poscache, size, char);
3551 PL_reg_poscache_size = size;
3552 }
3553 Zero(PL_reg_poscache, size, char);
3554 }
3555 else {
3556 PL_reg_poscache_size = size;
a02a5408 3557 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3558 }
a3621e74 3559 DEBUG_EXECUTE_r(
2c2d71f5
JH
3560 PerlIO_printf(Perl_debug_log,
3561 "%sDetected a super-linear match, switching on caching%s...\n",
3562 PL_colors[4], PL_colors[5])
3563 );
3564 }
3565 if (PL_reg_leftiter < 0) {
3ab3c9b4 3566 cache_offset = locinput - PL_bostr;
2c2d71f5 3567
3ab3c9b4
HS
3568 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3569 + cache_offset * (scan->flags>>4);
3570 cache_bit = cache_offset % 8;
3571 cache_offset /= 8;
3572 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
a3621e74 3573 DEBUG_EXECUTE_r(
2c2d71f5
JH
3574 PerlIO_printf(Perl_debug_log,
3575 "%*s already tried at this position...\n",
3576 REPORT_CODE_OFF+PL_regindent*2, "")
3577 );
3ab3c9b4
HS
3578 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3579 /* cache records success */
c2b0868c
HS
3580 sayYES;
3581 else
3ab3c9b4 3582 /* cache records failure */
c2b0868c 3583 sayNO_SILENT;
2c2d71f5 3584 }
3ab3c9b4 3585 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
2c2d71f5
JH
3586 }
3587 }
3588
a0d0e21e
LW
3589 /* Prefer next over scan for minimal matching. */
3590
3591 if (cc->minmod) {
3280af22 3592 PL_regcc = cc->oldcc;
2ab05381
IZ
3593 if (PL_regcc)
3594 ln = PL_regcc->cur;
5f05dabc 3595 cp = regcppush(cc->parenfloor);
02db2b7b 3596 REGCP_SET(lastcp);
5f05dabc 3597 if (regmatch(cc->next)) {
c277df42 3598 regcpblow(cp);
3ab3c9b4 3599 CACHEsayYES; /* All done. */
5f05dabc 3600 }
02db2b7b 3601 REGCP_UNWIND(lastcp);
5f05dabc 3602 regcppop();
2ab05381
IZ
3603 if (PL_regcc)
3604 PL_regcc->cur = ln;
3280af22 3605 PL_regcc = cc;
a0d0e21e 3606
c277df42 3607 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3608 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3609 && !(PL_reg_flags & RF_warned)) {
3610 PL_reg_flags |= RF_warned;
9014280d 3611 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3612 "Complex regular subexpression recursion",
3613 REG_INFTY - 1);
c277df42 3614 }
3ab3c9b4 3615 CACHEsayNO;
c277df42 3616 }
a687059c 3617
a3621e74 3618 DEBUG_EXECUTE_r(
c3464db5
DD
3619 PerlIO_printf(Perl_debug_log,
3620 "%*s trying longer...\n",
3280af22 3621 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3622 );
a0d0e21e 3623 /* Try scanning more and see if it helps. */
3280af22 3624 PL_reginput = locinput;
a0d0e21e
LW
3625 cc->cur = n;
3626 cc->lastloc = locinput;
5f05dabc 3627 cp = regcppush(cc->parenfloor);
02db2b7b 3628 REGCP_SET(lastcp);
5f05dabc 3629 if (regmatch(cc->scan)) {
c277df42 3630 regcpblow(cp);
3ab3c9b4 3631 CACHEsayYES;
5f05dabc 3632 }
02db2b7b 3633 REGCP_UNWIND(lastcp);
5f05dabc 3634 regcppop();
4633a7c4 3635 cc->cur = n - 1;
c277df42 3636 cc->lastloc = lastloc;
3ab3c9b4 3637 CACHEsayNO;
a0d0e21e
LW
3638 }
3639
3640 /* Prefer scan over next for maximal matching. */
3641
3642 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3643 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3644 cc->cur = n;
3645 cc->lastloc = locinput;
02db2b7b 3646 REGCP_SET(lastcp);
5f05dabc 3647 if (regmatch(cc->scan)) {
c277df42 3648 regcpblow(cp);
3ab3c9b4 3649 CACHEsayYES;
5f05dabc 3650 }
02db2b7b 3651 REGCP_UNWIND(lastcp);
a0d0e21e 3652 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3653 PL_reginput = locinput;
a3621e74 3654 DEBUG_EXECUTE_r(
c3464db5
DD
3655 PerlIO_printf(Perl_debug_log,
3656 "%*s failed, try continuation...\n",
3280af22 3657 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3658 );
3659 }
9041c2e3 3660 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3661 && !(PL_reg_flags & RF_warned)) {
3280af22 3662 PL_reg_flags |= RF_warned;
9014280d 3663 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3664 "Complex regular subexpression recursion",
3665 REG_INFTY - 1);
a0d0e21e
LW
3666 }
3667
3668 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3669 PL_regcc = cc->oldcc;
2ab05381
IZ
3670 if (PL_regcc)
3671 ln = PL_regcc->cur;
a0d0e21e 3672 if (regmatch(cc->next))
3ab3c9b4 3673 CACHEsayYES;
2ab05381
IZ
3674 if (PL_regcc)
3675 PL_regcc->cur = ln;
3280af22 3676 PL_regcc = cc;
4633a7c4 3677 cc->cur = n - 1;
c277df42 3678 cc->lastloc = lastloc;
3ab3c9b4 3679 CACHEsayNO;
a0d0e21e
LW
3680 }
3681 /* NOT REACHED */
9041c2e3 3682 case BRANCHJ:
c277df42
IZ
3683 next = scan + ARG(scan);
3684 if (next == scan)
3685 next = NULL;
3686 inner = NEXTOPER(NEXTOPER(scan));
3687 goto do_branch;
9041c2e3 3688 case BRANCH:
c277df42
IZ
3689 inner = NEXTOPER(scan);
3690 do_branch:
3691 {
c277df42
IZ
3692 c1 = OP(scan);
3693 if (OP(next) != c1) /* No choice. */
3694 next = inner; /* Avoid recursion. */
a0d0e21e 3695 else {
a3b680e6 3696 const I32 lastparen = *PL_reglastparen;
02db2b7b
IZ
3697 I32 unwind1;
3698 re_unwind_branch_t *uw;
3699
3700 /* Put unwinding data on stack */
3701 unwind1 = SSNEWt(1,re_unwind_branch_t);
3702 uw = SSPTRt(unwind1,re_unwind_branch_t);
3703 uw->prev = unwind;
3704 unwind = unwind1;
3705 uw->type = ((c1 == BRANCH)
3706 ? RE_UNWIND_BRANCH
3707 : RE_UNWIND_BRANCHJ);
3708 uw->lastparen = lastparen;
3709 uw->next = next;
3710 uw->locinput = locinput;
3711 uw->nextchr = nextchr;
3712#ifdef DEBUGGING
3713 uw->regindent = ++PL_regindent;
3714#endif
c277df42 3715
02db2b7b
IZ
3716 REGCP_SET(uw->lastcp);
3717
3718 /* Now go into the first branch */
3719 next = inner;
a687059c 3720 }
a0d0e21e
LW
3721 }
3722 break;
3723 case MINMOD:
3724 minmod = 1;
3725 break;
c277df42
IZ
3726 case CURLYM:
3727 {
00db4c45 3728 I32 l = 0;
c277df42 3729 CHECKPOINT lastcp;
9041c2e3 3730
c277df42 3731 /* We suppose that the next guy does not need
0e788c72 3732 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3733 and has no parenths to influence future backrefs. */
3734 ln = ARG1(scan); /* min to match */
3735 n = ARG2(scan); /* max to match */
c277df42
IZ
3736 paren = scan->flags;
3737 if (paren) {
3280af22
NIS
3738 if (paren > PL_regsize)
3739 PL_regsize = paren;
eb160463 3740 if (paren > (I32)*PL_reglastparen)
3280af22 3741 *PL_reglastparen = paren;
c277df42 3742 }
dc45a647 3743 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3744 if (paren)
3745 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3746 PL_reginput = locinput;
c277df42
IZ
3747 if (minmod) {
3748 minmod = 0;
3749 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3750 sayNO;
3280af22 3751 locinput = PL_reginput;
cca55fe3 3752 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3753 regnode *text_node = next;
3754
cca55fe3 3755 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3756
cca55fe3 3757 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3758 else {
cca55fe3 3759 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3760 c1 = c2 = -1000;
3761 goto assume_ok_MM;
cca55fe3
JP
3762 }
3763 else { c1 = (U8)*STRING(text_node); }
af5decee 3764 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3765 c2 = PL_fold[c1];
af5decee 3766 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3767 c2 = PL_fold_locale[c1];
3768 else
3769 c2 = c1;
3770 }
a0ed51b3
LW
3771 }
3772 else
c277df42 3773 c1 = c2 = -1000;
cca55fe3 3774 assume_ok_MM:
02db2b7b 3775 REGCP_SET(lastcp);
0e788c72 3776 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3777 /* If it could work, try it. */
3778 if (c1 == -1000 ||
3280af22
NIS
3779 UCHARAT(PL_reginput) == c1 ||
3780 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3781 {
3782 if (paren) {
f31a99c8 3783 if (ln) {
cf93c79d
IZ
3784 PL_regstartp[paren] =
3785 HOPc(PL_reginput, -l) - PL_bostr;
3786 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3787 }
3788 else
cf93c79d 3789 PL_regendp[paren] = -1;
c277df42
IZ
3790 }
3791 if (regmatch(next))
3792 sayYES;
02db2b7b 3793 REGCP_UNWIND(lastcp);
c277df42
IZ
3794 }
3795 /* Couldn't or didn't -- move forward. */
3280af22 3796 PL_reginput = locinput;
c277df42
IZ
3797 if (regrepeat_hard(scan, 1, &l)) {
3798 ln++;
3280af22 3799 locinput = PL_reginput;
c277df42
IZ
3800 }
3801 else
3802 sayNO;
3803 }
a0ed51b3
LW
3804 }
3805 else {
c277df42 3806 n = regrepeat_hard(scan, n, &l);
3280af22 3807 locinput = PL_reginput;
a3621e74 3808 DEBUG_EXECUTE_r(
5c0ca799 3809 PerlIO_printf(Perl_debug_log,
faccc32b 3810 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3811 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3812 (IV) n, (IV)l)
c277df42
IZ
3813 );
3814 if (n >= ln) {
cca55fe3 3815 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3816 regnode *text_node = next;
3817
cca55fe3 3818 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3819
cca55fe3 3820 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3821 else {
cca55fe3 3822 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3823 c1 = c2 = -1000;
3824 goto assume_ok_REG;
cca55fe3
JP
3825 }
3826 else { c1 = (U8)*STRING(text_node); }
3827
af5decee 3828 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3829 c2 = PL_fold[c1];
af5decee 3830 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3831 c2 = PL_fold_locale[c1];
3832 else
3833 c2 = c1;
3834 }
a0ed51b3
LW
3835 }
3836 else
c277df42
IZ
3837 c1 = c2 = -1000;
3838 }
cca55fe3 3839 assume_ok_REG:
02db2b7b 3840 REGCP_SET(lastcp);
c277df42
IZ
3841 while (n >= ln) {
3842 /* If it could work, try it. */
3843 if (c1 == -1000 ||
3280af22
NIS
3844 UCHARAT(PL_reginput) == c1 ||
3845 UCHARAT(PL_reginput) == c2)
a0ed51b3 3846 {
a3621e74 3847 DEBUG_EXECUTE_r(
c3464db5 3848 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3849 "%*s trying tail with n=%"IVdf"...\n",
3850 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3851 );
3852 if (paren) {
3853 if (n) {
cf93c79d
IZ
3854 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3855 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3856 }
a0ed51b3 3857 else
cf93c79d 3858 PL_regendp[paren] = -1;
c277df42 3859 }
a0ed51b3
LW
3860 if (regmatch(next))
3861 sayYES;
02db2b7b 3862 REGCP_UNWIND(lastcp);
a0ed51b3 3863 }
c277df42
IZ
3864 /* Couldn't or didn't -- back up. */
3865 n--;
dfe13c55 3866 locinput = HOPc(locinput, -l);
3280af22 3867 PL_reginput = locinput;
c277df42
IZ
3868 }
3869 }
3870 sayNO;
3871 break;
3872 }
3873 case CURLYN:
3874 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3875 if (paren > PL_regsize)
3876 PL_regsize = paren;
eb160463 3877 if (paren > (I32)*PL_reglastparen)
3280af22 3878 *PL_reglastparen = paren;
c277df42
IZ
3879 ln = ARG1(scan); /* min to match */
3880 n = ARG2(scan); /* max to match */
dc45a647 3881 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3882 goto repeat;
a0d0e21e 3883 case CURLY:
c277df42 3884 paren = 0;
a0d0e21e
LW
3885 ln = ARG1(scan); /* min to match */
3886 n = ARG2(scan); /* max to match */
dc45a647 3887 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3888 goto repeat;
3889 case STAR:
3890 ln = 0;
c277df42 3891 n = REG_INFTY;
a0d0e21e 3892 scan = NEXTOPER(scan);
c277df42 3893 paren = 0;
a0d0e21e
LW
3894 goto repeat;
3895 case PLUS:
c277df42
IZ
3896 ln = 1;
3897 n = REG_INFTY;
3898 scan = NEXTOPER(scan);
3899 paren = 0;
3900 repeat:
a0d0e21e
LW
3901 /*
3902 * Lookahead to avoid useless match attempts
3903 * when we know what character comes next.
3904 */
5f80c4cf
JP
3905
3906 /*
3907 * Used to only do .*x and .*?x, but now it allows
3908 * for )'s, ('s and (?{ ... })'s to be in the way
3909 * of the quantifier and the EXACT-like node. -- japhy
3910 */
3911
cca55fe3 3912 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3913 U8 *s;
3914 regnode *text_node = next;
3915
cca55fe3 3916 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3917
cca55fe3 3918 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3919 else {
cca55fe3 3920 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3921 c1 = c2 = -1000;
3922 goto assume_ok_easy;
cca55fe3
JP
3923 }
3924 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3925
3926 if (!UTF) {
3927 c2 = c1 = *s;
f65d3ee7 3928 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3929 c2 = PL_fold[c1];
f65d3ee7 3930 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3931 c2 = PL_fold_locale[c1];
1aa99e6b 3932 }
5f80c4cf 3933 else { /* UTF */
f65d3ee7 3934 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3935 STRLEN ulen1, ulen2;
89ebb4a3
JH
3936 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3937 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3938
3939 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3940 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3941
89ebb4a3 3942 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
872c91ae
JH
3943 ckWARN(WARN_UTF8) ?
3944 0 : UTF8_ALLOW_ANY);
89ebb4a3 3945 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
872c91ae
JH
3946 ckWARN(WARN_UTF8) ?
3947 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3948 }
3949 else {
89ebb4a3 3950 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
872c91ae
JH
3951 ckWARN(WARN_UTF8) ?
3952 0 : UTF8_ALLOW_ANY);
5f80c4cf 3953 }
1aa99e6b
IH
3954 }
3955 }
bbce6d69 3956 }
a0d0e21e 3957 else
bbce6d69 3958 c1 = c2 = -1000;
cca55fe3 3959 assume_ok_easy:
3280af22 3960 PL_reginput = locinput;
a0d0e21e 3961 if (minmod) {
c277df42 3962 CHECKPOINT lastcp;
a0d0e21e
LW
3963 minmod = 0;
3964 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3965 sayNO;
a0ed51b3 3966 locinput = PL_reginput;
02db2b7b 3967 REGCP_SET(lastcp);
0fe9bf95 3968 if (c1 != -1000) {
1aa99e6b 3969 char *e; /* Should not check after this */
0fe9bf95 3970 char *old = locinput;
b2f2f093 3971 int count = 0;
0fe9bf95 3972
1aa99e6b 3973 if (n == REG_INFTY) {
0fe9bf95 3974 e = PL_regeol - 1;
1aa99e6b
IH
3975 if (do_utf8)
3976 while (UTF8_IS_CONTINUATION(*(U8*)e))
3977 e--;
3978 }
3979 else if (do_utf8) {
3980 int m = n - ln;
3981 for (e = locinput;
3982 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3983 e += UTF8SKIP(e);
3984 }
3985 else {
3986 e = locinput + n - ln;
3987 if (e >= PL_regeol)
3988 e = PL_regeol - 1;
3989 }
0fe9bf95
IZ
3990 while (1) {
3991 /* Find place 'next' could work */
1aa99e6b
IH
3992 if (!do_utf8) {
3993 if (c1 == c2) {
a8e8ab15
JH
3994 while (locinput <= e &&
3995 UCHARAT(locinput) != c1)
1aa99e6b
IH
3996 locinput++;
3997 } else {
9041c2e3 3998 while (locinput <= e
a8e8ab15
JH
3999 && UCHARAT(locinput) != c1
4000 && UCHARAT(locinput) != c2)
1aa99e6b
IH
4001 locinput++;
4002 }
4003 count = locinput - old;
4004 }
4005 else {
1aa99e6b 4006 if (c1 == c2) {
a3b680e6 4007 STRLEN len;
872c91ae
JH
4008 /* count initialised to
4009 * utf8_distance(old, locinput) */
b2f2f093 4010 while (locinput <= e &&
872c91ae 4011 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4012 UTF8_MAXBYTES, &len,
872c91ae 4013 ckWARN(WARN_UTF8) ?
eb160463 4014 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 4015 locinput += len;
b2f2f093
JH
4016 count++;
4017 }
1aa99e6b 4018 } else {
a3b680e6 4019 STRLEN len;
872c91ae
JH
4020 /* count initialised to
4021 * utf8_distance(old, locinput) */
b2f2f093 4022 while (locinput <= e) {
872c91ae 4023 UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4024 UTF8_MAXBYTES, &len,
872c91ae
JH
4025 ckWARN(WARN_UTF8) ?
4026 0 : UTF8_ALLOW_ANY);
eb160463 4027 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 4028 break;
b2f2f093
JH
4029 locinput += len;
4030 count++;
1aa99e6b
IH
4031 }
4032 }
0fe9bf95 4033 }
9041c2e3 4034 if (locinput > e)
0fe9bf95
IZ
4035 sayNO;
4036 /* PL_reginput == old now */
4037 if (locinput != old) {
4038 ln = 1; /* Did some */
1aa99e6b 4039 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
4040 sayNO;
4041 }
4042 /* PL_reginput == locinput now */
29d1e993 4043 TRYPAREN(paren, ln, locinput);
0fe9bf95 4044 PL_reginput = locinput; /* Could be reset... */
02db2b7b 4045 REGCP_UNWIND(lastcp);
0fe9bf95 4046 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
4047 old = locinput;
4048 if (do_utf8)
4049 locinput += UTF8SKIP(locinput);
4050 else
4051 locinput++;
b2f2f093 4052 count = 1;
0fe9bf95
IZ
4053 }
4054 }
4055 else
c277df42 4056 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
4057 UV c;
4058 if (c1 != -1000) {
4059 if (do_utf8)
872c91ae 4060 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4061 UTF8_MAXBYTES, 0,
872c91ae
JH
4062 ckWARN(WARN_UTF8) ?
4063 0 : UTF8_ALLOW_ANY);
1aa99e6b 4064 else
9041c2e3 4065 c = UCHARAT(PL_reginput);
2390ecbc 4066 /* If it could work, try it. */
eb160463 4067 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 4068 {
ecc99935 4069 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
4070 REGCP_UNWIND(lastcp);
4071 }
1aa99e6b 4072 }
a0d0e21e 4073 /* If it could work, try it. */
2390ecbc 4074 else if (c1 == -1000)
bbce6d69 4075 {
ecc99935 4076 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 4077 REGCP_UNWIND(lastcp);
bbce6d69 4078 }
c277df42 4079 /* Couldn't or didn't -- move forward. */
a0ed51b3 4080 PL_reginput = locinput;
a0d0e21e
LW
4081 if (regrepeat(scan, 1)) {
4082 ln++;
a0ed51b3
LW
4083 locinput = PL_reginput;
4084 }
4085 else
4633a7c4 4086 sayNO;
a0d0e21e
LW
4087 }
4088 }
4089 else {
c277df42 4090 CHECKPOINT lastcp;
a0d0e21e 4091 n = regrepeat(scan, n);
a0ed51b3 4092 locinput = PL_reginput;
22c35a8c 4093 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
7fba1cd6 4094 (OP(next) != MEOL ||
15272685
HS
4095 OP(next) == SEOL || OP(next) == EOS))
4096 {
a0d0e21e 4097 ln = n; /* why back off? */
1aeab75a
GS
4098 /* ...because $ and \Z can match before *and* after
4099 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4100 We should back off by one in this case. */
4101 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4102 ln--;
4103 }
02db2b7b 4104 REGCP_SET(lastcp);
c277df42 4105 if (paren) {
8fa7f367 4106 UV c = 0;
c277df42 4107 while (n >= ln) {
1aa99e6b
IH
4108 if (c1 != -1000) {
4109 if (do_utf8)
872c91ae 4110 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4111 UTF8_MAXBYTES, 0,
872c91ae
JH
4112 ckWARN(WARN_UTF8) ?
4113 0 : UTF8_ALLOW_ANY);
1aa99e6b 4114 else
9041c2e3 4115 c = UCHARAT(PL_reginput);
1aa99e6b 4116 }
c277df42 4117 /* If it could work, try it. */
eb160463 4118 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4119 {
29d1e993 4120 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4121 REGCP_UNWIND(lastcp);
c277df42
IZ
4122 }
4123 /* Couldn't or didn't -- back up. */
4124 n--;
dfe13c55 4125 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 4126 }
a0ed51b3
LW
4127 }
4128 else {
8fa7f367 4129 UV c = 0;
c277df42 4130 while (n >= ln) {
1aa99e6b
IH
4131 if (c1 != -1000) {
4132 if (do_utf8)
872c91ae 4133 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4134 UTF8_MAXBYTES, 0,
872c91ae
JH
4135 ckWARN(WARN_UTF8) ?
4136 0 : UTF8_ALLOW_ANY);
1aa99e6b 4137 else
9041c2e3 4138 c = UCHARAT(PL_reginput);
1aa99e6b 4139 }
c277df42 4140 /* If it could work, try it. */
eb160463 4141 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4142 {
29d1e993 4143 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4144 REGCP_UNWIND(lastcp);
c277df42
IZ
4145 }
4146 /* Couldn't or didn't -- back up. */
4147 n--;
dfe13c55 4148 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4149 }
a0d0e21e
LW
4150 }
4151 }
4633a7c4 4152 sayNO;
c277df42 4153 break;
a0d0e21e 4154 case END:
0f5d15d6
IZ
4155 if (PL_reg_call_cc) {
4156 re_cc_state *cur_call_cc = PL_reg_call_cc;
4157 CURCUR *cctmp = PL_regcc;
4158 regexp *re = PL_reg_re;
4159 CHECKPOINT cp, lastcp;
4160
4161 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 4162 REGCP_SET(lastcp);
0f5d15d6
IZ
4163 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
4164 the caller. */
4165 PL_reginput = locinput; /* Make position available to
4166 the callcc. */
4167 cache_re(PL_reg_call_cc->re);
4168 PL_regcc = PL_reg_call_cc->cc;
4169 PL_reg_call_cc = PL_reg_call_cc->prev;
4170 if (regmatch(cur_call_cc->node)) {
4171 PL_reg_call_cc = cur_call_cc;
4172 regcpblow(cp);
4173 sayYES;
4174 }
02db2b7b 4175 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
4176 regcppop();
4177 PL_reg_call_cc = cur_call_cc;
4178 PL_regcc = cctmp;
4179 PL_reg_re = re;
4180 cache_re(re);
4181
a3621e74 4182 DEBUG_EXECUTE_r(
0f5d15d6
IZ
4183 PerlIO_printf(Perl_debug_log,
4184 "%*s continuation failed...\n",
4185 REPORT_CODE_OFF+PL_regindent*2, "")
4186 );
7821416a 4187 sayNO_SILENT;
0f5d15d6 4188 }
7821416a 4189 if (locinput < PL_regtill) {
a3621e74 4190 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4191 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4192 PL_colors[4],
4193 (long)(locinput - PL_reg_starttry),
4194 (long)(PL_regtill - PL_reg_starttry),
4195 PL_colors[5]));
4196 sayNO_FINAL; /* Cannot match: too short. */
4197 }
4198 PL_reginput = locinput; /* put where regtry can find it */
4199 sayYES_FINAL; /* Success! */
7e5428c5 4200 case SUCCEED:
3280af22 4201 PL_reginput = locinput; /* put where regtry can find it */
7821416a 4202 sayYES_LOUD; /* Success! */
c277df42
IZ
4203 case SUSPEND:
4204 n = 1;
9fe1d20c 4205 PL_reginput = locinput;
9041c2e3 4206 goto do_ifmatch;
a0d0e21e 4207 case UNLESSM:
c277df42 4208 n = 0;
a0ed51b3 4209 if (scan->flags) {
efb30f32
HS
4210 s = HOPBACKc(locinput, scan->flags);
4211 if (!s)
4212 goto say_yes;
4213 PL_reginput = s;
a0ed51b3
LW
4214 }
4215 else
4216 PL_reginput = locinput;
c277df42
IZ
4217 goto do_ifmatch;
4218 case IFMATCH:
4219 n = 1;
a0ed51b3 4220 if (scan->flags) {
efb30f32
HS
4221 s = HOPBACKc(locinput, scan->flags);
4222 if (!s)
4223 goto say_no;
4224 PL_reginput = s;
a0ed51b3
LW
4225 }
4226 else
4227 PL_reginput = locinput;
4228
c277df42 4229 do_ifmatch:
c277df42
IZ
4230 inner = NEXTOPER(NEXTOPER(scan));
4231 if (regmatch(inner) != n) {
4232 say_no:
4233 if (logical) {
4234 logical = 0;
4235 sw = 0;
4236 goto do_longjump;
a0ed51b3
LW
4237 }
4238 else
c277df42
IZ
4239 sayNO;
4240 }
4241 say_yes:
4242 if (logical) {
4243 logical = 0;
4244 sw = 1;
4245 }
fe44a5e8 4246 if (OP(scan) == SUSPEND) {
3280af22 4247 locinput = PL_reginput;
565764a8 4248 nextchr = UCHARAT(locinput);
fe44a5e8 4249 }
c277df42
IZ
4250 /* FALL THROUGH. */
4251 case LONGJMP:
4252 do_longjump:
4253 next = scan + ARG(scan);
4254 if (next == scan)
4255 next = NULL;
a0d0e21e
LW
4256 break;
4257 default:
b900a521 4258 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4259 PTR2UV(scan), OP(scan));
cea2e8a9 4260 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4261 }
02db2b7b 4262 reenter:
a0d0e21e
LW
4263 scan = next;
4264 }
a687059c 4265
a0d0e21e
LW
4266 /*
4267 * We get here only if there's trouble -- normally "case END" is
4268 * the terminating point.
4269 */
cea2e8a9 4270 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4271 /*NOTREACHED*/
4633a7c4
LW
4272 sayNO;
4273
7821416a 4274yes_loud:
a3621e74 4275 DEBUG_EXECUTE_r(
7821416a
IZ
4276 PerlIO_printf(Perl_debug_log,
4277 "%*s %scould match...%s\n",
e4584336 4278 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4279 );
4280 goto yes;
4281yes_final:
a3621e74 4282 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4283 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4284yes:
4285#ifdef DEBUGGING
3280af22 4286 PL_regindent--;
4633a7c4 4287#endif
02db2b7b
IZ
4288
4289#if 0 /* Breaks $^R */
4290 if (unwind)
4291 regcpblow(firstcp);
4292#endif
4633a7c4
LW
4293 return 1;
4294
4295no:
a3621e74 4296 DEBUG_EXECUTE_r(
7821416a
IZ
4297 PerlIO_printf(Perl_debug_log,
4298 "%*s %sfailed...%s\n",
e4584336 4299 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4300 );
4301 goto do_no;
4302no_final:
4303do_no:
02db2b7b
IZ
4304 if (unwind) {
4305 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
4306
4307 switch (uw->type) {
4308 case RE_UNWIND_BRANCH:
4309 case RE_UNWIND_BRANCHJ:
4310 {
4311 re_unwind_branch_t *uwb = &(uw->branch);
a3b680e6 4312 const I32 lastparen = uwb->lastparen;
9041c2e3 4313
02db2b7b
IZ
4314 REGCP_UNWIND(uwb->lastcp);
4315 for (n = *PL_reglastparen; n > lastparen; n--)
4316 PL_regendp[n] = -1;
4317 *PL_reglastparen = n;
4318 scan = next = uwb->next;
9041c2e3
NIS
4319 if ( !scan ||
4320 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
4321 ? BRANCH : BRANCHJ) ) { /* Failure */
4322 unwind = uwb->prev;
4323#ifdef DEBUGGING
4324 PL_regindent--;
4325#endif
4326 goto do_no;
4327 }
4328 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4329 if ((n = (uwb->type == RE_UNWIND_BRANCH
4330 ? NEXT_OFF(next) : ARG(next))))
4331 next += n;
4332 else
4333 next = NULL; /* XXXX Needn't unwinding in this case... */
4334 uwb->next = next;
4335 next = NEXTOPER(scan);
4336 if (uwb->type == RE_UNWIND_BRANCHJ)
4337 next = NEXTOPER(next);
4338 locinput = uwb->locinput;
4339 nextchr = uwb->nextchr;
4340#ifdef DEBUGGING
4341 PL_regindent = uwb->regindent;
4342#endif
4343
4344 goto reenter;
4345 }
4346 /* NOT REACHED */
4347 default:
4348 Perl_croak(aTHX_ "regexp unwind memory corruption");
4349 }
4350 /* NOT REACHED */
4351 }
4633a7c4 4352#ifdef DEBUGGING
3280af22 4353 PL_regindent--;
4633a7c4 4354#endif
a0d0e21e 4355 return 0;
a687059c
LW
4356}
4357
4358/*
4359 - regrepeat - repeatedly match something simple, report how many
4360 */
4361/*
4362 * [This routine now assumes that it will only match on things of length 1.
4363 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4364 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4365 */
76e3520e 4366STATIC I32
a3b680e6 4367S_regrepeat(pTHX_ const regnode *p, I32 max)
a687059c 4368{
27da23d5 4369 dVAR;
a0d0e21e 4370 register char *scan;
a0d0e21e 4371 register I32 c;
3280af22 4372 register char *loceol = PL_regeol;
a0ed51b3 4373 register I32 hardcount = 0;
53c4c00c 4374 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4375
3280af22 4376 scan = PL_reginput;
faf11cac
HS
4377 if (max == REG_INFTY)
4378 max = I32_MAX;
4379 else if (max < loceol - scan)
a0d0e21e 4380 loceol = scan + max;
a0d0e21e 4381 switch (OP(p)) {
22c35a8c 4382 case REG_ANY:
1aa99e6b 4383 if (do_utf8) {
ffc61ed2 4384 loceol = PL_regeol;
1aa99e6b 4385 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4386 scan += UTF8SKIP(scan);
4387 hardcount++;
4388 }
4389 } else {
4390 while (scan < loceol && *scan != '\n')
4391 scan++;
a0ed51b3
LW
4392 }
4393 break;
ffc61ed2 4394 case SANY:
def8e4ea
JH
4395 if (do_utf8) {
4396 loceol = PL_regeol;
a0804c9e 4397 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4398 scan += UTF8SKIP(scan);
4399 hardcount++;
4400 }
4401 }
4402 else
4403 scan = loceol;
a0ed51b3 4404 break;
f33976b4
DB
4405 case CANY:
4406 scan = loceol;
4407 break;
090f7165
JH
4408 case EXACT: /* length of string is 1 */
4409 c = (U8)*STRING(p);
4410 while (scan < loceol && UCHARAT(scan) == c)
4411 scan++;
bbce6d69 4412 break;
4413 case EXACTF: /* length of string is 1 */
cd439c50 4414 c = (U8)*STRING(p);
bbce6d69 4415 while (scan < loceol &&
22c35a8c 4416 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4417 scan++;
4418 break;
4419 case EXACTFL: /* length of string is 1 */
3280af22 4420 PL_reg_flags |= RF_tainted;
cd439c50 4421 c = (U8)*STRING(p);
bbce6d69 4422 while (scan < loceol &&
22c35a8c 4423 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4424 scan++;
4425 break;
4426 case ANYOF:
ffc61ed2
JH
4427 if (do_utf8) {
4428 loceol = PL_regeol;
cfc92286
JH
4429 while (hardcount < max && scan < loceol &&
4430 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4431 scan += UTF8SKIP(scan);
4432 hardcount++;
4433 }
4434 } else {
7d3e948e 4435 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4436 scan++;
4437 }
a0d0e21e
LW
4438 break;
4439 case ALNUM:
1aa99e6b 4440 if (do_utf8) {
ffc61ed2 4441 loceol = PL_regeol;
1a4fad37 4442 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4443 while (hardcount < max && scan < loceol &&
3568d838 4444 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4445 scan += UTF8SKIP(scan);
4446 hardcount++;
4447 }
4448 } else {
4449 while (scan < loceol && isALNUM(*scan))
4450 scan++;
a0ed51b3
LW
4451 }
4452 break;
bbce6d69 4453 case ALNUML:
3280af22 4454 PL_reg_flags |= RF_tainted;
1aa99e6b 4455 if (do_utf8) {
ffc61ed2 4456 loceol = PL_regeol;
1aa99e6b
IH
4457 while (hardcount < max && scan < loceol &&
4458 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4459 scan += UTF8SKIP(scan);
4460 hardcount++;
4461 }
4462 } else {
4463 while (scan < loceol && isALNUM_LC(*scan))
4464 scan++;
a0ed51b3
LW
4465 }
4466 break;
a0d0e21e 4467 case NALNUM:
1aa99e6b 4468 if (do_utf8) {
ffc61ed2 4469 loceol = PL_regeol;
1a4fad37 4470 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4471 while (hardcount < max && scan < loceol &&
3568d838 4472 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4473 scan += UTF8SKIP(scan);
4474 hardcount++;
4475 }
4476 } else {
4477 while (scan < loceol && !isALNUM(*scan))
4478 scan++;
a0ed51b3
LW
4479 }
4480 break;
bbce6d69 4481 case NALNUML:
3280af22 4482 PL_reg_flags |= RF_tainted;
1aa99e6b 4483 if (do_utf8) {
ffc61ed2 4484 loceol = PL_regeol;
1aa99e6b
IH
4485 while (hardcount < max && scan < loceol &&
4486 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4487 scan += UTF8SKIP(scan);
4488 hardcount++;
4489 }
4490 } else {
4491 while (scan < loceol && !isALNUM_LC(*scan))
4492 scan++;
a0ed51b3
LW
4493 }
4494 break;
a0d0e21e 4495 case SPACE:
1aa99e6b 4496 if (do_utf8) {
ffc61ed2 4497 loceol = PL_regeol;
1a4fad37 4498 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4499 while (hardcount < max && scan < loceol &&
3568d838
JH
4500 (*scan == ' ' ||
4501 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4502 scan += UTF8SKIP(scan);
4503 hardcount++;
4504 }
4505 } else {
4506 while (scan < loceol && isSPACE(*scan))
4507 scan++;
a0ed51b3
LW
4508 }
4509 break;
bbce6d69 4510 case SPACEL:
3280af22 4511 PL_reg_flags |= RF_tainted;
1aa99e6b 4512 if (do_utf8) {
ffc61ed2 4513 loceol = PL_regeol;
1aa99e6b 4514 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4515 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4516 scan += UTF8SKIP(scan);
4517 hardcount++;
4518 }
4519 } else {
4520 while (scan < loceol && isSPACE_LC(*scan))
4521 scan++;
a0ed51b3
LW
4522 }
4523 break;
a0d0e21e 4524 case NSPACE:
1aa99e6b 4525 if (do_utf8) {
ffc61ed2 4526 loceol = PL_regeol;
1a4fad37 4527 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4528 while (hardcount < max && scan < loceol &&
3568d838
JH
4529 !(*scan == ' ' ||
4530 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4531 scan += UTF8SKIP(scan);
4532 hardcount++;
4533 }
4534 } else {
4535 while (scan < loceol && !isSPACE(*scan))
4536 scan++;
4537 break;
a0ed51b3 4538 }
bbce6d69 4539 case NSPACEL:
3280af22 4540 PL_reg_flags |= RF_tainted;
1aa99e6b 4541 if (do_utf8) {
ffc61ed2 4542 loceol = PL_regeol;
1aa99e6b 4543 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4544 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4545 scan += UTF8SKIP(scan);
4546 hardcount++;
4547 }
4548 } else {
4549 while (scan < loceol && !isSPACE_LC(*scan))
4550 scan++;
a0ed51b3
LW
4551 }
4552 break;
a0d0e21e 4553 case DIGIT:
1aa99e6b 4554 if (do_utf8) {
ffc61ed2 4555 loceol = PL_regeol;
1a4fad37 4556 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4557 while (hardcount < max && scan < loceol &&
3568d838 4558 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4559 scan += UTF8SKIP(scan);
4560 hardcount++;
4561 }
4562 } else {
4563 while (scan < loceol && isDIGIT(*scan))
4564 scan++;
a0ed51b3
LW
4565 }
4566 break;
a0d0e21e 4567 case NDIGIT:
1aa99e6b 4568 if (do_utf8) {
ffc61ed2 4569 loceol = PL_regeol;
1a4fad37 4570 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4571 while (hardcount < max && scan < loceol &&
3568d838 4572 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4573 scan += UTF8SKIP(scan);
4574 hardcount++;
4575 }
4576 } else {
4577 while (scan < loceol && !isDIGIT(*scan))
4578 scan++;
a0ed51b3
LW
4579 }
4580 break;
a0d0e21e
LW
4581 default: /* Called on something of 0 width. */
4582 break; /* So match right here or not at all. */
4583 }
a687059c 4584
a0ed51b3
LW
4585 if (hardcount)
4586 c = hardcount;
4587 else
4588 c = scan - PL_reginput;
3280af22 4589 PL_reginput = scan;
a687059c 4590
a3621e74 4591 DEBUG_r({
ab74612d 4592 SV *re_debug_flags = NULL;
c277df42 4593 SV *prop = sv_newmortal();
a3621e74
YO
4594 GET_RE_DEBUG_FLAGS;
4595 DEBUG_EXECUTE_r({
c277df42 4596 regprop(prop, p);
9041c2e3
NIS
4597 PerlIO_printf(Perl_debug_log,
4598 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3f7c398e 4599 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
c277df42 4600 });
a3621e74 4601 });
9041c2e3 4602
a0d0e21e 4603 return(c);
a687059c
LW
4604}
4605
4606/*
c277df42 4607 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4608 *
0e788c72 4609 * The repeater is supposed to have constant non-zero length.
c277df42
IZ
4610 */
4611
76e3520e 4612STATIC I32
cea2e8a9 4613S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4614{
b7953727 4615 register char *scan = Nullch;
c277df42 4616 register char *start;
3280af22 4617 register char *loceol = PL_regeol;
a0ed51b3 4618 I32 l = 0;
708e3b05 4619 I32 count = 0, res = 1;
a0ed51b3
LW
4620
4621 if (!max)
4622 return 0;
c277df42 4623
3280af22 4624 start = PL_reginput;
53c4c00c 4625 if (PL_reg_match_utf8) {
708e3b05 4626 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4627 if (!count++) {
4628 l = 0;
4629 while (start < PL_reginput) {
4630 l++;
4631 start += UTF8SKIP(start);
4632 }
4633 *lp = l;
4634 if (l == 0)
4635 return max;
4636 }
4637 if (count == max)
4638 return count;
4639 }
4640 }
4641 else {
708e3b05 4642 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4643 if (!count++) {
4644 *lp = l = PL_reginput - start;
4645 if (max != REG_INFTY && l*max < loceol - scan)
4646 loceol = scan + l*max;
4647 if (l == 0)
4648 return max;
c277df42
IZ
4649 }
4650 }
4651 }
708e3b05 4652 if (!res)
3280af22 4653 PL_reginput = scan;
9041c2e3 4654
a0ed51b3 4655 return count;
c277df42
IZ
4656}
4657
4658/*
ffc61ed2
JH
4659- regclass_swash - prepare the utf8 swash
4660*/
4661
4662SV *
a3b680e6 4663Perl_regclass_swash(pTHX_ register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4664{
9e55ce06
JH
4665 SV *sw = NULL;
4666 SV *si = NULL;
4667 SV *alt = NULL;
ffc61ed2
JH
4668
4669 if (PL_regdata && PL_regdata->count) {
a3b680e6 4670 const U32 n = ARG(node);
ffc61ed2
JH
4671
4672 if (PL_regdata->what[n] == 's') {
4673 SV *rv = (SV*)PL_regdata->data[n];
4674 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4675 SV **ary = AvARRAY(av);
9e55ce06 4676 SV **a, **b;
9041c2e3 4677
9e55ce06
JH
4678 /* See the end of regcomp.c:S_reglass() for
4679 * documentation of these array elements. */
4680
b11f357e
JH
4681 si = *ary;
4682 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4683 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4684
ffc61ed2
JH
4685 if (a)
4686 sw = *a;
4687 else if (si && doinit) {
4688 sw = swash_init("utf8", "", si, 1, 0);
4689 (void)av_store(av, 1, sw);
4690 }
9e55ce06
JH
4691 if (b)
4692 alt = *b;
ffc61ed2
JH
4693 }
4694 }
4695
9e55ce06
JH
4696 if (listsvp)
4697 *listsvp = si;
4698 if (altsvp)
4699 *altsvp = alt;
ffc61ed2
JH
4700
4701 return sw;
4702}
4703
4704/*
ba7b4546 4705 - reginclass - determine if a character falls into a character class
832705d4
JH
4706
4707 The n is the ANYOF regnode, the p is the target string, lenp
4708 is pointer to the maximum length of how far to go in the p
4709 (if the lenp is zero, UTF8SKIP(p) is used),
4710 do_utf8 tells whether the target string is in UTF-8.
4711
bbce6d69 4712 */
4713
76e3520e 4714STATIC bool
a3b680e6 4715S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4716{
27da23d5 4717 dVAR;
a3b680e6 4718 const char flags = ANYOF_FLAGS(n);
bbce6d69 4719 bool match = FALSE;
cc07378b 4720 UV c = *p;
ae9ddab8 4721 STRLEN len = 0;
9e55ce06 4722 STRLEN plen;
1aa99e6b 4723
ae9ddab8 4724 if (do_utf8 && !UTF8_IS_INVARIANT(c))
89ebb4a3 4725 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
ae9ddab8 4726 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4727
0f0076b4 4728 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4729 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4730 if (lenp)
4731 *lenp = 0;
ffc61ed2 4732 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4733 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4734 match = TRUE;
bbce6d69 4735 }
3568d838 4736 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4737 match = TRUE;
ffc61ed2 4738 if (!match) {
9e55ce06
JH
4739 AV *av;
4740 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4741
4742 if (sw) {
3568d838 4743 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4744 match = TRUE;
4745 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4746 if (!match && lenp && av) {
4747 I32 i;
4748
4749 for (i = 0; i <= av_len(av); i++) {
4750 SV* sv = *av_fetch(av, i, FALSE);
4751 STRLEN len;
cfd0369c 4752 const char *s = SvPV_const(sv, len);
9e55ce06 4753
061b10df 4754 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4755 *lenp = len;
4756 match = TRUE;
4757 break;
4758 }
4759 }
4760 }
4761 if (!match) {
89ebb4a3 4762 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4763 STRLEN tmplen;
4764
9e55ce06
JH
4765 to_utf8_fold(p, tmpbuf, &tmplen);
4766 if (swash_fetch(sw, tmpbuf, do_utf8))
4767 match = TRUE;
4768 }
ffc61ed2
JH
4769 }
4770 }
bbce6d69 4771 }
9e55ce06 4772 if (match && lenp && *lenp == 0)
0f0076b4 4773 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4774 }
1aa99e6b 4775 if (!match && c < 256) {
ffc61ed2
JH
4776 if (ANYOF_BITMAP_TEST(n, c))
4777 match = TRUE;
4778 else if (flags & ANYOF_FOLD) {
eb160463 4779 U8 f;
a0ed51b3 4780
ffc61ed2
JH
4781 if (flags & ANYOF_LOCALE) {
4782 PL_reg_flags |= RF_tainted;
4783 f = PL_fold_locale[c];
4784 }
4785 else
4786 f = PL_fold[c];
4787 if (f != c && ANYOF_BITMAP_TEST(n, f))
4788 match = TRUE;
4789 }
4790
4791 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4792 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4793 if (
4794 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4795 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4796 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4797 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4798 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4799 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4800 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4801 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4802 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4803 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4804 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4805 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4806 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4807 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4808 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4809 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4810 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4811 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4812 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4813 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4814 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4815 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4816 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4817 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4818 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4819 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4820 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4821 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4822 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4823 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4824 ) /* How's that for a conditional? */
4825 {
4826 match = TRUE;
4827 }
a0ed51b3 4828 }
a0ed51b3
LW
4829 }
4830
a0ed51b3
LW
4831 return (flags & ANYOF_INVERT) ? !match : match;
4832}
161b471a 4833
dfe13c55 4834STATIC U8 *
cea2e8a9 4835S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4836{
1aa99e6b
IH
4837 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4838}
4839
4840STATIC U8 *
4841S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4842{
a0ed51b3 4843 if (off >= 0) {
1aa99e6b 4844 while (off-- && s < lim) {
ffc61ed2 4845 /* XXX could check well-formedness here */
a0ed51b3 4846 s += UTF8SKIP(s);
ffc61ed2 4847 }
a0ed51b3
LW
4848 }
4849 else {
4850 while (off++) {
1aa99e6b 4851 if (s > lim) {
a0ed51b3 4852 s--;
ffc61ed2 4853 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4854 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4855 s--;
ffc61ed2
JH
4856 }
4857 /* XXX could check well-formedness here */
a0ed51b3
LW
4858 }
4859 }
4860 }
4861 return s;
4862}
161b471a 4863
dfe13c55 4864STATIC U8 *
1aa99e6b 4865S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4866{
1aa99e6b
IH
4867 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4868}
4869
4870STATIC U8 *
4871S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4872{
4873 if (off >= 0) {
1aa99e6b 4874 while (off-- && s < lim) {
ffc61ed2 4875 /* XXX could check well-formedness here */
a0ed51b3 4876 s += UTF8SKIP(s);
ffc61ed2 4877 }
a0ed51b3
LW
4878 if (off >= 0)
4879 return 0;
4880 }
4881 else {
4882 while (off++) {
1aa99e6b 4883 if (s > lim) {
a0ed51b3 4884 s--;
ffc61ed2 4885 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4886 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4887 s--;
ffc61ed2
JH
4888 }
4889 /* XXX could check well-formedness here */
a0ed51b3
LW
4890 }
4891 else
4892 break;
4893 }
4894 if (off <= 0)
4895 return 0;
4896 }
4897 return s;
4898}
51371543 4899
51371543 4900static void
acfe0abc 4901restore_pos(pTHX_ void *arg)
51371543 4902{
9d4ba2ae 4903 PERL_UNUSED_ARG(arg);
51371543
GS
4904 if (PL_reg_eval_set) {
4905 if (PL_reg_oldsaved) {
4906 PL_reg_re->subbeg = PL_reg_oldsaved;
4907 PL_reg_re->sublen = PL_reg_oldsavedlen;
f8c7b90f 4908#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
4909 PL_reg_re->saved_copy = PL_nrs;
4910#endif
51371543
GS
4911 RX_MATCH_COPIED_on(PL_reg_re);
4912 }
4913 PL_reg_magic->mg_len = PL_reg_oldpos;
4914 PL_reg_eval_set = 0;
4915 PL_curpm = PL_reg_oldcurpm;
4916 }
4917}
33b8afdf
JH
4918
4919STATIC void
4920S_to_utf8_substr(pTHX_ register regexp *prog)
4921{
33b8afdf 4922 if (prog->float_substr && !prog->float_utf8) {
9d4ba2ae 4923 SV* sv;
f2b990bf 4924 prog->float_utf8 = sv = newSVsv(prog->float_substr);
33b8afdf
JH
4925 sv_utf8_upgrade(sv);
4926 if (SvTAIL(prog->float_substr))
4927 SvTAIL_on(sv);
4928 if (prog->float_substr == prog->check_substr)
4929 prog->check_utf8 = sv;
4930 }
4931 if (prog->anchored_substr && !prog->anchored_utf8) {
9d4ba2ae 4932 SV* sv;
f2b990bf 4933 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
33b8afdf
JH
4934 sv_utf8_upgrade(sv);
4935 if (SvTAIL(prog->anchored_substr))
4936 SvTAIL_on(sv);
4937 if (prog->anchored_substr == prog->check_substr)
4938 prog->check_utf8 = sv;
4939 }
4940}
4941
4942STATIC void
4943S_to_byte_substr(pTHX_ register regexp *prog)
4944{
33b8afdf 4945 if (prog->float_utf8 && !prog->float_substr) {
9d4ba2ae 4946 SV* sv;
f2b990bf 4947 prog->float_substr = sv = newSVsv(prog->float_utf8);
33b8afdf
JH
4948 if (sv_utf8_downgrade(sv, TRUE)) {
4949 if (SvTAIL(prog->float_utf8))
4950 SvTAIL_on(sv);
4951 } else {
4952 SvREFCNT_dec(sv);
4953 prog->float_substr = sv = &PL_sv_undef;
4954 }
4955 if (prog->float_utf8 == prog->check_utf8)
4956 prog->check_substr = sv;
4957 }
4958 if (prog->anchored_utf8 && !prog->anchored_substr) {
9d4ba2ae 4959 SV* sv;
f2b990bf 4960 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
33b8afdf
JH
4961 if (sv_utf8_downgrade(sv, TRUE)) {
4962 if (SvTAIL(prog->anchored_utf8))
4963 SvTAIL_on(sv);
4964 } else {
4965 SvREFCNT_dec(sv);
4966 prog->anchored_substr = sv = &PL_sv_undef;
4967 }
4968 if (prog->anchored_utf8 == prog->check_utf8)
4969 prog->check_substr = sv;
4970 }
4971}
66610fdd
RGS
4972
4973/*
4974 * Local variables:
4975 * c-indentation-style: bsd
4976 * c-basic-offset: 4
4977 * indent-tabs-mode: t
4978 * End:
4979 *
37442d52
RGS
4980 * ex: set ts=8 sts=4 sw=4 noet:
4981 */