This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
gv_fetchpvn_flags ranks highly in the profile, and the ::/' scanning
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
1a4fad37
AL
142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
97aff369 181 dVAR;
a3b680e6 182 const int retval = PL_savestack_ix;
b1ce53c5 183#define REGCP_PAREN_ELEMS 4
a3b680e6 184 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
185 int p;
186
e49a9654
IH
187 if (paren_elems_to_push < 0)
188 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
189
a01268b5 190#define REGCP_OTHER_ELEMS 6
4b3c1a47 191 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 192 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 193/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
194 SSPUSHINT(PL_regendp[p]);
195 SSPUSHINT(PL_regstartp[p]);
3280af22 196 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
197 SSPUSHINT(p);
198 }
b1ce53c5 199/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
200 SSPUSHINT(PL_regsize);
201 SSPUSHINT(*PL_reglastparen);
a01268b5 202 SSPUSHINT(*PL_reglastcloseparen);
3280af22 203 SSPUSHPTR(PL_reginput);
41123dfd
JH
204#define REGCP_FRAME_ELEMS 2
205/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
206 * are needed for the regexp context stack bookkeeping. */
207 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 208 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 209
a0d0e21e
LW
210 return retval;
211}
212
c277df42 213/* These are needed since we do not localize EVAL nodes: */
a3621e74 214# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 215 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 216 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 217
a3621e74 218# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 219 PerlIO_printf(Perl_debug_log, \
faccc32b 220 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 221 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 222
76e3520e 223STATIC char *
cea2e8a9 224S_regcppop(pTHX)
a0d0e21e 225{
97aff369 226 dVAR;
b1ce53c5 227 I32 i;
a0d0e21e
LW
228 U32 paren = 0;
229 char *input;
b1ce53c5 230
a3621e74
YO
231 GET_RE_DEBUG_FLAGS_DECL;
232
b1ce53c5 233 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 234 i = SSPOPINT;
b1ce53c5
JH
235 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
236 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 237 input = (char *) SSPOPPTR;
a01268b5 238 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
239 *PL_reglastparen = SSPOPINT;
240 PL_regsize = SSPOPINT;
b1ce53c5
JH
241
242 /* Now restore the parentheses context. */
41123dfd
JH
243 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
244 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 245 I32 tmps;
a0d0e21e 246 paren = (U32)SSPOPINT;
3280af22 247 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
248 PL_regstartp[paren] = SSPOPINT;
249 tmps = SSPOPINT;
3280af22
NIS
250 if (paren <= *PL_reglastparen)
251 PL_regendp[paren] = tmps;
a3621e74 252 DEBUG_EXECUTE_r(
c3464db5 253 PerlIO_printf(Perl_debug_log,
b900a521 254 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 255 (UV)paren, (IV)PL_regstartp[paren],
b900a521 256 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 257 (IV)PL_regendp[paren],
3280af22 258 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 259 );
a0d0e21e 260 }
a3621e74 261 DEBUG_EXECUTE_r(
eb160463 262 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 263 PerlIO_printf(Perl_debug_log,
faccc32b
JH
264 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
265 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
266 }
267 );
daf18116 268#if 1
dafc8851
JH
269 /* It would seem that the similar code in regtry()
270 * already takes care of this, and in fact it is in
271 * a better location to since this code can #if 0-ed out
272 * but the code in regtry() is needed or otherwise tests
273 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
274 * (as of patchlevel 7877) will fail. Then again,
275 * this code seems to be necessary or otherwise
276 * building DynaLoader will fail:
277 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
278 * --jhi */
eb160463
GS
279 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
280 if ((I32)paren > PL_regsize)
cf93c79d
IZ
281 PL_regstartp[paren] = -1;
282 PL_regendp[paren] = -1;
a0d0e21e 283 }
dafc8851 284#endif
a0d0e21e
LW
285 return input;
286}
287
0f5d15d6
IZ
288typedef struct re_cc_state
289{
290 I32 ss;
291 regnode *node;
292 struct re_cc_state *prev;
293 CURCUR *cc;
294 regexp *re;
295} re_cc_state;
296
02db2b7b 297#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 298
29d1e993
HS
299#define TRYPAREN(paren, n, input) { \
300 if (paren) { \
301 if (n) { \
302 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
303 PL_regendp[paren] = input - PL_bostr; \
304 } \
305 else \
306 PL_regendp[paren] = -1; \
307 } \
308 if (regmatch(next)) \
309 sayYES; \
310 if (paren && n) \
311 PL_regendp[paren] = -1; \
312}
313
314
a687059c 315/*
e50aee73 316 * pregexec and friends
a687059c
LW
317 */
318
319/*
c277df42 320 - pregexec - match a regexp against a string
a687059c 321 */
c277df42 322I32
864dbfa3 323Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 324 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
325/* strend: pointer to null at end of string */
326/* strbeg: real beginning of string */
327/* minend: end of match must be >=minend after stringarg. */
328/* nosave: For optimizations. */
329{
330 return
9041c2e3 331 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
332 nosave ? 0 : REXEC_COPY_STR);
333}
0f5d15d6
IZ
334
335STATIC void
cea2e8a9 336S_cache_re(pTHX_ regexp *prog)
0f5d15d6 337{
97aff369 338 dVAR;
0f5d15d6
IZ
339 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
340#ifdef DEBUGGING
341 PL_regprogram = prog->program;
342#endif
343 PL_regnpar = prog->nparens;
9041c2e3
NIS
344 PL_regdata = prog->data;
345 PL_reg_re = prog;
0f5d15d6 346}
22e551b9 347
9041c2e3 348/*
cad2e5aa
JH
349 * Need to implement the following flags for reg_anch:
350 *
351 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
352 * USE_INTUIT_ML
353 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
354 * INTUIT_AUTORITATIVE_ML
355 * INTUIT_ONCE_NOML - Intuit can match in one location only.
356 * INTUIT_ONCE_ML
357 *
358 * Another flag for this function: SECOND_TIME (so that float substrs
359 * with giant delta may be not rechecked).
360 */
361
362/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
363
3f7c398e 364/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
365 Otherwise, only SvCUR(sv) is used to get strbeg. */
366
367/* XXXX We assume that strpos is strbeg unless sv. */
368
6eb5f6b9
JH
369/* XXXX Some places assume that there is a fixed substring.
370 An update may be needed if optimizer marks as "INTUITable"
371 RExen without fixed substrings. Similarly, it is assumed that
372 lengths of all the strings are no more than minlen, thus they
373 cannot come from lookahead.
374 (Or minlen should take into account lookahead.) */
375
2c2d71f5
JH
376/* A failure to find a constant substring means that there is no need to make
377 an expensive call to REx engine, thus we celebrate a failure. Similarly,
378 finding a substring too deep into the string means that less calls to
30944b6d
IZ
379 regtry() should be needed.
380
381 REx compiler's optimizer found 4 possible hints:
382 a) Anchored substring;
383 b) Fixed substring;
384 c) Whether we are anchored (beginning-of-line or \G);
385 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 386 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
387 string which does not contradict any of them.
388 */
2c2d71f5 389
6eb5f6b9
JH
390/* Most of decisions we do here should have been done at compile time.
391 The nodes of the REx which we used for the search should have been
392 deleted from the finite automaton. */
393
cad2e5aa
JH
394char *
395Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
396 char *strend, U32 flags, re_scream_pos_data *data)
397{
97aff369 398 dVAR;
b7953727 399 register I32 start_shift = 0;
cad2e5aa 400 /* Should be nonnegative! */
b7953727 401 register I32 end_shift = 0;
2c2d71f5
JH
402 register char *s;
403 register SV *check;
a1933d95 404 char *strbeg;
cad2e5aa 405 char *t;
a3b680e6 406 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 407 I32 ml_anch;
6eb5f6b9 408 register char *other_last = Nullch; /* other substr checked before this */
b7953727 409 char *check_at = Nullch; /* check substr found at this pos */
1df70142 410 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 411#ifdef DEBUGGING
890ce7af
AL
412 const char * const i_strpos = strpos;
413 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 414#endif
a3621e74
YO
415
416 GET_RE_DEBUG_FLAGS_DECL;
417
a30b2f1f 418 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 419
b8d68ded 420 if (prog->reganch & ROPT_UTF8) {
a3621e74 421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
422 "UTF-8 regex...\n"));
423 PL_reg_flags |= RF_utf8;
424 }
425
a3621e74 426 DEBUG_EXECUTE_r({
1df70142 427 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
428 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
429 strpos;
1df70142 430 const int len = PL_reg_match_utf8 ?
b8d68ded 431 strlen(s) : strend - strpos;
2a782b5b
JH
432 if (!PL_colorset)
433 reginitcolors();
b8d68ded 434 if (PL_reg_match_utf8)
a3621e74 435 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 436 "UTF-8 target...\n"));
2a782b5b 437 PerlIO_printf(Perl_debug_log,
a0288114 438 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 439 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
440 prog->precomp,
441 PL_colors[1],
442 (strlen(prog->precomp) > 60 ? "..." : ""),
443 PL_colors[0],
444 (int)(len > 60 ? 60 : len),
445 s, PL_colors[1],
446 (len > 60 ? "..." : "")
447 );
448 });
cad2e5aa 449
c344f387
JH
450 /* CHR_DIST() would be more correct here but it makes things slow. */
451 if (prog->minlen > strend - strpos) {
a3621e74 452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 453 "String too short... [re_intuit_start]\n"));
cad2e5aa 454 goto fail;
2c2d71f5 455 }
a1933d95 456 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 457 PL_regeol = strend;
33b8afdf
JH
458 if (do_utf8) {
459 if (!prog->check_utf8 && prog->check_substr)
460 to_utf8_substr(prog);
461 check = prog->check_utf8;
462 } else {
463 if (!prog->check_substr && prog->check_utf8)
464 to_byte_substr(prog);
465 check = prog->check_substr;
466 }
467 if (check == &PL_sv_undef) {
a3621e74 468 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
469 "Non-utf string cannot match utf check string\n"));
470 goto fail;
471 }
2c2d71f5 472 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
473 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
474 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 475 && !multiline ) ); /* Check after \n? */
cad2e5aa 476
7e25d62c
JH
477 if (!ml_anch) {
478 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
479 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 480 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
481 && sv && !SvROK(sv)
482 && (strpos != strbeg)) {
a3621e74 483 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
484 goto fail;
485 }
486 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 487 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 488 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
489 I32 slen;
490
1aa99e6b 491 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
492 if (SvTAIL(check)) {
493 slen = SvCUR(check); /* >= 1 */
cad2e5aa 494
9041c2e3 495 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 496 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 497 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 498 goto fail_finish;
cad2e5aa
JH
499 }
500 /* Now should match s[0..slen-2] */
501 slen--;
3f7c398e 502 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 503 || (slen > 1
3f7c398e 504 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 505 report_neq:
a3621e74 506 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
507 goto fail_finish;
508 }
cad2e5aa 509 }
3f7c398e 510 else if (*SvPVX_const(check) != *s
653099ff 511 || ((slen = SvCUR(check)) > 1
3f7c398e 512 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 513 goto report_neq;
c315bfe8 514 check_at = s;
2c2d71f5 515 goto success_at_start;
7e25d62c 516 }
cad2e5aa 517 }
2c2d71f5 518 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 519 s = strpos;
2c2d71f5 520 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 521 end_shift = prog->minlen - start_shift -
653099ff 522 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 523 if (!ml_anch) {
a3b680e6 524 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 525 - (SvTAIL(check) != 0);
a3b680e6 526 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
527
528 if (end_shift < eshift)
529 end_shift = eshift;
530 }
cad2e5aa 531 }
2c2d71f5 532 else { /* Can match at random position */
cad2e5aa
JH
533 ml_anch = 0;
534 s = strpos;
2c2d71f5
JH
535 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
536 /* Should be nonnegative! */
537 end_shift = prog->minlen - start_shift -
653099ff 538 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
539 }
540
2c2d71f5 541#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 542 if (end_shift < 0)
6bbae5e6 543 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
544#endif
545
2c2d71f5
JH
546 restart:
547 /* Find a possible match in the region s..strend by looking for
548 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 549 if (flags & REXEC_SCREAM) {
cad2e5aa 550 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 551 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 552
2c2d71f5
JH
553 if (PL_screamfirst[BmRARE(check)] >= 0
554 || ( BmRARE(check) == '\n'
555 && (BmPREVIOUS(check) == SvCUR(check) - 1)
556 && SvTAIL(check) ))
9041c2e3 557 s = screaminstr(sv, check,
2c2d71f5 558 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 559 else
2c2d71f5 560 goto fail_finish;
4addbd3b
HS
561 /* we may be pointing at the wrong string */
562 if (s && RX_MATCH_COPIED(prog))
3f7c398e 563 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
564 if (data)
565 *data->scream_olds = s;
566 }
f33976b4 567 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
568 s = fbm_instr((U8*)(s + start_shift),
569 (U8*)(strend - end_shift),
7fba1cd6 570 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 571 else
1aa99e6b
IH
572 s = fbm_instr(HOP3(s, start_shift, strend),
573 HOP3(strend, -end_shift, strbeg),
7fba1cd6 574 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
575
576 /* Update the count-of-usability, remove useless subpatterns,
577 unshift s. */
2c2d71f5 578
a0288114 579 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 580 (s ? "Found" : "Did not find"),
33b8afdf 581 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 582 PL_colors[0],
7b0972df 583 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 584 SvPVX_const(check),
2c2d71f5
JH
585 PL_colors[1], (SvTAIL(check) ? "$" : ""),
586 (s ? " at offset " : "...\n") ) );
587
588 if (!s)
589 goto fail_finish;
590
6eb5f6b9
JH
591 check_at = s;
592
2c2d71f5 593 /* Finish the diagnostic message */
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
595
596 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
597 Start with the other substr.
598 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 599 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
600 *always* match. Probably should be marked during compile...
601 Probably it is right to do no SCREAM here...
602 */
603
33b8afdf 604 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 605 /* Take into account the "other" substring. */
2c2d71f5
JH
606 /* XXXX May be hopelessly wrong for UTF... */
607 if (!other_last)
6eb5f6b9 608 other_last = strpos;
33b8afdf 609 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
610 do_other_anchored:
611 {
890ce7af
AL
612 char * const last = HOP3c(s, -start_shift, strbeg);
613 char *last1, *last2;
2c2d71f5 614 char *s1 = s;
33b8afdf 615 SV* must;
2c2d71f5 616
2c2d71f5
JH
617 t = s - prog->check_offset_max;
618 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 619 && (!do_utf8
1aa99e6b 620 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 621 && t > strpos)))
30944b6d 622 /* EMPTY */;
2c2d71f5
JH
623 else
624 t = strpos;
1aa99e6b 625 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
626 if (t < other_last) /* These positions already checked */
627 t = other_last;
1aa99e6b 628 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
629 if (last < last1)
630 last1 = last;
631 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
632 /* On end-of-str: see comment below. */
33b8afdf
JH
633 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
634 if (must == &PL_sv_undef) {
635 s = (char*)NULL;
a3621e74 636 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
637 }
638 else
639 s = fbm_instr(
640 (unsigned char*)t,
641 HOP3(HOP3(last1, prog->anchored_offset, strend)
642 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
643 must,
7fba1cd6 644 multiline ? FBMrf_MULTILINE : 0
33b8afdf 645 );
a3621e74 646 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 647 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
648 (s ? "Found" : "Contradicts"),
649 PL_colors[0],
33b8afdf
JH
650 (int)(SvCUR(must)
651 - (SvTAIL(must)!=0)),
3f7c398e 652 SvPVX_const(must),
33b8afdf 653 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
654 if (!s) {
655 if (last1 >= last2) {
a3621e74 656 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
657 ", giving up...\n"));
658 goto fail_finish;
659 }
a3621e74 660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 661 ", trying floating at offset %ld...\n",
1aa99e6b
IH
662 (long)(HOP3c(s1, 1, strend) - i_strpos)));
663 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
664 s = HOP3c(last, 1, strend);
2c2d71f5
JH
665 goto restart;
666 }
667 else {
a3621e74 668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 669 (long)(s - i_strpos)));
1aa99e6b
IH
670 t = HOP3c(s, -prog->anchored_offset, strbeg);
671 other_last = HOP3c(s, 1, strend);
30944b6d 672 s = s1;
2c2d71f5
JH
673 if (t == strpos)
674 goto try_at_start;
2c2d71f5
JH
675 goto try_at_offset;
676 }
30944b6d 677 }
2c2d71f5
JH
678 }
679 else { /* Take into account the floating substring. */
33b8afdf
JH
680 char *last, *last1;
681 char *s1 = s;
682 SV* must;
683
684 t = HOP3c(s, -start_shift, strbeg);
685 last1 = last =
686 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
687 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
688 last = HOP3c(t, prog->float_max_offset, strend);
689 s = HOP3c(t, prog->float_min_offset, strend);
690 if (s < other_last)
691 s = other_last;
2c2d71f5 692 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
693 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
694 /* fbm_instr() takes into account exact value of end-of-str
695 if the check is SvTAIL(ed). Since false positives are OK,
696 and end-of-str is not later than strend we are OK. */
697 if (must == &PL_sv_undef) {
698 s = (char*)NULL;
a3621e74 699 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
700 }
701 else
2c2d71f5 702 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
703 (unsigned char*)last + SvCUR(must)
704 - (SvTAIL(must)!=0),
7fba1cd6 705 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
707 (s ? "Found" : "Contradicts"),
708 PL_colors[0],
709 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 710 SvPVX_const(must),
33b8afdf
JH
711 PL_colors[1], (SvTAIL(must) ? "$" : "")));
712 if (!s) {
713 if (last1 == last) {
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
715 ", giving up...\n"));
716 goto fail_finish;
2c2d71f5 717 }
a3621e74 718 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
719 ", trying anchored starting at offset %ld...\n",
720 (long)(s1 + 1 - i_strpos)));
721 other_last = last;
722 s = HOP3c(t, 1, strend);
723 goto restart;
724 }
725 else {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
727 (long)(s - i_strpos)));
728 other_last = s; /* Fix this later. --Hugo */
729 s = s1;
730 if (t == strpos)
731 goto try_at_start;
732 goto try_at_offset;
733 }
2c2d71f5 734 }
cad2e5aa 735 }
2c2d71f5
JH
736
737 t = s - prog->check_offset_max;
2c2d71f5 738 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 739 && (!do_utf8
1aa99e6b
IH
740 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
741 && t > strpos))) {
2c2d71f5
JH
742 /* Fixed substring is found far enough so that the match
743 cannot start at strpos. */
744 try_at_offset:
cad2e5aa 745 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
746 /* Eventually fbm_*() should handle this, but often
747 anchored_offset is not 0, so this check will not be wasted. */
748 /* XXXX In the code below we prefer to look for "^" even in
749 presence of anchored substrings. And we search even
750 beyond the found float position. These pessimizations
751 are historical artefacts only. */
752 find_anchor:
2c2d71f5 753 while (t < strend - prog->minlen) {
cad2e5aa 754 if (*t == '\n') {
4ee3650e 755 if (t < check_at - prog->check_offset_min) {
33b8afdf 756 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
757 /* Since we moved from the found position,
758 we definitely contradict the found anchored
30944b6d
IZ
759 substr. Due to the above check we do not
760 contradict "check" substr.
761 Thus we can arrive here only if check substr
762 is float. Redo checking for "other"=="fixed".
763 */
9041c2e3 764 strpos = t + 1;
a3621e74 765 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 766 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
767 goto do_other_anchored;
768 }
4ee3650e
GS
769 /* We don't contradict the found floating substring. */
770 /* XXXX Why not check for STCLASS? */
cad2e5aa 771 s = t + 1;
a3621e74 772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 773 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
774 goto set_useful;
775 }
4ee3650e
GS
776 /* Position contradicts check-string */
777 /* XXXX probably better to look for check-string
778 than for "\n", so one should lower the limit for t? */
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 780 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 781 other_last = strpos = s = t + 1;
cad2e5aa
JH
782 goto restart;
783 }
784 t++;
785 }
a3621e74 786 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 787 PL_colors[0], PL_colors[1]));
2c2d71f5 788 goto fail_finish;
cad2e5aa 789 }
f5952150 790 else {
a3621e74 791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 792 PL_colors[0], PL_colors[1]));
f5952150 793 }
cad2e5aa
JH
794 s = t;
795 set_useful:
33b8afdf 796 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
797 }
798 else {
f5952150 799 /* The found string does not prohibit matching at strpos,
2c2d71f5 800 - no optimization of calling REx engine can be performed,
f5952150
GS
801 unless it was an MBOL and we are not after MBOL,
802 or a future STCLASS check will fail this. */
2c2d71f5
JH
803 try_at_start:
804 /* Even in this situation we may use MBOL flag if strpos is offset
805 wrt the start of the string. */
05b4157f 806 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 807 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
808 /* May be due to an implicit anchor of m{.*foo} */
809 && !(prog->reganch & ROPT_IMPLICIT))
810 {
cad2e5aa
JH
811 t = strpos;
812 goto find_anchor;
813 }
a3621e74 814 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 815 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 816 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 817 );
2c2d71f5 818 success_at_start:
30944b6d 819 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
820 && (do_utf8 ? (
821 prog->check_utf8 /* Could be deleted already */
822 && --BmUSEFUL(prog->check_utf8) < 0
823 && (prog->check_utf8 == prog->float_utf8)
824 ) : (
825 prog->check_substr /* Could be deleted already */
826 && --BmUSEFUL(prog->check_substr) < 0
827 && (prog->check_substr == prog->float_substr)
828 )))
66e933ab 829 {
cad2e5aa 830 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
832 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
833 if (do_utf8 ? prog->check_substr : prog->check_utf8)
834 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
835 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
836 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 837 check = Nullsv; /* abort */
cad2e5aa 838 s = strpos;
3cf5c195
IZ
839 /* XXXX This is a remnant of the old implementation. It
840 looks wasteful, since now INTUIT can use many
6eb5f6b9 841 other heuristics. */
cad2e5aa
JH
842 prog->reganch &= ~RE_USE_INTUIT;
843 }
844 else
845 s = strpos;
846 }
847
6eb5f6b9
JH
848 /* Last resort... */
849 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
850 if (prog->regstclass) {
851 /* minlen == 0 is possible if regstclass is \b or \B,
852 and the fixed substr is ''$.
853 Since minlen is already taken into account, s+1 is before strend;
854 accidentally, minlen >= 1 guaranties no false positives at s + 1
855 even for \b or \B. But (minlen? 1 : 0) below assumes that
856 regstclass does not come from lookahead... */
857 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
858 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 859 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 860 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 861 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 862 : 1);
a3b680e6 863 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 864 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 865 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
866 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
867 cl_l, strend)
868 : strend);
6eb5f6b9
JH
869
870 t = s;
9926ca43 871 cache_re(prog);
06b5626a 872 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
873 if (!s) {
874#ifdef DEBUGGING
cbbf8932 875 const char *what = NULL;
6eb5f6b9
JH
876#endif
877 if (endpos == strend) {
a3621e74 878 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
879 "Could not match STCLASS...\n") );
880 goto fail;
881 }
a3621e74 882 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 883 "This position contradicts STCLASS...\n") );
653099ff
GS
884 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
885 goto fail;
6eb5f6b9 886 /* Contradict one of substrings */
33b8afdf
JH
887 if (prog->anchored_substr || prog->anchored_utf8) {
888 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 889 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 890 hop_and_restart:
1aa99e6b 891 s = HOP3c(t, 1, strend);
66e933ab
GS
892 if (s + start_shift + end_shift > strend) {
893 /* XXXX Should be taken into account earlier? */
a3621e74 894 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
895 "Could not match STCLASS...\n") );
896 goto fail;
897 }
5e39e1e5
HS
898 if (!check)
899 goto giveup;
a3621e74 900 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 901 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
902 what, (long)(s + start_shift - i_strpos)) );
903 goto restart;
904 }
66e933ab 905 /* Have both, check_string is floating */
6eb5f6b9
JH
906 if (t + start_shift >= check_at) /* Contradicts floating=check */
907 goto retry_floating_check;
908 /* Recheck anchored substring, but not floating... */
9041c2e3 909 s = check_at;
5e39e1e5
HS
910 if (!check)
911 goto giveup;
a3621e74 912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 913 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
914 (long)(other_last - i_strpos)) );
915 goto do_other_anchored;
916 }
60e71179
GS
917 /* Another way we could have checked stclass at the
918 current position only: */
919 if (ml_anch) {
920 s = t = t + 1;
5e39e1e5
HS
921 if (!check)
922 goto giveup;
a3621e74 923 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 924 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 925 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 926 goto try_at_offset;
66e933ab 927 }
33b8afdf 928 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 929 goto fail;
6eb5f6b9
JH
930 /* Check is floating subtring. */
931 retry_floating_check:
932 t = check_at - start_shift;
a3621e74 933 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
934 goto hop_and_restart;
935 }
b7953727 936 if (t != s) {
a3621e74 937 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 938 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
939 (long)(t - i_strpos), (long)(s - i_strpos))
940 );
941 }
942 else {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
944 "Does not contradict STCLASS...\n");
945 );
946 }
6eb5f6b9 947 }
5e39e1e5 948 giveup:
a3621e74 949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
950 PL_colors[4], (check ? "Guessed" : "Giving up"),
951 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 952 return s;
2c2d71f5
JH
953
954 fail_finish: /* Substring not found */
33b8afdf
JH
955 if (prog->check_substr || prog->check_utf8) /* could be removed already */
956 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 957 fail:
a3621e74 958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 959 PL_colors[4], PL_colors[5]));
cad2e5aa
JH
960 return Nullch;
961}
9661b544 962
6eb5f6b9 963/* We know what class REx starts with. Try to find this position... */
3c3eec57 964STATIC char *
a3b680e6 965S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 966{
27da23d5 967 dVAR;
1df70142 968 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 969 char *m;
d8093b23 970 STRLEN ln;
5dab1207 971 STRLEN lnc;
078c425b 972 register STRLEN uskip;
d8093b23
G
973 unsigned int c1;
974 unsigned int c2;
6eb5f6b9
JH
975 char *e;
976 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 977 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 978
6eb5f6b9
JH
979 /* We know what class it must start with. */
980 switch (OP(c)) {
6eb5f6b9 981 case ANYOF:
388cc4de 982 if (do_utf8) {
078c425b 983 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
984 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
985 !UTF8_IS_INVARIANT((U8)s[0]) ?
986 reginclass(c, (U8*)s, 0, do_utf8) :
987 REGINCLASS(c, (U8*)s)) {
988 if (tmp && (norun || regtry(prog, s)))
989 goto got_it;
990 else
991 tmp = doevery;
992 }
993 else
994 tmp = 1;
078c425b 995 s += uskip;
388cc4de
HS
996 }
997 }
998 else {
999 while (s < strend) {
1000 STRLEN skip = 1;
1001
1002 if (REGINCLASS(c, (U8*)s) ||
1003 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1004 /* The assignment of 2 is intentional:
1005 * for the folded sharp s, the skip is 2. */
1006 (skip = SHARP_S_SKIP))) {
1007 if (tmp && (norun || regtry(prog, s)))
1008 goto got_it;
1009 else
1010 tmp = doevery;
1011 }
1012 else
1013 tmp = 1;
1014 s += skip;
1015 }
a0d0e21e 1016 }
6eb5f6b9 1017 break;
f33976b4
DB
1018 case CANY:
1019 while (s < strend) {
1020 if (tmp && (norun || regtry(prog, s)))
1021 goto got_it;
1022 else
1023 tmp = doevery;
1024 s++;
1025 }
1026 break;
6eb5f6b9 1027 case EXACTF:
5dab1207
NIS
1028 m = STRING(c);
1029 ln = STR_LEN(c); /* length to match in octets/bytes */
1030 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1031 if (UTF) {
a2a2844f 1032 STRLEN ulen1, ulen2;
5dab1207 1033 U8 *sm = (U8 *) m;
89ebb4a3
JH
1034 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1035 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
041457d9 1036 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a2a2844f
JH
1037
1038 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1039 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1040
89ebb4a3 1041 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1042 0, uniflags);
89ebb4a3 1043 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1044 0, uniflags);
5dab1207
NIS
1045 lnc = 0;
1046 while (sm < ((U8 *) m + ln)) {
1047 lnc++;
1048 sm += UTF8SKIP(sm);
1049 }
1aa99e6b
IH
1050 }
1051 else {
1052 c1 = *(U8*)m;
1053 c2 = PL_fold[c1];
1054 }
6eb5f6b9
JH
1055 goto do_exactf;
1056 case EXACTFL:
5dab1207
NIS
1057 m = STRING(c);
1058 ln = STR_LEN(c);
1059 lnc = (I32) ln;
d8093b23 1060 c1 = *(U8*)m;
6eb5f6b9
JH
1061 c2 = PL_fold_locale[c1];
1062 do_exactf:
db12adc6 1063 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1064
6eb5f6b9
JH
1065 if (norun && e < s)
1066 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1067
60a8b682
JH
1068 /* The idea in the EXACTF* cases is to first find the
1069 * first character of the EXACTF* node and then, if
1070 * necessary, case-insensitively compare the full
1071 * text of the node. The c1 and c2 are the first
1072 * characters (though in Unicode it gets a bit
1073 * more complicated because there are more cases
7f16dd3d
JH
1074 * than just upper and lower: one needs to use
1075 * the so-called folding case for case-insensitive
1076 * matching (called "loose matching" in Unicode).
1077 * ibcmp_utf8() will do just that. */
60a8b682 1078
1aa99e6b 1079 if (do_utf8) {
575cac57 1080 UV c, f;
89ebb4a3 1081 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1082 STRLEN len, foldlen;
041457d9 1083 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
09091399 1084 if (c1 == c2) {
5dab1207
NIS
1085 /* Upper and lower of 1st char are equal -
1086 * probably not a "letter". */
1aa99e6b 1087 while (s <= e) {
89ebb4a3 1088 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1089 uniflags);
80aecb99
JH
1090 if ( c == c1
1091 && (ln == len ||
66423254 1092 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1093 m, (char **)0, ln, (bool)UTF))
55da9344 1094 && (norun || regtry(prog, s)) )
1aa99e6b 1095 goto got_it;
80aecb99 1096 else {
1df70142 1097 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1098 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1099 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1100 if ( f != c
1101 && (f == c1 || f == c2)
1102 && (ln == foldlen ||
66423254
JH
1103 !ibcmp_utf8((char *) foldbuf,
1104 (char **)0, foldlen, do_utf8,
d07ddd77 1105 m,
eb160463 1106 (char **)0, ln, (bool)UTF))
80aecb99
JH
1107 && (norun || regtry(prog, s)) )
1108 goto got_it;
1109 }
1aa99e6b
IH
1110 s += len;
1111 }
09091399
JH
1112 }
1113 else {
1aa99e6b 1114 while (s <= e) {
89ebb4a3 1115 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1116 uniflags);
80aecb99 1117
60a8b682 1118 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1119 * Note that not all the possible combinations
1120 * are handled here: some of them are handled
1121 * by the standard folding rules, and some of
1122 * them (the character class or ANYOF cases)
1123 * are handled during compiletime in
1124 * regexec.c:S_regclass(). */
880bd946
JH
1125 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1126 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1127 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1128
1129 if ( (c == c1 || c == c2)
1130 && (ln == len ||
66423254 1131 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1132 m, (char **)0, ln, (bool)UTF))
55da9344 1133 && (norun || regtry(prog, s)) )
1aa99e6b 1134 goto got_it;
80aecb99 1135 else {
1df70142 1136 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1137 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1138 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1139 if ( f != c
1140 && (f == c1 || f == c2)
1141 && (ln == foldlen ||
a6872d42 1142 !ibcmp_utf8((char *) foldbuf,
66423254 1143 (char **)0, foldlen, do_utf8,
d07ddd77 1144 m,
eb160463 1145 (char **)0, ln, (bool)UTF))
80aecb99
JH
1146 && (norun || regtry(prog, s)) )
1147 goto got_it;
1148 }
1aa99e6b
IH
1149 s += len;
1150 }
09091399 1151 }
1aa99e6b
IH
1152 }
1153 else {
1154 if (c1 == c2)
1155 while (s <= e) {
1156 if ( *(U8*)s == c1
1157 && (ln == 1 || !(OP(c) == EXACTF
1158 ? ibcmp(s, m, ln)
1159 : ibcmp_locale(s, m, ln)))
1160 && (norun || regtry(prog, s)) )
1161 goto got_it;
1162 s++;
1163 }
1164 else
1165 while (s <= e) {
1166 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1167 && (ln == 1 || !(OP(c) == EXACTF
1168 ? ibcmp(s, m, ln)
1169 : ibcmp_locale(s, m, ln)))
1170 && (norun || regtry(prog, s)) )
1171 goto got_it;
1172 s++;
1173 }
b3c9acc1
IZ
1174 }
1175 break;
bbce6d69 1176 case BOUNDL:
3280af22 1177 PL_reg_flags |= RF_tainted;
bbce6d69 1178 /* FALL THROUGH */
a0d0e21e 1179 case BOUND:
ffc61ed2 1180 if (do_utf8) {
12d33761 1181 if (s == PL_bostr)
ffc61ed2
JH
1182 tmp = '\n';
1183 else {
6136c704 1184 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1185 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1186 }
1187 tmp = ((OP(c) == BOUND ?
9041c2e3 1188 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1189 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1190 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1191 if (tmp == !(OP(c) == BOUND ?
3568d838 1192 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1193 isALNUM_LC_utf8((U8*)s)))
1194 {
1195 tmp = !tmp;
1196 if ((norun || regtry(prog, s)))
1197 goto got_it;
1198 }
078c425b 1199 s += uskip;
a687059c 1200 }
a0d0e21e 1201 }
667bb95a 1202 else {
12d33761 1203 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1204 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1205 while (s < strend) {
1206 if (tmp ==
1207 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1208 tmp = !tmp;
1209 if ((norun || regtry(prog, s)))
1210 goto got_it;
1211 }
1212 s++;
a0ed51b3 1213 }
a0ed51b3 1214 }
6eb5f6b9 1215 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1216 goto got_it;
1217 break;
bbce6d69 1218 case NBOUNDL:
3280af22 1219 PL_reg_flags |= RF_tainted;
bbce6d69 1220 /* FALL THROUGH */
a0d0e21e 1221 case NBOUND:
ffc61ed2 1222 if (do_utf8) {
12d33761 1223 if (s == PL_bostr)
ffc61ed2
JH
1224 tmp = '\n';
1225 else {
6136c704 1226 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1227 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1228 }
1229 tmp = ((OP(c) == NBOUND ?
9041c2e3 1230 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1231 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1232 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1233 if (tmp == !(OP(c) == NBOUND ?
3568d838 1234 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1235 isALNUM_LC_utf8((U8*)s)))
1236 tmp = !tmp;
1237 else if ((norun || regtry(prog, s)))
1238 goto got_it;
078c425b 1239 s += uskip;
ffc61ed2 1240 }
a0d0e21e 1241 }
667bb95a 1242 else {
12d33761 1243 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1244 tmp = ((OP(c) == NBOUND ?
1245 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1246 while (s < strend) {
1247 if (tmp ==
1248 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1249 tmp = !tmp;
1250 else if ((norun || regtry(prog, s)))
1251 goto got_it;
1252 s++;
1253 }
a0ed51b3 1254 }
6eb5f6b9 1255 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1256 goto got_it;
1257 break;
a0d0e21e 1258 case ALNUM:
ffc61ed2 1259 if (do_utf8) {
1a4fad37 1260 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1261 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1262 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1263 if (tmp && (norun || regtry(prog, s)))
1264 goto got_it;
1265 else
1266 tmp = doevery;
1267 }
bbce6d69 1268 else
ffc61ed2 1269 tmp = 1;
078c425b 1270 s += uskip;
bbce6d69 1271 }
bbce6d69 1272 }
ffc61ed2
JH
1273 else {
1274 while (s < strend) {
1275 if (isALNUM(*s)) {
1276 if (tmp && (norun || regtry(prog, s)))
1277 goto got_it;
1278 else
1279 tmp = doevery;
1280 }
a0ed51b3 1281 else
ffc61ed2
JH
1282 tmp = 1;
1283 s++;
a0ed51b3 1284 }
a0ed51b3
LW
1285 }
1286 break;
bbce6d69 1287 case ALNUML:
3280af22 1288 PL_reg_flags |= RF_tainted;
ffc61ed2 1289 if (do_utf8) {
078c425b 1290 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1291 if (isALNUM_LC_utf8((U8*)s)) {
1292 if (tmp && (norun || regtry(prog, s)))
1293 goto got_it;
1294 else
1295 tmp = doevery;
1296 }
a687059c 1297 else
ffc61ed2 1298 tmp = 1;
078c425b 1299 s += uskip;
a0d0e21e 1300 }
a0d0e21e 1301 }
ffc61ed2
JH
1302 else {
1303 while (s < strend) {
1304 if (isALNUM_LC(*s)) {
1305 if (tmp && (norun || regtry(prog, s)))
1306 goto got_it;
1307 else
1308 tmp = doevery;
1309 }
a0ed51b3 1310 else
ffc61ed2
JH
1311 tmp = 1;
1312 s++;
a0ed51b3 1313 }
a0ed51b3
LW
1314 }
1315 break;
a0d0e21e 1316 case NALNUM:
ffc61ed2 1317 if (do_utf8) {
1a4fad37 1318 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1319 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1320 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1321 if (tmp && (norun || regtry(prog, s)))
1322 goto got_it;
1323 else
1324 tmp = doevery;
1325 }
bbce6d69 1326 else
ffc61ed2 1327 tmp = 1;
078c425b 1328 s += uskip;
bbce6d69 1329 }
bbce6d69 1330 }
ffc61ed2
JH
1331 else {
1332 while (s < strend) {
1333 if (!isALNUM(*s)) {
1334 if (tmp && (norun || regtry(prog, s)))
1335 goto got_it;
1336 else
1337 tmp = doevery;
1338 }
a0ed51b3 1339 else
ffc61ed2
JH
1340 tmp = 1;
1341 s++;
a0ed51b3 1342 }
a0ed51b3
LW
1343 }
1344 break;
bbce6d69 1345 case NALNUML:
3280af22 1346 PL_reg_flags |= RF_tainted;
ffc61ed2 1347 if (do_utf8) {
078c425b 1348 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1349 if (!isALNUM_LC_utf8((U8*)s)) {
1350 if (tmp && (norun || regtry(prog, s)))
1351 goto got_it;
1352 else
1353 tmp = doevery;
1354 }
a687059c 1355 else
ffc61ed2 1356 tmp = 1;
078c425b 1357 s += uskip;
a687059c 1358 }
a0d0e21e 1359 }
ffc61ed2
JH
1360 else {
1361 while (s < strend) {
1362 if (!isALNUM_LC(*s)) {
1363 if (tmp && (norun || regtry(prog, s)))
1364 goto got_it;
1365 else
1366 tmp = doevery;
1367 }
a0ed51b3 1368 else
ffc61ed2
JH
1369 tmp = 1;
1370 s++;
a0ed51b3 1371 }
a0ed51b3
LW
1372 }
1373 break;
a0d0e21e 1374 case SPACE:
ffc61ed2 1375 if (do_utf8) {
1a4fad37 1376 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1377 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1378 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1379 if (tmp && (norun || regtry(prog, s)))
1380 goto got_it;
1381 else
1382 tmp = doevery;
1383 }
a0d0e21e 1384 else
ffc61ed2 1385 tmp = 1;
078c425b 1386 s += uskip;
2304df62 1387 }
a0d0e21e 1388 }
ffc61ed2
JH
1389 else {
1390 while (s < strend) {
1391 if (isSPACE(*s)) {
1392 if (tmp && (norun || regtry(prog, s)))
1393 goto got_it;
1394 else
1395 tmp = doevery;
1396 }
a0ed51b3 1397 else
ffc61ed2
JH
1398 tmp = 1;
1399 s++;
a0ed51b3 1400 }
a0ed51b3
LW
1401 }
1402 break;
bbce6d69 1403 case SPACEL:
3280af22 1404 PL_reg_flags |= RF_tainted;
ffc61ed2 1405 if (do_utf8) {
078c425b 1406 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1407 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1408 if (tmp && (norun || regtry(prog, s)))
1409 goto got_it;
1410 else
1411 tmp = doevery;
1412 }
bbce6d69 1413 else
ffc61ed2 1414 tmp = 1;
078c425b 1415 s += uskip;
bbce6d69 1416 }
bbce6d69 1417 }
ffc61ed2
JH
1418 else {
1419 while (s < strend) {
1420 if (isSPACE_LC(*s)) {
1421 if (tmp && (norun || regtry(prog, s)))
1422 goto got_it;
1423 else
1424 tmp = doevery;
1425 }
a0ed51b3 1426 else
ffc61ed2
JH
1427 tmp = 1;
1428 s++;
a0ed51b3 1429 }
a0ed51b3
LW
1430 }
1431 break;
a0d0e21e 1432 case NSPACE:
ffc61ed2 1433 if (do_utf8) {
1a4fad37 1434 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1435 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1436 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1437 if (tmp && (norun || regtry(prog, s)))
1438 goto got_it;
1439 else
1440 tmp = doevery;
1441 }
a0d0e21e 1442 else
ffc61ed2 1443 tmp = 1;
078c425b 1444 s += uskip;
a687059c 1445 }
a0d0e21e 1446 }
ffc61ed2
JH
1447 else {
1448 while (s < strend) {
1449 if (!isSPACE(*s)) {
1450 if (tmp && (norun || regtry(prog, s)))
1451 goto got_it;
1452 else
1453 tmp = doevery;
1454 }
a0ed51b3 1455 else
ffc61ed2
JH
1456 tmp = 1;
1457 s++;
a0ed51b3 1458 }
a0ed51b3
LW
1459 }
1460 break;
bbce6d69 1461 case NSPACEL:
3280af22 1462 PL_reg_flags |= RF_tainted;
ffc61ed2 1463 if (do_utf8) {
078c425b 1464 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1465 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1466 if (tmp && (norun || regtry(prog, s)))
1467 goto got_it;
1468 else
1469 tmp = doevery;
1470 }
bbce6d69 1471 else
ffc61ed2 1472 tmp = 1;
078c425b 1473 s += uskip;
bbce6d69 1474 }
bbce6d69 1475 }
ffc61ed2
JH
1476 else {
1477 while (s < strend) {
1478 if (!isSPACE_LC(*s)) {
1479 if (tmp && (norun || regtry(prog, s)))
1480 goto got_it;
1481 else
1482 tmp = doevery;
1483 }
a0ed51b3 1484 else
ffc61ed2
JH
1485 tmp = 1;
1486 s++;
a0ed51b3 1487 }
a0ed51b3
LW
1488 }
1489 break;
a0d0e21e 1490 case DIGIT:
ffc61ed2 1491 if (do_utf8) {
1a4fad37 1492 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1493 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1494 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1495 if (tmp && (norun || regtry(prog, s)))
1496 goto got_it;
1497 else
1498 tmp = doevery;
1499 }
a0d0e21e 1500 else
ffc61ed2 1501 tmp = 1;
078c425b 1502 s += uskip;
2b69d0c2 1503 }
a0d0e21e 1504 }
ffc61ed2
JH
1505 else {
1506 while (s < strend) {
1507 if (isDIGIT(*s)) {
1508 if (tmp && (norun || regtry(prog, s)))
1509 goto got_it;
1510 else
1511 tmp = doevery;
1512 }
a0ed51b3 1513 else
ffc61ed2
JH
1514 tmp = 1;
1515 s++;
a0ed51b3 1516 }
a0ed51b3
LW
1517 }
1518 break;
b8c5462f
JH
1519 case DIGITL:
1520 PL_reg_flags |= RF_tainted;
ffc61ed2 1521 if (do_utf8) {
078c425b 1522 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1523 if (isDIGIT_LC_utf8((U8*)s)) {
1524 if (tmp && (norun || regtry(prog, s)))
1525 goto got_it;
1526 else
1527 tmp = doevery;
1528 }
b8c5462f 1529 else
ffc61ed2 1530 tmp = 1;
078c425b 1531 s += uskip;
b8c5462f 1532 }
b8c5462f 1533 }
ffc61ed2
JH
1534 else {
1535 while (s < strend) {
1536 if (isDIGIT_LC(*s)) {
1537 if (tmp && (norun || regtry(prog, s)))
1538 goto got_it;
1539 else
1540 tmp = doevery;
1541 }
b8c5462f 1542 else
ffc61ed2
JH
1543 tmp = 1;
1544 s++;
b8c5462f 1545 }
b8c5462f
JH
1546 }
1547 break;
a0d0e21e 1548 case NDIGIT:
ffc61ed2 1549 if (do_utf8) {
1a4fad37 1550 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1551 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1552 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1553 if (tmp && (norun || regtry(prog, s)))
1554 goto got_it;
1555 else
1556 tmp = doevery;
1557 }
a0d0e21e 1558 else
ffc61ed2 1559 tmp = 1;
078c425b 1560 s += uskip;
a687059c 1561 }
a0d0e21e 1562 }
ffc61ed2
JH
1563 else {
1564 while (s < strend) {
1565 if (!isDIGIT(*s)) {
1566 if (tmp && (norun || regtry(prog, s)))
1567 goto got_it;
1568 else
1569 tmp = doevery;
1570 }
a0ed51b3 1571 else
ffc61ed2
JH
1572 tmp = 1;
1573 s++;
a0ed51b3 1574 }
a0ed51b3
LW
1575 }
1576 break;
b8c5462f
JH
1577 case NDIGITL:
1578 PL_reg_flags |= RF_tainted;
ffc61ed2 1579 if (do_utf8) {
078c425b 1580 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1581 if (!isDIGIT_LC_utf8((U8*)s)) {
1582 if (tmp && (norun || regtry(prog, s)))
1583 goto got_it;
1584 else
1585 tmp = doevery;
1586 }
b8c5462f 1587 else
ffc61ed2 1588 tmp = 1;
078c425b 1589 s += uskip;
b8c5462f 1590 }
a0ed51b3 1591 }
ffc61ed2
JH
1592 else {
1593 while (s < strend) {
1594 if (!isDIGIT_LC(*s)) {
1595 if (tmp && (norun || regtry(prog, s)))
1596 goto got_it;
1597 else
1598 tmp = doevery;
1599 }
cf93c79d 1600 else
ffc61ed2
JH
1601 tmp = 1;
1602 s++;
b8c5462f 1603 }
b8c5462f
JH
1604 }
1605 break;
b3c9acc1 1606 default:
3c3eec57
GS
1607 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1608 break;
d6a28714 1609 }
6eb5f6b9
JH
1610 return 0;
1611 got_it:
1612 return s;
1613}
1614
1615/*
1616 - regexec_flags - match a regexp against a string
1617 */
1618I32
1619Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1620 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1621/* strend: pointer to null at end of string */
1622/* strbeg: real beginning of string */
1623/* minend: end of match must be >=minend after stringarg. */
1624/* data: May be used for some additional optimizations. */
1625/* nosave: For optimizations. */
1626{
97aff369 1627 dVAR;
6eb5f6b9
JH
1628 register char *s;
1629 register regnode *c;
1630 register char *startpos = stringarg;
6eb5f6b9
JH
1631 I32 minlen; /* must match at least this many chars */
1632 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1633 I32 end_shift = 0; /* Same for the end. */ /* CC */
1634 I32 scream_pos = -1; /* Internal iterator of scream. */
1635 char *scream_olds;
1636 SV* oreplsv = GvSV(PL_replgv);
1df70142 1637 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1638 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1639#ifdef DEBUGGING
6136c704
AL
1640 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1641 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1642#endif
a3621e74
YO
1643
1644 GET_RE_DEBUG_FLAGS_DECL;
1645
9d4ba2ae 1646 PERL_UNUSED_ARG(data);
a30b2f1f 1647 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1648
1649 PL_regcc = 0;
1650
1651 cache_re(prog);
1652#ifdef DEBUGGING
aea4f609 1653 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1654#endif
1655
1656 /* Be paranoid... */
1657 if (prog == NULL || startpos == NULL) {
1658 Perl_croak(aTHX_ "NULL regexp parameter");
1659 return 0;
1660 }
1661
1662 minlen = prog->minlen;
61a36c01 1663 if (strend - startpos < minlen) {
a3621e74 1664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1665 "String too short [regexec_flags]...\n"));
1666 goto phooey;
1aa99e6b 1667 }
6eb5f6b9 1668
6eb5f6b9
JH
1669 /* Check validity of program. */
1670 if (UCHARAT(prog->program) != REG_MAGIC) {
1671 Perl_croak(aTHX_ "corrupted regexp program");
1672 }
1673
1674 PL_reg_flags = 0;
1675 PL_reg_eval_set = 0;
1676 PL_reg_maxiter = 0;
1677
1678 if (prog->reganch & ROPT_UTF8)
1679 PL_reg_flags |= RF_utf8;
1680
1681 /* Mark beginning of line for ^ and lookbehind. */
1682 PL_regbol = startpos;
1683 PL_bostr = strbeg;
1684 PL_reg_sv = sv;
1685
1686 /* Mark end of line for $ (and such) */
1687 PL_regeol = strend;
1688
1689 /* see how far we have to get to not match where we matched before */
1690 PL_regtill = startpos+minend;
1691
1692 /* We start without call_cc context. */
1693 PL_reg_call_cc = 0;
1694
1695 /* If there is a "must appear" string, look for it. */
1696 s = startpos;
1697
1698 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1699 MAGIC *mg;
1700
1701 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1702 PL_reg_ganch = startpos;
1703 else if (sv && SvTYPE(sv) >= SVt_PVMG
1704 && SvMAGIC(sv)
14befaf4
DM
1705 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1706 && mg->mg_len >= 0) {
6eb5f6b9
JH
1707 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1708 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1709 if (s > PL_reg_ganch)
6eb5f6b9
JH
1710 goto phooey;
1711 s = PL_reg_ganch;
1712 }
1713 }
1714 else /* pos() not defined */
1715 PL_reg_ganch = strbeg;
1716 }
1717
33b8afdf 1718 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1719 re_scream_pos_data d;
1720
1721 d.scream_olds = &scream_olds;
1722 d.scream_pos = &scream_pos;
1723 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1724 if (!s) {
a3621e74 1725 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1726 goto phooey; /* not present */
3fa9c3d7 1727 }
6eb5f6b9
JH
1728 }
1729
a3621e74 1730 DEBUG_EXECUTE_r({
1df70142
AL
1731 const char * const s0 = UTF
1732 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1733 UNI_DISPLAY_REGEX)
1734 : prog->precomp;
1735 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1736 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1737 UNI_DISPLAY_REGEX) : startpos;
1df70142 1738 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1739 if (!PL_colorset)
1740 reginitcolors();
1741 PerlIO_printf(Perl_debug_log,
a0288114 1742 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1743 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1744 len0, len0, s0,
2a782b5b 1745 PL_colors[1],
9e55ce06 1746 len0 > 60 ? "..." : "",
2a782b5b 1747 PL_colors[0],
9e55ce06
JH
1748 (int)(len1 > 60 ? 60 : len1),
1749 s1, PL_colors[1],
1750 (len1 > 60 ? "..." : "")
2a782b5b
JH
1751 );
1752 });
6eb5f6b9
JH
1753
1754 /* Simplest case: anchored match need be tried only once. */
1755 /* [unless only anchor is BOL and multiline is set] */
1756 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1757 if (s == startpos && regtry(prog, startpos))
1758 goto got_it;
7fba1cd6 1759 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1760 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1761 {
1762 char *end;
1763
1764 if (minlen)
1765 dontbother = minlen - 1;
1aa99e6b 1766 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1767 /* for multiline we only have to try after newlines */
33b8afdf 1768 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1769 if (s == startpos)
1770 goto after_try;
1771 while (1) {
1772 if (regtry(prog, s))
1773 goto got_it;
1774 after_try:
1775 if (s >= end)
1776 goto phooey;
1777 if (prog->reganch & RE_USE_INTUIT) {
1778 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1779 if (!s)
1780 goto phooey;
1781 }
1782 else
1783 s++;
1784 }
1785 } else {
1786 if (s > startpos)
1787 s--;
1788 while (s < end) {
1789 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1790 if (regtry(prog, s))
1791 goto got_it;
1792 }
1793 }
1794 }
1795 }
1796 goto phooey;
1797 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1798 if (regtry(prog, PL_reg_ganch))
1799 goto got_it;
1800 goto phooey;
1801 }
1802
1803 /* Messy cases: unanchored match. */
33b8afdf 1804 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1805 /* we have /x+whatever/ */
1806 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1807 char ch;
bf93d4cc
GS
1808#ifdef DEBUGGING
1809 int did_match = 0;
1810#endif
33b8afdf
JH
1811 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1812 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1813 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1814
1aa99e6b 1815 if (do_utf8) {
6eb5f6b9
JH
1816 while (s < strend) {
1817 if (*s == ch) {
a3621e74 1818 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1819 if (regtry(prog, s)) goto got_it;
1820 s += UTF8SKIP(s);
1821 while (s < strend && *s == ch)
1822 s += UTF8SKIP(s);
1823 }
1824 s += UTF8SKIP(s);
1825 }
1826 }
1827 else {
1828 while (s < strend) {
1829 if (*s == ch) {
a3621e74 1830 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1831 if (regtry(prog, s)) goto got_it;
1832 s++;
1833 while (s < strend && *s == ch)
1834 s++;
1835 }
1836 s++;
1837 }
1838 }
a3621e74 1839 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1840 PerlIO_printf(Perl_debug_log,
b7953727
JH
1841 "Did not find anchored character...\n")
1842 );
6eb5f6b9 1843 }
33b8afdf
JH
1844 else if (prog->anchored_substr != Nullsv
1845 || prog->anchored_utf8 != Nullsv
1846 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1847 && prog->float_max_offset < strend - s)) {
1848 SV *must;
1849 I32 back_max;
1850 I32 back_min;
1851 char *last;
6eb5f6b9 1852 char *last1; /* Last position checked before */
bf93d4cc
GS
1853#ifdef DEBUGGING
1854 int did_match = 0;
1855#endif
33b8afdf
JH
1856 if (prog->anchored_substr || prog->anchored_utf8) {
1857 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1858 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1859 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1860 back_max = back_min = prog->anchored_offset;
1861 } else {
1862 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1863 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1864 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1865 back_max = prog->float_max_offset;
1866 back_min = prog->float_min_offset;
1867 }
1868 if (must == &PL_sv_undef)
1869 /* could not downgrade utf8 check substring, so must fail */
1870 goto phooey;
1871
1872 last = HOP3c(strend, /* Cannot start after this */
1873 -(I32)(CHR_SVLEN(must)
1874 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1875
1876 if (s > PL_bostr)
1877 last1 = HOPc(s, -1);
1878 else
1879 last1 = s - 1; /* bogus */
1880
a0288114 1881 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1882 check_substr==must. */
1883 scream_pos = -1;
1884 dontbother = end_shift;
1885 strend = HOPc(strend, -dontbother);
1886 while ( (s <= last) &&
9041c2e3 1887 ((flags & REXEC_SCREAM)
1aa99e6b 1888 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1889 end_shift, &scream_pos, 0))
1aa99e6b 1890 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1891 (unsigned char*)strend, must,
7fba1cd6 1892 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1893 /* we may be pointing at the wrong string */
1894 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1895 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1896 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1897 if (HOPc(s, -back_max) > last1) {
1898 last1 = HOPc(s, -back_min);
1899 s = HOPc(s, -back_max);
1900 }
1901 else {
1902 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1903
1904 last1 = HOPc(s, -back_min);
1905 s = t;
1906 }
1aa99e6b 1907 if (do_utf8) {
6eb5f6b9
JH
1908 while (s <= last1) {
1909 if (regtry(prog, s))
1910 goto got_it;
1911 s += UTF8SKIP(s);
1912 }
1913 }
1914 else {
1915 while (s <= last1) {
1916 if (regtry(prog, s))
1917 goto got_it;
1918 s++;
1919 }
1920 }
1921 }
a3621e74 1922 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1923 PerlIO_printf(Perl_debug_log,
a0288114 1924 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1925 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1926 ? "anchored" : "floating"),
1927 PL_colors[0],
1928 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1929 SvPVX_const(must),
b7953727
JH
1930 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1931 );
6eb5f6b9
JH
1932 goto phooey;
1933 }
155aba94 1934 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1935 if (minlen) {
1936 I32 op = (U8)OP(prog->regstclass);
66e933ab 1937 /* don't bother with what can't match */
f14c76ed
RGS
1938 if (PL_regkind[op] != EXACT && op != CANY)
1939 strend = HOPc(strend, -(minlen - 1));
1940 }
a3621e74 1941 DEBUG_EXECUTE_r({
ffc61ed2 1942 SV *prop = sv_newmortal();
cfd0369c
NC
1943 const char *s0;
1944 const char *s1;
9e55ce06
JH
1945 int len0;
1946 int len1;
1947
ffc61ed2 1948 regprop(prop, c);
9e55ce06 1949 s0 = UTF ?
3f7c398e 1950 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1951 UNI_DISPLAY_REGEX) :
cfd0369c 1952 SvPVX_const(prop);
9e55ce06
JH
1953 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1954 s1 = UTF ?
c728cb41 1955 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1956 len1 = UTF ? SvCUR(dsv1) : strend - s;
1957 PerlIO_printf(Perl_debug_log,
a0288114 1958 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1959 len0, len0, s0,
1960 len1, len1, s1);
ffc61ed2 1961 });
06b5626a 1962 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1963 goto got_it;
a3621e74 1964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1965 }
1966 else {
1967 dontbother = 0;
33b8afdf
JH
1968 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1969 /* Trim the end. */
d6a28714 1970 char *last;
33b8afdf
JH
1971 SV* float_real;
1972
1973 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1974 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1975 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1976
1977 if (flags & REXEC_SCREAM) {
33b8afdf 1978 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1979 end_shift, &scream_pos, 1); /* last one */
1980 if (!last)
ffc61ed2 1981 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1982 /* we may be pointing at the wrong string */
1983 else if (RX_MATCH_COPIED(prog))
3f7c398e 1984 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1985 }
d6a28714
JH
1986 else {
1987 STRLEN len;
cfd0369c 1988 const char * const little = SvPV_const(float_real, len);
d6a28714 1989
33b8afdf 1990 if (SvTAIL(float_real)) {
d6a28714
JH
1991 if (memEQ(strend - len + 1, little, len - 1))
1992 last = strend - len + 1;
7fba1cd6 1993 else if (!multiline)
9041c2e3 1994 last = memEQ(strend - len, little, len)
d6a28714 1995 ? strend - len : Nullch;
b8c5462f 1996 else
d6a28714
JH
1997 goto find_last;
1998 } else {
1999 find_last:
9041c2e3 2000 if (len)
d6a28714 2001 last = rninstr(s, strend, little, little + len);
b8c5462f 2002 else
a0288114 2003 last = strend; /* matching "$" */
b8c5462f 2004 }
b8c5462f 2005 }
bf93d4cc 2006 if (last == NULL) {
a3621e74 2007 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2008 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2009 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2010 goto phooey; /* Should not happen! */
2011 }
d6a28714
JH
2012 dontbother = strend - last + prog->float_min_offset;
2013 }
2014 if (minlen && (dontbother < minlen))
2015 dontbother = minlen - 1;
2016 strend -= dontbother; /* this one's always in bytes! */
2017 /* We don't know much -- general case. */
1aa99e6b 2018 if (do_utf8) {
d6a28714
JH
2019 for (;;) {
2020 if (regtry(prog, s))
2021 goto got_it;
2022 if (s >= strend)
2023 break;
b8c5462f 2024 s += UTF8SKIP(s);
d6a28714
JH
2025 };
2026 }
2027 else {
2028 do {
2029 if (regtry(prog, s))
2030 goto got_it;
2031 } while (s++ < strend);
2032 }
2033 }
2034
2035 /* Failure. */
2036 goto phooey;
2037
2038got_it:
2039 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2040
2041 if (PL_reg_eval_set) {
2042 /* Preserve the current value of $^R */
2043 if (oreplsv != GvSV(PL_replgv))
2044 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2045 restored, the value remains
2046 the same. */
acfe0abc 2047 restore_pos(aTHX_ 0);
d6a28714
JH
2048 }
2049
2050 /* make sure $`, $&, $', and $digit will work later */
2051 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2052 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2053 if (flags & REXEC_COPY_STR) {
2054 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2055#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2056 if ((SvIsCOW(sv)
2057 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2058 if (DEBUG_C_TEST) {
2059 PerlIO_printf(Perl_debug_log,
2060 "Copy on write: regexp capture, type %d\n",
2061 (int) SvTYPE(sv));
2062 }
2063 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2064 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2065 assert (SvPOKp(prog->saved_copy));
2066 } else
2067#endif
2068 {
2069 RX_MATCH_COPIED_on(prog);
2070 s = savepvn(strbeg, i);
2071 prog->subbeg = s;
2072 }
d6a28714 2073 prog->sublen = i;
d6a28714
JH
2074 }
2075 else {
2076 prog->subbeg = strbeg;
2077 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2078 }
2079 }
9041c2e3 2080
d6a28714
JH
2081 return 1;
2082
2083phooey:
a3621e74 2084 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2085 PL_colors[4], PL_colors[5]));
d6a28714 2086 if (PL_reg_eval_set)
acfe0abc 2087 restore_pos(aTHX_ 0);
d6a28714
JH
2088 return 0;
2089}
2090
2091/*
2092 - regtry - try match at specific point
2093 */
2094STATIC I32 /* 0 failure, 1 success */
2095S_regtry(pTHX_ regexp *prog, char *startpos)
2096{
97aff369 2097 dVAR;
d6a28714
JH
2098 register I32 i;
2099 register I32 *sp;
2100 register I32 *ep;
2101 CHECKPOINT lastcp;
a3621e74 2102 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2103
02db2b7b
IZ
2104#ifdef DEBUGGING
2105 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2106#endif
d6a28714
JH
2107 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2108 MAGIC *mg;
2109
2110 PL_reg_eval_set = RS_init;
a3621e74 2111 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2112 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2113 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2114 ));
e8347627 2115 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2116 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2117 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2118 SAVETMPS;
2119 /* Apparently this is not needed, judging by wantarray. */
e8347627 2120 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2121 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2122
2123 if (PL_reg_sv) {
2124 /* Make $_ available to executed code. */
2125 if (PL_reg_sv != DEFSV) {
59f00321 2126 SAVE_DEFSV;
d6a28714 2127 DEFSV = PL_reg_sv;
b8c5462f 2128 }
d6a28714 2129
9041c2e3 2130 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2131 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2132 /* prepare for quick setting of pos */
14befaf4
DM
2133 sv_magic(PL_reg_sv, (SV*)0,
2134 PERL_MAGIC_regex_global, Nullch, 0);
2135 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2136 mg->mg_len = -1;
b8c5462f 2137 }
d6a28714
JH
2138 PL_reg_magic = mg;
2139 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2140 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2141 }
09687e5a 2142 if (!PL_reg_curpm) {
a02a5408 2143 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2144#ifdef USE_ITHREADS
2145 {
2146 SV* repointer = newSViv(0);
577e12cc 2147 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2148 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2149 av_push(PL_regex_padav,repointer);
2150 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2151 PL_regex_pad = AvARRAY(PL_regex_padav);
2152 }
2153#endif
2154 }
aaa362c4 2155 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2156 PL_reg_oldcurpm = PL_curpm;
2157 PL_curpm = PL_reg_curpm;
2158 if (RX_MATCH_COPIED(prog)) {
2159 /* Here is a serious problem: we cannot rewrite subbeg,
2160 since it may be needed if this match fails. Thus
2161 $` inside (?{}) could fail... */
2162 PL_reg_oldsaved = prog->subbeg;
2163 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2164#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2165 PL_nrs = prog->saved_copy;
2166#endif
d6a28714
JH
2167 RX_MATCH_COPIED_off(prog);
2168 }
2169 else
2170 PL_reg_oldsaved = Nullch;
2171 prog->subbeg = PL_bostr;
2172 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2173 }
973dddac 2174 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2175 PL_reginput = startpos;
2176 PL_regstartp = prog->startp;
2177 PL_regendp = prog->endp;
2178 PL_reglastparen = &prog->lastparen;
a01268b5 2179 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2180 prog->lastparen = 0;
03994de8 2181 prog->lastcloseparen = 0;
d6a28714 2182 PL_regsize = 0;
a3621e74 2183 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2184 if (PL_reg_start_tmpl <= prog->nparens) {
2185 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2186 if(PL_reg_start_tmp)
2187 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2188 else
a02a5408 2189 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2190 }
2191
2192 /* XXXX What this code is doing here?!!! There should be no need
2193 to do this again and again, PL_reglastparen should take care of
3dd2943c 2194 this! --ilya*/
dafc8851
JH
2195
2196 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2197 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2198 * PL_reglastparen), is not needed at all by the test suite
2199 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2200 * enough, for building DynaLoader, or otherwise this
2201 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2202 * will happen. Meanwhile, this code *is* needed for the
2203 * above-mentioned test suite tests to succeed. The common theme
2204 * on those tests seems to be returning null fields from matches.
2205 * --jhi */
dafc8851 2206#if 1
d6a28714
JH
2207 sp = prog->startp;
2208 ep = prog->endp;
2209 if (prog->nparens) {
eb160463 2210 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2211 *++sp = -1;
2212 *++ep = -1;
2213 }
2214 }
dafc8851 2215#endif
02db2b7b 2216 REGCP_SET(lastcp);
d6a28714
JH
2217 if (regmatch(prog->program + 1)) {
2218 prog->endp[0] = PL_reginput - PL_bostr;
2219 return 1;
2220 }
02db2b7b 2221 REGCP_UNWIND(lastcp);
d6a28714
JH
2222 return 0;
2223}
2224
02db2b7b
IZ
2225#define RE_UNWIND_BRANCH 1
2226#define RE_UNWIND_BRANCHJ 2
2227
2228union re_unwind_t;
2229
2230typedef struct { /* XX: makes sense to enlarge it... */
2231 I32 type;
2232 I32 prev;
2233 CHECKPOINT lastcp;
2234} re_unwind_generic_t;
2235
2236typedef struct {
2237 I32 type;
2238 I32 prev;
2239 CHECKPOINT lastcp;
2240 I32 lastparen;
2241 regnode *next;
2242 char *locinput;
2243 I32 nextchr;
2244#ifdef DEBUGGING
2245 int regindent;
2246#endif
2247} re_unwind_branch_t;
2248
2249typedef union re_unwind_t {
2250 I32 type;
2251 re_unwind_generic_t generic;
2252 re_unwind_branch_t branch;
2253} re_unwind_t;
2254
8ba1375e
MJD
2255#define sayYES goto yes
2256#define sayNO goto no
e0f9d4a8 2257#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2258#define sayYES_FINAL goto yes_final
2259#define sayYES_LOUD goto yes_loud
2260#define sayNO_FINAL goto no_final
2261#define sayNO_SILENT goto do_no
2262#define saySAME(x) if (x) goto yes; else goto no
2263
3ab3c9b4
HS
2264#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2265#define POSCACHE_SEEN 1 /* we know what we're caching */
2266#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2267#define CACHEsayYES STMT_START { \
2268 if (cache_offset | cache_bit) { \
2269 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2270 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2271 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2272 /* cache records failure, but this is success */ \
2273 DEBUG_r( \
2274 PerlIO_printf(Perl_debug_log, \
2275 "%*s (remove success from failure cache)\n", \
2276 REPORT_CODE_OFF+PL_regindent*2, "") \
2277 ); \
2278 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2279 } \
2280 } \
2281 sayYES; \
2282} STMT_END
2283#define CACHEsayNO STMT_START { \
2284 if (cache_offset | cache_bit) { \
2285 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2286 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2287 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2288 /* cache records success, but this is failure */ \
2289 DEBUG_r( \
2290 PerlIO_printf(Perl_debug_log, \
2291 "%*s (remove failure from success cache)\n", \
2292 REPORT_CODE_OFF+PL_regindent*2, "") \
2293 ); \
2294 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2295 } \
2296 } \
2297 sayNO; \
2298} STMT_END
2299
a3621e74
YO
2300/* this is used to determine how far from the left messages like
2301 'failed...' are printed. Currently 29 makes these messages line
2302 up with the opcode they refer to. Earlier perls used 25 which
2303 left these messages outdented making reviewing a debug output
2304 quite difficult.
2305*/
2306#define REPORT_CODE_OFF 29
2307
2308
2309/* Make sure there is a test for this +1 options in re_tests */
2310#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2311
2312#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2313 if ( trie->states[ state ].wordnum ) { \
2314 if ( !accepted ) { \
2315 ENTER; \
2316 SAVETMPS; \
2317 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
561b68a9 2318 sv_accept_buff=newSV(bufflen * sizeof(reg_trie_accepted) - 1 );\
a3621e74
YO
2319 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2320 SvPOK_on( sv_accept_buff ); \
2321 sv_2mortal( sv_accept_buff ); \
2322 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2323 } else { \
2324 if ( accepted >= bufflen ) { \
2325 bufflen *= 2; \
2326 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2327 bufflen * sizeof(reg_trie_accepted) ); \
2328 } \
2329 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2330 + sizeof( reg_trie_accepted ) ); \
2331 } \
2332 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2333 accept_buff[ accepted ].endpos = uc; \
2334 ++accepted; \
2335 } } STMT_END
2336
2337#define TRIE_HANDLE_CHAR STMT_START { \
2338 if ( uvc < 256 ) { \
2339 charid = trie->charmap[ uvc ]; \
2340 } else { \
2341 charid = 0; \
2342 if( trie->widecharmap ) { \
2343 SV** svpp = (SV**)NULL; \
2344 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2345 sizeof( UV ), 0 ); \
2346 if ( svpp ) { \
2347 charid = (U16)SvIV( *svpp ); \
2348 } \
2349 } \
2350 } \
2351 if ( charid && \
cc601c31
YO
2352 ( base + charid > trie->uniquecharcount ) && \
2353 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2354 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2355 { \
2356 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2357 } else { \
2358 state = 0; \
2359 } \
2360 uc += len; \
2361 } STMT_END
8ba1375e 2362
d6a28714
JH
2363/*
2364 - regmatch - main matching routine
2365 *
2366 * Conceptually the strategy is simple: check to see whether the current
2367 * node matches, call self recursively to see whether the rest matches,
2368 * and then act accordingly. In practice we make some effort to avoid
2369 * recursion, in particular by going through "ordinary" nodes (that don't
2370 * need to know whether the rest of the match failed) by a loop instead of
2371 * by recursion.
2372 */
2373/* [lwall] I've hoisted the register declarations to the outer block in order to
2374 * maybe save a little bit of pushing and popping on the stack. It also takes
2375 * advantage of machines that use a register save mask on subroutine entry.
2376 */
2377STATIC I32 /* 0 failure, 1 success */
2378S_regmatch(pTHX_ regnode *prog)
2379{
27da23d5 2380 dVAR;
d6a28714
JH
2381 register regnode *scan; /* Current node. */
2382 regnode *next; /* Next node. */
2383 regnode *inner; /* Next node in internal branch. */
2384 register I32 nextchr; /* renamed nextchr - nextchar colides with
2385 function of same name */
2386 register I32 n; /* no or next */
b7953727
JH
2387 register I32 ln = 0; /* len or last */
2388 register char *s = Nullch; /* operand or save */
d6a28714 2389 register char *locinput = PL_reginput;
b7953727 2390 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2391 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2392 I32 unwind = 0;
a3621e74
YO
2393
2394 /* used by the trie code */
cbbf8932
AL
2395 SV *sv_accept_buff = NULL; /* accepting states we have traversed */
2396 reg_trie_accepted *accept_buff = NULL; /* "" */
ab74612d
NC
2397 reg_trie_data *trie; /* what trie are we using right now */
2398 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2399
b7953727 2400#if 0
02db2b7b 2401 I32 firstcp = PL_savestack_ix;
b7953727 2402#endif
0d46e09a 2403 register const bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2404#ifdef DEBUGGING
6136c704
AL
2405 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
2406 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2407 SV * const dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2408
ab74612d 2409 SV *re_debug_flags = NULL;
2a782b5b 2410#endif
041457d9 2411 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
02db2b7b 2412
a3621e74
YO
2413 GET_RE_DEBUG_FLAGS;
2414
d6a28714
JH
2415#ifdef DEBUGGING
2416 PL_regindent++;
2417#endif
2418
a3621e74 2419
d6a28714
JH
2420 /* Note that nextchr is a byte even in UTF */
2421 nextchr = UCHARAT(locinput);
2422 scan = prog;
2423 while (scan != NULL) {
8ba1375e 2424
a3621e74 2425 DEBUG_EXECUTE_r( {
6136c704 2426 SV * const prop = sv_newmortal();
1df70142
AL
2427 const int docolor = *PL_colors[0];
2428 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2429 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2430 /* The part of the string before starttry has one color
2431 (pref0_len chars), between starttry and current
2432 position another one (pref_len - pref0_len chars),
2433 after the current position the third one.
2434 We assume that pref0_len <= pref_len, otherwise we
2435 decrease pref0_len. */
9041c2e3 2436 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2437 ? (5 + taill) - l : locinput - PL_bostr;
2438 int pref0_len;
d6a28714 2439
df1ffd02 2440 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2441 pref_len++;
2442 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2443 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2444 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2445 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2446 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2447 l--;
d6a28714
JH
2448 if (pref0_len < 0)
2449 pref0_len = 0;
2450 if (pref0_len > pref_len)
2451 pref0_len = pref_len;
2452 regprop(prop, scan);
2a782b5b 2453 {
1df70142 2454 const char * const s0 =
f14c76ed 2455 do_utf8 && OP(scan) != CANY ?
2a782b5b 2456 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2457 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2458 locinput - pref_len;
1df70142
AL
2459 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2460 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2461 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2462 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2463 locinput - pref_len + pref0_len;
1df70142
AL
2464 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2465 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2466 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2467 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2468 locinput;
1df70142 2469 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2470 PerlIO_printf(Perl_debug_log,
2471 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2472 (IV)(locinput - PL_bostr),
2473 PL_colors[4],
2474 len0, s0,
2475 PL_colors[5],
2476 PL_colors[2],
2477 len1, s1,
2478 PL_colors[3],
2479 (docolor ? "" : "> <"),
2480 PL_colors[0],
2481 len2, s2,
2482 PL_colors[1],
2483 15 - l - pref_len + 1,
2484 "",
2485 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2486 SvPVX_const(prop));
2a782b5b
JH
2487 }
2488 });
d6a28714
JH
2489
2490 next = scan + NEXT_OFF(scan);
2491 if (next == scan)
2492 next = NULL;
2493
2494 switch (OP(scan)) {
2495 case BOL:
7fba1cd6 2496 if (locinput == PL_bostr)
d6a28714
JH
2497 {
2498 /* regtill = regbol; */
b8c5462f
JH
2499 break;
2500 }
d6a28714
JH
2501 sayNO;
2502 case MBOL:
12d33761
HS
2503 if (locinput == PL_bostr ||
2504 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2505 {
b8c5462f
JH
2506 break;
2507 }
d6a28714
JH
2508 sayNO;
2509 case SBOL:
c2a73568 2510 if (locinput == PL_bostr)
b8c5462f 2511 break;
d6a28714
JH
2512 sayNO;
2513 case GPOS:
2514 if (locinput == PL_reg_ganch)
2515 break;
2516 sayNO;
2517 case EOL:
d6a28714
JH
2518 goto seol;
2519 case MEOL:
d6a28714 2520 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2521 sayNO;
b8c5462f 2522 break;
d6a28714
JH
2523 case SEOL:
2524 seol:
2525 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2526 sayNO;
d6a28714 2527 if (PL_regeol - locinput > 1)
b8c5462f 2528 sayNO;
b8c5462f 2529 break;
d6a28714
JH
2530 case EOS:
2531 if (PL_regeol != locinput)
b8c5462f 2532 sayNO;
d6a28714 2533 break;
ffc61ed2 2534 case SANY:
d6a28714 2535 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2536 sayNO;
f33976b4
DB
2537 if (do_utf8) {
2538 locinput += PL_utf8skip[nextchr];
2539 if (locinput > PL_regeol)
2540 sayNO;
2541 nextchr = UCHARAT(locinput);
2542 }
2543 else
2544 nextchr = UCHARAT(++locinput);
2545 break;
2546 case CANY:
2547 if (!nextchr && locinput >= PL_regeol)
2548 sayNO;
b8c5462f 2549 nextchr = UCHARAT(++locinput);
a0d0e21e 2550 break;
ffc61ed2 2551 case REG_ANY:
1aa99e6b
IH
2552 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2553 sayNO;
2554 if (do_utf8) {
b8c5462f 2555 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2556 if (locinput > PL_regeol)
2557 sayNO;
a0ed51b3 2558 nextchr = UCHARAT(locinput);
a0ed51b3 2559 }
1aa99e6b
IH
2560 else
2561 nextchr = UCHARAT(++locinput);
a0ed51b3 2562 break;
a3621e74
YO
2563
2564
2565
2566 /*
2567 traverse the TRIE keeping track of all accepting states
2568 we transition through until we get to a failing node.
2569
2570 we use two slightly different pieces of code to handle
2571 the traversal depending on whether its case sensitive or
2572 not. we reuse the accept code however. (this should probably
2573 be turned into a macro.)
2574
2575 */
2576 case TRIEF:
2577 case TRIEFL:
2578 {
a3621e74
YO
2579 U8 *uc = ( U8* )locinput;
2580 U32 state = 1;
2581 U16 charid = 0;
2582 U32 base = 0;
2583 UV uvc = 0;
2584 STRLEN len = 0;
2585 STRLEN foldlen = 0;
a3621e74
YO
2586 U8 *uscan = (U8*)NULL;
2587 STRLEN bufflen=0;
2588 accepted = 0;
2589
2590 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2591
2592 while ( state && uc <= (U8*)PL_regeol ) {
2593
2594 TRIE_CHECK_STATE_IS_ACCEPTING;
2595
2596 base = trie->states[ state ].trans.base;
2597
2598 DEBUG_TRIE_EXECUTE_r(
2599 PerlIO_printf( Perl_debug_log,
e4584336 2600 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2601 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2602 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2603 );
2604
2605 if ( base ) {
2606
108bb1ad 2607 if ( do_utf8 ) {
a3621e74
YO
2608 if ( foldlen>0 ) {
2609 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2610 foldlen -= len;
2611 uscan += len;
2612 len=0;
2613 } else {
1df70142 2614 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2615 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2616 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2617 foldlen -= UNISKIP( uvc );
2618 uscan = foldbuf + UNISKIP( uvc );
2619 }
2620 } else {
e4584336 2621 uvc = (UV)*uc;
a3621e74
YO
2622 len = 1;
2623 }
2624
2625 TRIE_HANDLE_CHAR;
2626
2627 } else {
2628 state = 0;
2629 }
2630 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2631 PerlIO_printf( Perl_debug_log,
2632 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2633 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2634 );
2635 }
2636 if ( !accepted ) {
2637 sayNO;
2638 } else {
2639 goto TrieAccept;
2640 }
2641 }
2642 /* unreached codepoint: we jump into the middle of the next case
2643 from previous if blocks */
2644 case TRIE:
2645 {
a3621e74
YO
2646 U8 *uc = (U8*)locinput;
2647 U32 state = 1;
2648 U16 charid = 0;
2649 U32 base = 0;
2650 UV uvc = 0;
2651 STRLEN len = 0;
2652 STRLEN bufflen = 0;
2653 accepted = 0;
2654
2655 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2656
2657 while ( state && uc <= (U8*)PL_regeol ) {
2658
2659 TRIE_CHECK_STATE_IS_ACCEPTING;
2660
2661 base = trie->states[ state ].trans.base;
2662
2663 DEBUG_TRIE_EXECUTE_r(
2664 PerlIO_printf( Perl_debug_log,
e4584336 2665 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2666 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2667 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2668 );
2669
2670 if ( base ) {
2671
108bb1ad 2672 if ( do_utf8 ) {
a3621e74
YO
2673 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2674 } else {
2675 uvc = (U32)*uc;
2676 len = 1;
2677 }
2678
2679 TRIE_HANDLE_CHAR;
2680
2681 } else {
2682 state = 0;
2683 }
2684 DEBUG_TRIE_EXECUTE_r(
2685 PerlIO_printf( Perl_debug_log,
e4584336
RB
2686 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2687 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2688 );
2689 }
2690 if ( !accepted ) {
2691 sayNO;
2692 }
2693 }
2694
2695
2696 /*
2697 There was at least one accepting state that we
2698 transitioned through. Presumably the number of accepting
2699 states is going to be low, typically one or two. So we
2700 simply scan through to find the one with lowest wordnum.
2701 Once we find it, we swap the last state into its place
2702 and decrement the size. We then try to match the rest of
2703 the pattern at the point where the word ends, if we
2704 succeed then we end the loop, otherwise the loop
2705 eventually terminates once all of the accepting states
2706 have been tried.
2707 */
2708 TrieAccept:
2709 {
2710 int gotit = 0;
2711
2712 if ( accepted == 1 ) {
2713 DEBUG_EXECUTE_r({
2714 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2715 PerlIO_printf( Perl_debug_log,
2716 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2717 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2718 accept_buff[ 0 ].wordnum,
cfd0369c 2719 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2720 PL_colors[5] );
2721 });
cc601c31 2722 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2723 /* in this case we free tmps/leave before we call regmatch
2724 as we wont be using accept_buff again. */
2725 FREETMPS;
2726 LEAVE;
2727 gotit = regmatch( scan + NEXT_OFF( scan ) );
2728 } else {
2729 DEBUG_EXECUTE_r(
e4584336
RB
2730 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2731 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2732 PL_colors[5] );
2733 );
2734 while ( !gotit && accepted-- ) {
2735 U32 best = 0;
2736 U32 cur;
2737 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2738 DEBUG_TRIE_EXECUTE_r(
2739 PerlIO_printf( Perl_debug_log,
2740 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2741 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2742 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2743 accept_buff[ cur ].wordnum, PL_colors[5] );
2744 );
a3621e74
YO
2745
2746 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2747 best = cur;
2748 }
2749 DEBUG_EXECUTE_r({
6136c704 2750 SV ** const tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2751 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2752 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2753 accept_buff[best].wordnum,
cfd0369c 2754 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2755 PL_colors[5] );
2756 });
2757 if ( best<accepted ) {
2758 reg_trie_accepted tmp = accept_buff[ best ];
2759 accept_buff[ best ] = accept_buff[ accepted ];
2760 accept_buff[ accepted ] = tmp;
2761 best = accepted;
2762 }
cc601c31 2763 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2764
2765 /*
2766 as far as I can tell we only need the SAVETMPS/FREETMPS
2767 for re's with EVAL in them but I'm leaving them in for
2768 all until I can be sure.
2769 */
2770 SAVETMPS;
2771 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2772 FREETMPS;
2773 }
2774 FREETMPS;
2775 LEAVE;
2776 }
2777
2778 if ( gotit ) {
2779 sayYES;
2780 } else {
2781 sayNO;
2782 }
2783 }
2784 /* unreached codepoint */
d6a28714 2785 case EXACT:
cd439c50
IZ
2786 s = STRING(scan);
2787 ln = STR_LEN(scan);
eb160463 2788 if (do_utf8 != UTF) {
bc517b45 2789 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2790 char *l = locinput;
a3b680e6 2791 const char *e = s + ln;
a72c7584 2792
5ff6fc6d
JH
2793 if (do_utf8) {
2794 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2795 while (s < e) {
a3b680e6 2796 STRLEN ulen;
1aa99e6b 2797 if (l >= PL_regeol)
5ff6fc6d
JH
2798 sayNO;
2799 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2800 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2801 uniflags))
5ff6fc6d 2802 sayNO;
bc517b45 2803 l += ulen;
5ff6fc6d 2804 s ++;
1aa99e6b 2805 }
5ff6fc6d
JH
2806 }
2807 else {
2808 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2809 while (s < e) {
a3b680e6 2810 STRLEN ulen;
1aa99e6b
IH
2811 if (l >= PL_regeol)
2812 sayNO;
5ff6fc6d 2813 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2814 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2815 uniflags))
1aa99e6b 2816 sayNO;
bc517b45 2817 s += ulen;
a72c7584 2818 l ++;
1aa99e6b 2819 }
5ff6fc6d 2820 }
1aa99e6b
IH
2821 locinput = l;
2822 nextchr = UCHARAT(locinput);
2823 break;
2824 }
bc517b45 2825 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2826 /* Inline the first character, for speed. */
2827 if (UCHARAT(s) != nextchr)
2828 sayNO;
2829 if (PL_regeol - locinput < ln)
2830 sayNO;
2831 if (ln > 1 && memNE(s, locinput, ln))
2832 sayNO;
2833 locinput += ln;
2834 nextchr = UCHARAT(locinput);
2835 break;
2836 case EXACTFL:
b8c5462f
JH
2837 PL_reg_flags |= RF_tainted;
2838 /* FALL THROUGH */
d6a28714 2839 case EXACTF:
cd439c50
IZ
2840 s = STRING(scan);
2841 ln = STR_LEN(scan);
d6a28714 2842
d07ddd77
JH
2843 if (do_utf8 || UTF) {
2844 /* Either target or the pattern are utf8. */
d6a28714 2845 char *l = locinput;
d07ddd77 2846 char *e = PL_regeol;
bc517b45 2847
eb160463 2848 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2849 l, &e, 0, do_utf8)) {
5486206c
JH
2850 /* One more case for the sharp s:
2851 * pack("U0U*", 0xDF) =~ /ss/i,
2852 * the 0xC3 0x9F are the UTF-8
2853 * byte sequence for the U+00DF. */
2854 if (!(do_utf8 &&
2855 toLOWER(s[0]) == 's' &&
2856 ln >= 2 &&
2857 toLOWER(s[1]) == 's' &&
2858 (U8)l[0] == 0xC3 &&
2859 e - l >= 2 &&
2860 (U8)l[1] == 0x9F))
2861 sayNO;
2862 }
d07ddd77
JH
2863 locinput = e;
2864 nextchr = UCHARAT(locinput);
2865 break;
a0ed51b3 2866 }
d6a28714 2867
bc517b45
JH
2868 /* Neither the target and the pattern are utf8. */
2869
d6a28714
JH
2870 /* Inline the first character, for speed. */
2871 if (UCHARAT(s) != nextchr &&
2872 UCHARAT(s) != ((OP(scan) == EXACTF)
2873 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2874 sayNO;
d6a28714 2875 if (PL_regeol - locinput < ln)
b8c5462f 2876 sayNO;
d6a28714
JH
2877 if (ln > 1 && (OP(scan) == EXACTF
2878 ? ibcmp(s, locinput, ln)
2879 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2880 sayNO;
d6a28714
JH
2881 locinput += ln;
2882 nextchr = UCHARAT(locinput);
a0d0e21e 2883 break;
d6a28714 2884 case ANYOF:
ffc61ed2 2885 if (do_utf8) {
9e55ce06
JH
2886 STRLEN inclasslen = PL_regeol - locinput;
2887
ba7b4546 2888 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2889 sayNO_ANYOF;
ffc61ed2
JH
2890 if (locinput >= PL_regeol)
2891 sayNO;
0f0076b4 2892 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2893 nextchr = UCHARAT(locinput);
e0f9d4a8 2894 break;
ffc61ed2
JH
2895 }
2896 else {
2897 if (nextchr < 0)
2898 nextchr = UCHARAT(locinput);
7d3e948e 2899 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2900 sayNO_ANYOF;
ffc61ed2
JH
2901 if (!nextchr && locinput >= PL_regeol)
2902 sayNO;
2903 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2904 break;
2905 }
2906 no_anyof:
2907 /* If we might have the case of the German sharp s
2908 * in a casefolding Unicode character class. */
2909
ebc501f0
JH
2910 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2911 locinput += SHARP_S_SKIP;
e0f9d4a8 2912 nextchr = UCHARAT(locinput);
ffc61ed2 2913 }
e0f9d4a8
JH
2914 else
2915 sayNO;
b8c5462f 2916 break;
d6a28714 2917 case ALNUML:
b8c5462f
JH
2918 PL_reg_flags |= RF_tainted;
2919 /* FALL THROUGH */
d6a28714 2920 case ALNUM:
b8c5462f 2921 if (!nextchr)
4633a7c4 2922 sayNO;
ffc61ed2 2923 if (do_utf8) {
1a4fad37 2924 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2925 if (!(OP(scan) == ALNUM
3568d838 2926 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2927 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2928 {
2929 sayNO;
a0ed51b3 2930 }
b8c5462f 2931 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2932 nextchr = UCHARAT(locinput);
2933 break;
2934 }
ffc61ed2 2935 if (!(OP(scan) == ALNUM
d6a28714 2936 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2937 sayNO;
b8c5462f 2938 nextchr = UCHARAT(++locinput);
a0d0e21e 2939 break;
d6a28714 2940 case NALNUML:
b8c5462f
JH
2941 PL_reg_flags |= RF_tainted;
2942 /* FALL THROUGH */
d6a28714
JH
2943 case NALNUM:
2944 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2945 sayNO;
ffc61ed2 2946 if (do_utf8) {
1a4fad37 2947 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2948 if (OP(scan) == NALNUM
3568d838 2949 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2950 : isALNUM_LC_utf8((U8*)locinput))
2951 {
b8c5462f 2952 sayNO;
d6a28714 2953 }
b8c5462f
JH
2954 locinput += PL_utf8skip[nextchr];
2955 nextchr = UCHARAT(locinput);
2956 break;
2957 }
ffc61ed2 2958 if (OP(scan) == NALNUM
d6a28714 2959 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2960 sayNO;
76e3520e 2961 nextchr = UCHARAT(++locinput);
a0d0e21e 2962 break;
d6a28714
JH
2963 case BOUNDL:
2964 case NBOUNDL:
3280af22 2965 PL_reg_flags |= RF_tainted;
bbce6d69 2966 /* FALL THROUGH */
d6a28714
JH
2967 case BOUND:
2968 case NBOUND:
2969 /* was last char in word? */
ffc61ed2 2970 if (do_utf8) {
12d33761
HS
2971 if (locinput == PL_bostr)
2972 ln = '\n';
ffc61ed2 2973 else {
a3b680e6 2974 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2975
b4f7163a 2976 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2977 }
2978 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2979 ln = isALNUM_uni(ln);
1a4fad37 2980 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2981 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2982 }
2983 else {
9041c2e3 2984 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2985 n = isALNUM_LC_utf8((U8*)locinput);
2986 }
a0ed51b3 2987 }
d6a28714 2988 else {
12d33761
HS
2989 ln = (locinput != PL_bostr) ?
2990 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2991 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2992 ln = isALNUM(ln);
2993 n = isALNUM(nextchr);
2994 }
2995 else {
2996 ln = isALNUM_LC(ln);
2997 n = isALNUM_LC(nextchr);
2998 }
d6a28714 2999 }
ffc61ed2
JH
3000 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3001 OP(scan) == BOUNDL))
3002 sayNO;
a0ed51b3 3003 break;
d6a28714 3004 case SPACEL:
3280af22 3005 PL_reg_flags |= RF_tainted;
bbce6d69 3006 /* FALL THROUGH */
d6a28714 3007 case SPACE:
9442cb0e 3008 if (!nextchr)
4633a7c4 3009 sayNO;
1aa99e6b 3010 if (do_utf8) {
fd400ab9 3011 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3012 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3013 if (!(OP(scan) == SPACE
3568d838 3014 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3015 : isSPACE_LC_utf8((U8*)locinput)))
3016 {
3017 sayNO;
3018 }
3019 locinput += PL_utf8skip[nextchr];
3020 nextchr = UCHARAT(locinput);
3021 break;
d6a28714 3022 }
ffc61ed2
JH
3023 if (!(OP(scan) == SPACE
3024 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3025 sayNO;
3026 nextchr = UCHARAT(++locinput);
3027 }
3028 else {
3029 if (!(OP(scan) == SPACE
3030 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3031 sayNO;
3032 nextchr = UCHARAT(++locinput);
a0ed51b3 3033 }
a0ed51b3 3034 break;
d6a28714 3035 case NSPACEL:
3280af22 3036 PL_reg_flags |= RF_tainted;
bbce6d69 3037 /* FALL THROUGH */
d6a28714 3038 case NSPACE:
9442cb0e 3039 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3040 sayNO;
1aa99e6b 3041 if (do_utf8) {
1a4fad37 3042 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3043 if (OP(scan) == NSPACE
3568d838 3044 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3045 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3046 {
3047 sayNO;
3048 }
3049 locinput += PL_utf8skip[nextchr];
3050 nextchr = UCHARAT(locinput);
3051 break;
a0ed51b3 3052 }
ffc61ed2 3053 if (OP(scan) == NSPACE
d6a28714 3054 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3055 sayNO;
76e3520e 3056 nextchr = UCHARAT(++locinput);
a0d0e21e 3057 break;
d6a28714 3058 case DIGITL:
a0ed51b3
LW
3059 PL_reg_flags |= RF_tainted;
3060 /* FALL THROUGH */
d6a28714 3061 case DIGIT:
9442cb0e 3062 if (!nextchr)
a0ed51b3 3063 sayNO;
1aa99e6b 3064 if (do_utf8) {
1a4fad37 3065 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3066 if (!(OP(scan) == DIGIT
3568d838 3067 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3068 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3069 {
a0ed51b3 3070 sayNO;
dfe13c55 3071 }
6f06b55f 3072 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3073 nextchr = UCHARAT(locinput);
3074 break;
3075 }
ffc61ed2 3076 if (!(OP(scan) == DIGIT
9442cb0e 3077 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3078 sayNO;
3079 nextchr = UCHARAT(++locinput);
3080 break;
d6a28714 3081 case NDIGITL:
b8c5462f
JH
3082 PL_reg_flags |= RF_tainted;
3083 /* FALL THROUGH */
d6a28714 3084 case NDIGIT:
9442cb0e 3085 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3086 sayNO;
1aa99e6b 3087 if (do_utf8) {
1a4fad37 3088 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3089 if (OP(scan) == NDIGIT
3568d838 3090 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3091 : isDIGIT_LC_utf8((U8*)locinput))
3092 {
a0ed51b3 3093 sayNO;
9442cb0e 3094 }
6f06b55f 3095 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3096 nextchr = UCHARAT(locinput);
3097 break;
3098 }
ffc61ed2 3099 if (OP(scan) == NDIGIT
9442cb0e 3100 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3101 sayNO;
3102 nextchr = UCHARAT(++locinput);
3103 break;
3104 case CLUMP:
b7c83a7e 3105 if (locinput >= PL_regeol)
a0ed51b3 3106 sayNO;
b7c83a7e 3107 if (do_utf8) {
1a4fad37 3108 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3109 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3110 sayNO;
3111 locinput += PL_utf8skip[nextchr];
3112 while (locinput < PL_regeol &&
3113 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3114 locinput += UTF8SKIP(locinput);
3115 if (locinput > PL_regeol)
3116 sayNO;
eb08e2da
JH
3117 }
3118 else
3119 locinput++;
a0ed51b3
LW
3120 nextchr = UCHARAT(locinput);
3121 break;
c8756f30 3122 case REFFL:
3280af22 3123 PL_reg_flags |= RF_tainted;
c8756f30 3124 /* FALL THROUGH */
c277df42 3125 case REF:
c8756f30 3126 case REFF:
c277df42 3127 n = ARG(scan); /* which paren pair */
cf93c79d 3128 ln = PL_regstartp[n];
2c2d71f5 3129 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3130 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3131 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3132 if (ln == PL_regendp[n])
a0d0e21e 3133 break;
a0ed51b3 3134
cf93c79d 3135 s = PL_bostr + ln;
1aa99e6b 3136 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3137 char *l = locinput;
a3b680e6 3138 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3139 /*
3140 * Note that we can't do the "other character" lookup trick as
3141 * in the 8-bit case (no pun intended) because in Unicode we
3142 * have to map both upper and title case to lower case.
3143 */
3144 if (OP(scan) == REFF) {
3145 while (s < e) {
a3b680e6
AL
3146 STRLEN ulen1, ulen2;
3147 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3148 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3149
a0ed51b3
LW
3150 if (l >= PL_regeol)
3151 sayNO;
a2a2844f
JH
3152 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3153 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3154 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3155 sayNO;
a2a2844f
JH
3156 s += ulen1;
3157 l += ulen2;
a0ed51b3
LW
3158 }
3159 }
3160 locinput = l;
3161 nextchr = UCHARAT(locinput);
3162 break;
3163 }
3164
a0d0e21e 3165 /* Inline the first character, for speed. */
76e3520e 3166 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3167 (OP(scan) == REF ||
3168 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3169 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3170 sayNO;
cf93c79d 3171 ln = PL_regendp[n] - ln;
3280af22 3172 if (locinput + ln > PL_regeol)
4633a7c4 3173 sayNO;
c8756f30
AK
3174 if (ln > 1 && (OP(scan) == REF
3175 ? memNE(s, locinput, ln)
3176 : (OP(scan) == REFF
3177 ? ibcmp(s, locinput, ln)
3178 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3179 sayNO;
a0d0e21e 3180 locinput += ln;
76e3520e 3181 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3182 break;
3183
3184 case NOTHING:
c277df42 3185 case TAIL:
a0d0e21e
LW
3186 break;
3187 case BACK:
3188 break;
c277df42
IZ
3189 case EVAL:
3190 {
3191 dSP;
6136c704
AL
3192 OP_4tree * const oop = PL_op;
3193 COP * const ocurcop = PL_curcop;
f3548bdc 3194 PAD *old_comppad;
c277df42 3195 SV *ret;
6136c704 3196 struct regexp * const oreg = PL_reg_re;
9041c2e3 3197
c277df42 3198 n = ARG(scan);
533c011a 3199 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3200 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3201 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3202 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3203
8e5e9ebe 3204 {
6136c704 3205 SV ** const before = SP;
8e5e9ebe
RGS
3206 CALLRUNOPS(aTHX); /* Scalar context. */
3207 SPAGAIN;
3208 if (SP == before)
075aa684 3209 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3210 else {
3211 ret = POPs;
3212 PUTBACK;
3213 }
3214 }
3215
0f5d15d6 3216 PL_op = oop;
f3548bdc 3217 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3218 PL_curcop = ocurcop;
c277df42 3219 if (logical) {
0f5d15d6
IZ
3220 if (logical == 2) { /* Postponed subexpression. */
3221 regexp *re;
6136c704 3222 MAGIC *mg = NULL;
0f5d15d6 3223 re_cc_state state;
0f5d15d6 3224 CHECKPOINT cp, lastcp;
cb50f42d 3225 int toggleutf;
faf82a0b 3226 register SV *sv;
0f5d15d6 3227
faf82a0b
AE
3228 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3229 mg = mg_find(sv, PERL_MAGIC_qr);
3230 else if (SvSMAGICAL(ret)) {
3231 if (SvGMAGICAL(ret))
3232 sv_unmagic(ret, PERL_MAGIC_qr);
3233 else
3234 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3235 }
faf82a0b 3236
0f5d15d6
IZ
3237 if (mg) {
3238 re = (regexp *)mg->mg_obj;
df0003d4 3239 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3240 }
3241 else {
3242 STRLEN len;
6136c704 3243 const char * const t = SvPV_const(ret, len);
0f5d15d6 3244 PMOP pm;
a3b680e6
AL
3245 char * const oprecomp = PL_regprecomp;
3246 const I32 osize = PL_regsize;
3247 const I32 onpar = PL_regnpar;
0f5d15d6 3248
5fcd1c1b 3249 Zero(&pm, 1, PMOP);
cb50f42d 3250 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3251 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3252 if (!(SvFLAGS(ret)
faf82a0b
AE
3253 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3254 | SVs_GMG)))
14befaf4
DM
3255 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3256 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3257 PL_regprecomp = oprecomp;
3258 PL_regsize = osize;
3259 PL_regnpar = onpar;
3260 }
a3621e74 3261 DEBUG_EXECUTE_r(
9041c2e3 3262 PerlIO_printf(Perl_debug_log,
a0288114 3263 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3264 PL_colors[0],
3265 re->precomp,
3266 PL_colors[1],
3267 (strlen(re->precomp) > 60 ? "..." : ""))
3268 );
3269 state.node = next;
3270 state.prev = PL_reg_call_cc;
3271 state.cc = PL_regcc;
3272 state.re = PL_reg_re;
3273
2ab05381 3274 PL_regcc = 0;
9041c2e3 3275
0f5d15d6 3276 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3277 REGCP_SET(lastcp);
0f5d15d6
IZ
3278 cache_re(re);
3279 state.ss = PL_savestack_ix;
3280 *PL_reglastparen = 0;
a01268b5 3281 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3282 PL_reg_call_cc = &state;
3283 PL_reginput = locinput;
cb50f42d
YST
3284 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3285 ((re->reganch & ROPT_UTF8) != 0);
3286 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3287
3288 /* XXXX This is too dramatic a measure... */
3289 PL_reg_maxiter = 0;
3290
0f5d15d6 3291 if (regmatch(re->program + 1)) {
2c914db6
IZ
3292 /* Even though we succeeded, we need to restore
3293 global variables, since we may be wrapped inside
3294 SUSPEND, thus the match may be not finished yet. */
3295
3296 /* XXXX Do this only if SUSPENDed? */
3297 PL_reg_call_cc = state.prev;
3298 PL_regcc = state.cc;
3299 PL_reg_re = state.re;
3300 cache_re(PL_reg_re);
cb50f42d 3301 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3302
3303 /* XXXX This is too dramatic a measure... */
3304 PL_reg_maxiter = 0;
3305
3306 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3307 ReREFCNT_dec(re);
3308 regcpblow(cp);
3309 sayYES;
3310 }
0f5d15d6 3311 ReREFCNT_dec(re);
02db2b7b 3312 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3313 regcppop();
3314 PL_reg_call_cc = state.prev;
3315 PL_regcc = state.cc;
3316 PL_reg_re = state.re;
d3790889 3317 cache_re(PL_reg_re);
cb50f42d 3318 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3319
3320 /* XXXX This is too dramatic a measure... */
3321 PL_reg_maxiter = 0;
3322
8e514ae6 3323 logical = 0;
0f5d15d6
IZ
3324 sayNO;
3325 }
c277df42 3326 sw = SvTRUE(ret);
0f5d15d6 3327 logical = 0;
a0ed51b3 3328 }
080c2dec 3329 else {
3280af22 3330 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3331 cache_re(oreg);
3332 }
c277df42
IZ
3333 break;
3334 }
a0d0e21e 3335 case OPEN:
c277df42 3336 n = ARG(scan); /* which paren pair */
3280af22
NIS
3337 PL_reg_start_tmp[n] = locinput;
3338 if (n > PL_regsize)
3339 PL_regsize = n;
a0d0e21e
LW
3340 break;
3341 case CLOSE:
c277df42 3342 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3343 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3344 PL_regendp[n] = locinput - PL_bostr;
eb160463 3345 if (n > (I32)*PL_reglastparen)
3280af22 3346 *PL_reglastparen = n;
a01268b5 3347 *PL_reglastcloseparen = n;
a0d0e21e 3348 break;
c277df42
IZ
3349 case GROUPP:
3350 n = ARG(scan); /* which paren pair */
eb160463 3351 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3352 break;
3353 case IFTHEN:
2c2d71f5 3354 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3355 if (sw)
3356 next = NEXTOPER(NEXTOPER(scan));
3357 else {
3358 next = scan + ARG(scan);
3359 if (OP(next) == IFTHEN) /* Fake one. */
3360 next = NEXTOPER(NEXTOPER(next));
3361 }
3362 break;
3363 case LOGICAL:
0f5d15d6 3364 logical = scan->flags;
c277df42 3365 break;
2ab05381
IZ
3366/*******************************************************************
3367 PL_regcc contains infoblock about the innermost (...)* loop, and
3368 a pointer to the next outer infoblock.
3369
3370 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3371
3372 1) After matching X, regnode for CURLYX is processed;
3373
9041c2e3 3374 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3375 regmatch() recursively with the starting point at WHILEM node;
3376
3377 3) Each hit of WHILEM node tries to match A and Z (in the order
3378 depending on the current iteration, min/max of {min,max} and
3379 greediness). The information about where are nodes for "A"
3380 and "Z" is read from the infoblock, as is info on how many times "A"
3381 was already matched, and greediness.
3382
3383 4) After A matches, the same WHILEM node is hit again.
3384
3385 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3386 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3387 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3388 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3389 of the external loop.
3390
3391 Currently present infoblocks form a tree with a stem formed by PL_curcc
3392 and whatever it mentions via ->next, and additional attached trees
3393 corresponding to temporarily unset infoblocks as in "5" above.
3394
9041c2e3 3395 In the following picture infoblocks for outer loop of
2ab05381
IZ
3396 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3397 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3398 infoblocks are drawn below the "reset" infoblock.
3399
3400 In fact in the picture below we do not show failed matches for Z and T
3401 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3402 more obvious *why* one needs to *temporary* unset infoblocks.]
3403
3404 Matched REx position InfoBlocks Comment
3405 (Y(A)*?Z)*?T x
3406 Y(A)*?Z)*?T x <- O
3407 Y (A)*?Z)*?T x <- O
3408 Y A)*?Z)*?T x <- O <- I
3409 YA )*?Z)*?T x <- O <- I
3410 YA A)*?Z)*?T x <- O <- I
3411 YAA )*?Z)*?T x <- O <- I
3412 YAA Z)*?T x <- O # Temporary unset I
3413 I
3414
3415 YAAZ Y(A)*?Z)*?T x <- O
3416 I
3417
3418 YAAZY (A)*?Z)*?T x <- O
3419 I
3420
3421 YAAZY A)*?Z)*?T x <- O <- I
3422 I
3423
3424 YAAZYA )*?Z)*?T x <- O <- I
3425 I
3426
3427 YAAZYA Z)*?T x <- O # Temporary unset I
3428 I,I
3429
3430 YAAZYAZ )*?T x <- O
3431 I,I
3432
3433 YAAZYAZ T x # Temporary unset O
3434 O
3435 I,I
3436
3437 YAAZYAZT x
3438 O
3439 I,I
3440 *******************************************************************/
a0d0e21e
LW
3441 case CURLYX: {
3442 CURCUR cc;
3280af22 3443 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3444 /* No need to save/restore up to this paren */
3445 I32 parenfloor = scan->flags;
c277df42
IZ
3446
3447 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3448 next += ARG(next);
3280af22
NIS
3449 cc.oldcc = PL_regcc;
3450 PL_regcc = &cc;
cb434fcc
IZ
3451 /* XXXX Probably it is better to teach regpush to support
3452 parenfloor > PL_regsize... */
eb160463 3453 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3454 parenfloor = *PL_reglastparen; /* Pessimization... */
3455 cc.parenfloor = parenfloor;
a0d0e21e
LW
3456 cc.cur = -1;
3457 cc.min = ARG1(scan);
3458 cc.max = ARG2(scan);
c277df42 3459 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3460 cc.next = next;
3461 cc.minmod = minmod;
3462 cc.lastloc = 0;
3280af22 3463 PL_reginput = locinput;
a0d0e21e
LW
3464 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3465 regcpblow(cp);
3280af22 3466 PL_regcc = cc.oldcc;
4633a7c4 3467 saySAME(n);
a0d0e21e
LW
3468 }
3469 /* NOT REACHED */
3470 case WHILEM: {
3471 /*
3472 * This is really hard to understand, because after we match
3473 * what we're trying to match, we must make sure the rest of
2c2d71f5 3474 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3475 * to go back UP the parse tree by recursing ever deeper. And
3476 * if it fails, we have to reset our parent's current state
3477 * that we can try again after backing off.
3478 */
3479
c277df42 3480 CHECKPOINT cp, lastcp;
3280af22 3481 CURCUR* cc = PL_regcc;
6136c704 3482 char * const lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3483 I32 cache_offset = 0, cache_bit = 0;
c277df42 3484
4633a7c4 3485 n = cc->cur + 1; /* how many we know we matched */
3280af22 3486 PL_reginput = locinput;
a0d0e21e 3487
a3621e74 3488 DEBUG_EXECUTE_r(
9041c2e3 3489 PerlIO_printf(Perl_debug_log,
91f3b821 3490 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3491 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3492 (long)n, (long)cc->min,
2797576d 3493 (long)cc->max, PTR2UV(cc))
c277df42 3494 );
4633a7c4 3495
a0d0e21e
LW
3496 /* If degenerate scan matches "", assume scan done. */
3497
579cf2c3 3498 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3499 PL_regcc = cc->oldcc;
2ab05381
IZ
3500 if (PL_regcc)
3501 ln = PL_regcc->cur;
a3621e74 3502 DEBUG_EXECUTE_r(
c3464db5
DD
3503 PerlIO_printf(Perl_debug_log,
3504 "%*s empty match detected, try continuation...\n",
3280af22 3505 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3506 );
a0d0e21e 3507 if (regmatch(cc->next))
4633a7c4 3508 sayYES;
2ab05381
IZ
3509 if (PL_regcc)
3510 PL_regcc->cur = ln;
3280af22 3511 PL_regcc = cc;
4633a7c4 3512 sayNO;
a0d0e21e
LW
3513 }
3514
3515 /* First just match a string of min scans. */
3516
3517 if (n < cc->min) {
3518 cc->cur = n;
3519 cc->lastloc = locinput;
4633a7c4
LW
3520 if (regmatch(cc->scan))
3521 sayYES;
3522 cc->cur = n - 1;
c277df42 3523 cc->lastloc = lastloc;
4633a7c4 3524 sayNO;
a0d0e21e
LW
3525 }
3526
2c2d71f5
JH
3527 if (scan->flags) {
3528 /* Check whether we already were at this position.
3529 Postpone detection until we know the match is not
3530 *that* much linear. */
3531 if (!PL_reg_maxiter) {
3532 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3533 PL_reg_leftiter = PL_reg_maxiter;
3534 }
3535 if (PL_reg_leftiter-- == 0) {
a3b680e6 3536 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3537 if (PL_reg_poscache) {
eb160463 3538 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3539 Renew(PL_reg_poscache, size, char);
3540 PL_reg_poscache_size = size;
3541 }
3542 Zero(PL_reg_poscache, size, char);
3543 }
3544 else {
3545 PL_reg_poscache_size = size;
a02a5408 3546 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3547 }
a3621e74 3548 DEBUG_EXECUTE_r(
2c2d71f5
JH
3549 PerlIO_printf(Perl_debug_log,
3550 "%sDetected a super-linear match, switching on caching%s...\n",
3551 PL_colors[4], PL_colors[5])
3552 );
3553 }
3554 if (PL_reg_leftiter < 0) {
3ab3c9b4 3555 cache_offset = locinput - PL_bostr;
2c2d71f5 3556
3ab3c9b4
HS
3557 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3558 + cache_offset * (scan->flags>>4);
3559 cache_bit = cache_offset % 8;
3560 cache_offset /= 8;
3561 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
a3621e74 3562 DEBUG_EXECUTE_r(
2c2d71f5
JH
3563 PerlIO_printf(Perl_debug_log,
3564 "%*s already tried at this position...\n",
3565 REPORT_CODE_OFF+PL_regindent*2, "")
3566 );
3ab3c9b4
HS
3567 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3568 /* cache records success */
c2b0868c
HS
3569 sayYES;
3570 else
3ab3c9b4 3571 /* cache records failure */
c2b0868c 3572 sayNO_SILENT;
2c2d71f5 3573 }
3ab3c9b4 3574 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
2c2d71f5
JH
3575 }
3576 }
3577
a0d0e21e
LW
3578 /* Prefer next over scan for minimal matching. */
3579
3580 if (cc->minmod) {
3280af22 3581 PL_regcc = cc->oldcc;
2ab05381
IZ
3582 if (PL_regcc)
3583 ln = PL_regcc->cur;
5f05dabc 3584 cp = regcppush(cc->parenfloor);
02db2b7b 3585 REGCP_SET(lastcp);
5f05dabc 3586 if (regmatch(cc->next)) {
c277df42 3587 regcpblow(cp);
3ab3c9b4 3588 CACHEsayYES; /* All done. */
5f05dabc 3589 }
02db2b7b 3590 REGCP_UNWIND(lastcp);
5f05dabc 3591 regcppop();
2ab05381
IZ
3592 if (PL_regcc)
3593 PL_regcc->cur = ln;
3280af22 3594 PL_regcc = cc;
a0d0e21e 3595
c277df42 3596 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3597 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3598 && !(PL_reg_flags & RF_warned)) {
3599 PL_reg_flags |= RF_warned;
9014280d 3600 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3601 "Complex regular subexpression recursion",
3602 REG_INFTY - 1);
c277df42 3603 }
3ab3c9b4 3604 CACHEsayNO;
c277df42 3605 }
a687059c 3606
a3621e74 3607 DEBUG_EXECUTE_r(
c3464db5
DD
3608 PerlIO_printf(Perl_debug_log,
3609 "%*s trying longer...\n",
3280af22 3610 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3611 );
a0d0e21e 3612 /* Try scanning more and see if it helps. */
3280af22 3613 PL_reginput = locinput;
a0d0e21e
LW
3614 cc->cur = n;
3615 cc->lastloc = locinput;
5f05dabc 3616 cp = regcppush(cc->parenfloor);
02db2b7b 3617 REGCP_SET(lastcp);
5f05dabc 3618 if (regmatch(cc->scan)) {
c277df42 3619 regcpblow(cp);
3ab3c9b4 3620 CACHEsayYES;
5f05dabc 3621 }
02db2b7b 3622 REGCP_UNWIND(lastcp);
5f05dabc 3623 regcppop();
4633a7c4 3624 cc->cur = n - 1;
c277df42 3625 cc->lastloc = lastloc;
3ab3c9b4 3626 CACHEsayNO;
a0d0e21e
LW
3627 }
3628
3629 /* Prefer scan over next for maximal matching. */
3630
3631 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3632 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3633 cc->cur = n;
3634 cc->lastloc = locinput;
02db2b7b 3635 REGCP_SET(lastcp);
5f05dabc 3636 if (regmatch(cc->scan)) {
c277df42 3637 regcpblow(cp);
3ab3c9b4 3638 CACHEsayYES;
5f05dabc 3639 }
02db2b7b 3640 REGCP_UNWIND(lastcp);
a0d0e21e 3641 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3642 PL_reginput = locinput;
a3621e74 3643 DEBUG_EXECUTE_r(
c3464db5
DD
3644 PerlIO_printf(Perl_debug_log,
3645 "%*s failed, try continuation...\n",
3280af22 3646 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3647 );
3648 }
9041c2e3 3649 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3650 && !(PL_reg_flags & RF_warned)) {
3280af22 3651 PL_reg_flags |= RF_warned;
9014280d 3652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3653 "Complex regular subexpression recursion",
3654 REG_INFTY - 1);
a0d0e21e
LW
3655 }
3656
3657 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3658 PL_regcc = cc->oldcc;
2ab05381
IZ
3659 if (PL_regcc)
3660 ln = PL_regcc->cur;
a0d0e21e 3661 if (regmatch(cc->next))
3ab3c9b4 3662 CACHEsayYES;
2ab05381
IZ
3663 if (PL_regcc)
3664 PL_regcc->cur = ln;
3280af22 3665 PL_regcc = cc;
4633a7c4 3666 cc->cur = n - 1;
c277df42 3667 cc->lastloc = lastloc;
3ab3c9b4 3668 CACHEsayNO;
a0d0e21e
LW
3669 }
3670 /* NOT REACHED */
9041c2e3 3671 case BRANCHJ:
c277df42
IZ
3672 next = scan + ARG(scan);
3673 if (next == scan)
3674 next = NULL;
3675 inner = NEXTOPER(NEXTOPER(scan));
3676 goto do_branch;
9041c2e3 3677 case BRANCH:
c277df42
IZ
3678 inner = NEXTOPER(scan);
3679 do_branch:
3680 {
c277df42
IZ
3681 c1 = OP(scan);
3682 if (OP(next) != c1) /* No choice. */
3683 next = inner; /* Avoid recursion. */
a0d0e21e 3684 else {
a3b680e6 3685 const I32 lastparen = *PL_reglastparen;
02db2b7b 3686 /* Put unwinding data on stack */
6136c704
AL
3687 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3688 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3689
02db2b7b
IZ
3690 uw->prev = unwind;
3691 unwind = unwind1;
3692 uw->type = ((c1 == BRANCH)
3693 ? RE_UNWIND_BRANCH
3694 : RE_UNWIND_BRANCHJ);
3695 uw->lastparen = lastparen;
3696 uw->next = next;
3697 uw->locinput = locinput;
3698 uw->nextchr = nextchr;
3699#ifdef DEBUGGING
3700 uw->regindent = ++PL_regindent;
3701#endif
c277df42 3702
02db2b7b
IZ
3703 REGCP_SET(uw->lastcp);
3704
3705 /* Now go into the first branch */
3706 next = inner;
a687059c 3707 }
a0d0e21e
LW
3708 }
3709 break;
3710 case MINMOD:
3711 minmod = 1;
3712 break;
c277df42
IZ
3713 case CURLYM:
3714 {
00db4c45 3715 I32 l = 0;
c277df42 3716 CHECKPOINT lastcp;
9041c2e3 3717
c277df42 3718 /* We suppose that the next guy does not need
0e788c72 3719 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3720 and has no parenths to influence future backrefs. */
3721 ln = ARG1(scan); /* min to match */
3722 n = ARG2(scan); /* max to match */
c277df42
IZ
3723 paren = scan->flags;
3724 if (paren) {
3280af22
NIS
3725 if (paren > PL_regsize)
3726 PL_regsize = paren;
eb160463 3727 if (paren > (I32)*PL_reglastparen)
3280af22 3728 *PL_reglastparen = paren;
c277df42 3729 }
dc45a647 3730 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3731 if (paren)
3732 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3733 PL_reginput = locinput;
c277df42
IZ
3734 if (minmod) {
3735 minmod = 0;
3736 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3737 sayNO;
3280af22 3738 locinput = PL_reginput;
cca55fe3 3739 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3740 regnode *text_node = next;
3741
cca55fe3 3742 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3743
cca55fe3 3744 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3745 else {
cca55fe3 3746 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3747 c1 = c2 = -1000;
3748 goto assume_ok_MM;
cca55fe3
JP
3749 }
3750 else { c1 = (U8)*STRING(text_node); }
af5decee 3751 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3752 c2 = PL_fold[c1];
af5decee 3753 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3754 c2 = PL_fold_locale[c1];
3755 else
3756 c2 = c1;
3757 }
a0ed51b3
LW
3758 }
3759 else
c277df42 3760 c1 = c2 = -1000;
cca55fe3 3761 assume_ok_MM:
02db2b7b 3762 REGCP_SET(lastcp);
0e788c72 3763 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3764 /* If it could work, try it. */
3765 if (c1 == -1000 ||
3280af22
NIS
3766 UCHARAT(PL_reginput) == c1 ||
3767 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3768 {
3769 if (paren) {
f31a99c8 3770 if (ln) {
cf93c79d
IZ
3771 PL_regstartp[paren] =
3772 HOPc(PL_reginput, -l) - PL_bostr;
3773 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3774 }
3775 else
cf93c79d 3776 PL_regendp[paren] = -1;
c277df42
IZ
3777 }
3778 if (regmatch(next))
3779 sayYES;
02db2b7b 3780 REGCP_UNWIND(lastcp);
c277df42
IZ
3781 }
3782 /* Couldn't or didn't -- move forward. */
3280af22 3783 PL_reginput = locinput;
c277df42
IZ
3784 if (regrepeat_hard(scan, 1, &l)) {
3785 ln++;
3280af22 3786 locinput = PL_reginput;
c277df42
IZ
3787 }
3788 else
3789 sayNO;
3790 }
a0ed51b3
LW
3791 }
3792 else {
c277df42 3793 n = regrepeat_hard(scan, n, &l);
3280af22 3794 locinput = PL_reginput;
a3621e74 3795 DEBUG_EXECUTE_r(
5c0ca799 3796 PerlIO_printf(Perl_debug_log,
faccc32b 3797 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3798 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3799 (IV) n, (IV)l)
c277df42
IZ
3800 );
3801 if (n >= ln) {
cca55fe3 3802 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3803 regnode *text_node = next;
3804
cca55fe3 3805 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3806
cca55fe3 3807 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3808 else {
cca55fe3 3809 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3810 c1 = c2 = -1000;
3811 goto assume_ok_REG;
cca55fe3
JP
3812 }
3813 else { c1 = (U8)*STRING(text_node); }
3814
af5decee 3815 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3816 c2 = PL_fold[c1];
af5decee 3817 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3818 c2 = PL_fold_locale[c1];
3819 else
3820 c2 = c1;
3821 }
a0ed51b3
LW
3822 }
3823 else
c277df42
IZ
3824 c1 = c2 = -1000;
3825 }
cca55fe3 3826 assume_ok_REG:
02db2b7b 3827 REGCP_SET(lastcp);
c277df42
IZ
3828 while (n >= ln) {
3829 /* If it could work, try it. */
3830 if (c1 == -1000 ||
3280af22
NIS
3831 UCHARAT(PL_reginput) == c1 ||
3832 UCHARAT(PL_reginput) == c2)
a0ed51b3 3833 {
a3621e74 3834 DEBUG_EXECUTE_r(
c3464db5 3835 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3836 "%*s trying tail with n=%"IVdf"...\n",
3837 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3838 );
3839 if (paren) {
3840 if (n) {
cf93c79d
IZ
3841 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3842 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3843 }
a0ed51b3 3844 else
cf93c79d 3845 PL_regendp[paren] = -1;
c277df42 3846 }
a0ed51b3
LW
3847 if (regmatch(next))
3848 sayYES;
02db2b7b 3849 REGCP_UNWIND(lastcp);
a0ed51b3 3850 }
c277df42
IZ
3851 /* Couldn't or didn't -- back up. */
3852 n--;
dfe13c55 3853 locinput = HOPc(locinput, -l);
3280af22 3854 PL_reginput = locinput;
c277df42
IZ
3855 }
3856 }
3857 sayNO;
3858 break;
3859 }
3860 case CURLYN:
3861 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3862 if (paren > PL_regsize)
3863 PL_regsize = paren;
eb160463 3864 if (paren > (I32)*PL_reglastparen)
3280af22 3865 *PL_reglastparen = paren;
c277df42
IZ
3866 ln = ARG1(scan); /* min to match */
3867 n = ARG2(scan); /* max to match */
dc45a647 3868 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3869 goto repeat;
a0d0e21e 3870 case CURLY:
c277df42 3871 paren = 0;
a0d0e21e
LW
3872 ln = ARG1(scan); /* min to match */
3873 n = ARG2(scan); /* max to match */
dc45a647 3874 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3875 goto repeat;
3876 case STAR:
3877 ln = 0;
c277df42 3878 n = REG_INFTY;
a0d0e21e 3879 scan = NEXTOPER(scan);
c277df42 3880 paren = 0;
a0d0e21e
LW
3881 goto repeat;
3882 case PLUS:
c277df42
IZ
3883 ln = 1;
3884 n = REG_INFTY;
3885 scan = NEXTOPER(scan);
3886 paren = 0;
3887 repeat:
a0d0e21e
LW
3888 /*
3889 * Lookahead to avoid useless match attempts
3890 * when we know what character comes next.
3891 */
5f80c4cf
JP
3892
3893 /*
3894 * Used to only do .*x and .*?x, but now it allows
3895 * for )'s, ('s and (?{ ... })'s to be in the way
3896 * of the quantifier and the EXACT-like node. -- japhy
3897 */
3898
cca55fe3 3899 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3900 U8 *s;
3901 regnode *text_node = next;
3902
cca55fe3 3903 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3904
cca55fe3 3905 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3906 else {
cca55fe3 3907 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3908 c1 = c2 = -1000;
3909 goto assume_ok_easy;
cca55fe3
JP
3910 }
3911 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3912
3913 if (!UTF) {
3914 c2 = c1 = *s;
f65d3ee7 3915 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3916 c2 = PL_fold[c1];
f65d3ee7 3917 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3918 c2 = PL_fold_locale[c1];
1aa99e6b 3919 }
5f80c4cf 3920 else { /* UTF */
f65d3ee7 3921 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3922 STRLEN ulen1, ulen2;
89ebb4a3
JH
3923 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3924 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3925
3926 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3927 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3928
89ebb4a3 3929 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 3930 uniflags);
89ebb4a3 3931 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 3932 uniflags);
5f80c4cf
JP
3933 }
3934 else {
89ebb4a3 3935 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 3936 uniflags);
5f80c4cf 3937 }
1aa99e6b
IH
3938 }
3939 }
bbce6d69 3940 }
a0d0e21e 3941 else
bbce6d69 3942 c1 = c2 = -1000;
cca55fe3 3943 assume_ok_easy:
3280af22 3944 PL_reginput = locinput;
a0d0e21e 3945 if (minmod) {
c277df42 3946 CHECKPOINT lastcp;
a0d0e21e
LW
3947 minmod = 0;
3948 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3949 sayNO;
a0ed51b3 3950 locinput = PL_reginput;
02db2b7b 3951 REGCP_SET(lastcp);
0fe9bf95 3952 if (c1 != -1000) {
1aa99e6b 3953 char *e; /* Should not check after this */
0fe9bf95 3954 char *old = locinput;
b2f2f093 3955 int count = 0;
0fe9bf95 3956
1aa99e6b 3957 if (n == REG_INFTY) {
0fe9bf95 3958 e = PL_regeol - 1;
1aa99e6b
IH
3959 if (do_utf8)
3960 while (UTF8_IS_CONTINUATION(*(U8*)e))
3961 e--;
3962 }
3963 else if (do_utf8) {
3964 int m = n - ln;
3965 for (e = locinput;
3966 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3967 e += UTF8SKIP(e);
3968 }
3969 else {
3970 e = locinput + n - ln;
3971 if (e >= PL_regeol)
3972 e = PL_regeol - 1;
3973 }
0fe9bf95
IZ
3974 while (1) {
3975 /* Find place 'next' could work */
1aa99e6b
IH
3976 if (!do_utf8) {
3977 if (c1 == c2) {
a8e8ab15
JH
3978 while (locinput <= e &&
3979 UCHARAT(locinput) != c1)
1aa99e6b
IH
3980 locinput++;
3981 } else {
9041c2e3 3982 while (locinput <= e
a8e8ab15
JH
3983 && UCHARAT(locinput) != c1
3984 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3985 locinput++;
3986 }
3987 count = locinput - old;
3988 }
3989 else {
1aa99e6b 3990 if (c1 == c2) {
a3b680e6 3991 STRLEN len;
872c91ae
JH
3992 /* count initialised to
3993 * utf8_distance(old, locinput) */
b2f2f093 3994 while (locinput <= e &&
872c91ae 3995 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 3996 UTF8_MAXBYTES, &len,
041457d9 3997 uniflags) != (UV)c1) {
1aa99e6b 3998 locinput += len;
b2f2f093
JH
3999 count++;
4000 }
1aa99e6b 4001 } else {
a3b680e6 4002 STRLEN len;
872c91ae
JH
4003 /* count initialised to
4004 * utf8_distance(old, locinput) */
b2f2f093 4005 while (locinput <= e) {
872c91ae 4006 UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4007 UTF8_MAXBYTES, &len,
041457d9 4008 uniflags);
eb160463 4009 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 4010 break;
b2f2f093
JH
4011 locinput += len;
4012 count++;
1aa99e6b
IH
4013 }
4014 }
0fe9bf95 4015 }
9041c2e3 4016 if (locinput > e)
0fe9bf95
IZ
4017 sayNO;
4018 /* PL_reginput == old now */
4019 if (locinput != old) {
4020 ln = 1; /* Did some */
1aa99e6b 4021 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
4022 sayNO;
4023 }
4024 /* PL_reginput == locinput now */
29d1e993 4025 TRYPAREN(paren, ln, locinput);
0fe9bf95 4026 PL_reginput = locinput; /* Could be reset... */
02db2b7b 4027 REGCP_UNWIND(lastcp);
0fe9bf95 4028 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
4029 old = locinput;
4030 if (do_utf8)
4031 locinput += UTF8SKIP(locinput);
4032 else
4033 locinput++;
b2f2f093 4034 count = 1;
0fe9bf95
IZ
4035 }
4036 }
4037 else
c277df42 4038 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
4039 UV c;
4040 if (c1 != -1000) {
4041 if (do_utf8)
872c91ae 4042 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4043 UTF8_MAXBYTES, 0,
041457d9 4044 uniflags);
1aa99e6b 4045 else
9041c2e3 4046 c = UCHARAT(PL_reginput);
2390ecbc 4047 /* If it could work, try it. */
eb160463 4048 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 4049 {
ecc99935 4050 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
4051 REGCP_UNWIND(lastcp);
4052 }
1aa99e6b 4053 }
a0d0e21e 4054 /* If it could work, try it. */
2390ecbc 4055 else if (c1 == -1000)
bbce6d69 4056 {
ecc99935 4057 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 4058 REGCP_UNWIND(lastcp);
bbce6d69 4059 }
c277df42 4060 /* Couldn't or didn't -- move forward. */
a0ed51b3 4061 PL_reginput = locinput;
a0d0e21e
LW
4062 if (regrepeat(scan, 1)) {
4063 ln++;
a0ed51b3
LW
4064 locinput = PL_reginput;
4065 }
4066 else
4633a7c4 4067 sayNO;
a0d0e21e
LW
4068 }
4069 }
4070 else {
c277df42 4071 CHECKPOINT lastcp;
a0d0e21e 4072 n = regrepeat(scan, n);
a0ed51b3 4073 locinput = PL_reginput;
22c35a8c 4074 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
7fba1cd6 4075 (OP(next) != MEOL ||
15272685
HS
4076 OP(next) == SEOL || OP(next) == EOS))
4077 {
a0d0e21e 4078 ln = n; /* why back off? */
1aeab75a
GS
4079 /* ...because $ and \Z can match before *and* after
4080 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4081 We should back off by one in this case. */
4082 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4083 ln--;
4084 }
02db2b7b 4085 REGCP_SET(lastcp);
c277df42 4086 if (paren) {
8fa7f367 4087 UV c = 0;
c277df42 4088 while (n >= ln) {
1aa99e6b
IH
4089 if (c1 != -1000) {
4090 if (do_utf8)
872c91ae 4091 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4092 UTF8_MAXBYTES, 0,
041457d9 4093 uniflags);
1aa99e6b 4094 else
9041c2e3 4095 c = UCHARAT(PL_reginput);
1aa99e6b 4096 }
c277df42 4097 /* If it could work, try it. */
eb160463 4098 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4099 {
29d1e993 4100 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4101 REGCP_UNWIND(lastcp);
c277df42
IZ
4102 }
4103 /* Couldn't or didn't -- back up. */
4104 n--;
dfe13c55 4105 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 4106 }
a0ed51b3
LW
4107 }
4108 else {
8fa7f367 4109 UV c = 0;
c277df42 4110 while (n >= ln) {
1aa99e6b
IH
4111 if (c1 != -1000) {
4112 if (do_utf8)
872c91ae 4113 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4114 UTF8_MAXBYTES, 0,
041457d9 4115 uniflags);
1aa99e6b 4116 else
9041c2e3 4117 c = UCHARAT(PL_reginput);
1aa99e6b 4118 }
c277df42 4119 /* If it could work, try it. */
eb160463 4120 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4121 {
29d1e993 4122 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4123 REGCP_UNWIND(lastcp);
c277df42
IZ
4124 }
4125 /* Couldn't or didn't -- back up. */
4126 n--;
dfe13c55 4127 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4128 }
a0d0e21e
LW
4129 }
4130 }
4633a7c4 4131 sayNO;
c277df42 4132 break;
a0d0e21e 4133 case END:
0f5d15d6
IZ
4134 if (PL_reg_call_cc) {
4135 re_cc_state *cur_call_cc = PL_reg_call_cc;
4136 CURCUR *cctmp = PL_regcc;
4137 regexp *re = PL_reg_re;
6136c704
AL
4138 CHECKPOINT lastcp;
4139 I32 tmp;
4140
4141 /* Save *all* the positions. */
4142 const CHECKPOINT cp = regcppush(0);
02db2b7b 4143 REGCP_SET(lastcp);
6136c704
AL
4144
4145 /* Restore parens of the caller. */
4146 tmp = PL_savestack_ix;
4147 PL_savestack_ix = PL_reg_call_cc->ss;
4148 regcppop();
4149 PL_savestack_ix = tmp;
4150
4151 /* Make position available to the callcc. */
4152 PL_reginput = locinput;
4153
0f5d15d6
IZ
4154 cache_re(PL_reg_call_cc->re);
4155 PL_regcc = PL_reg_call_cc->cc;
4156 PL_reg_call_cc = PL_reg_call_cc->prev;
4157 if (regmatch(cur_call_cc->node)) {
4158 PL_reg_call_cc = cur_call_cc;
4159 regcpblow(cp);
4160 sayYES;
4161 }
02db2b7b 4162 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
4163 regcppop();
4164 PL_reg_call_cc = cur_call_cc;
4165 PL_regcc = cctmp;
4166 PL_reg_re = re;
4167 cache_re(re);
4168
a3621e74 4169 DEBUG_EXECUTE_r(
0f5d15d6
IZ
4170 PerlIO_printf(Perl_debug_log,
4171 "%*s continuation failed...\n",
4172 REPORT_CODE_OFF+PL_regindent*2, "")
4173 );
7821416a 4174 sayNO_SILENT;
0f5d15d6 4175 }
7821416a 4176 if (locinput < PL_regtill) {
a3621e74 4177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4178 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4179 PL_colors[4],
4180 (long)(locinput - PL_reg_starttry),
4181 (long)(PL_regtill - PL_reg_starttry),
4182 PL_colors[5]));
4183 sayNO_FINAL; /* Cannot match: too short. */
4184 }
4185 PL_reginput = locinput; /* put where regtry can find it */
4186 sayYES_FINAL; /* Success! */
7e5428c5 4187 case SUCCEED:
3280af22 4188 PL_reginput = locinput; /* put where regtry can find it */
7821416a 4189 sayYES_LOUD; /* Success! */
c277df42
IZ
4190 case SUSPEND:
4191 n = 1;
9fe1d20c 4192 PL_reginput = locinput;
9041c2e3 4193 goto do_ifmatch;
a0d0e21e 4194 case UNLESSM:
c277df42 4195 n = 0;
a0ed51b3 4196 if (scan->flags) {
efb30f32
HS
4197 s = HOPBACKc(locinput, scan->flags);
4198 if (!s)
4199 goto say_yes;
4200 PL_reginput = s;
a0ed51b3
LW
4201 }
4202 else
4203 PL_reginput = locinput;
c277df42
IZ
4204 goto do_ifmatch;
4205 case IFMATCH:
4206 n = 1;
a0ed51b3 4207 if (scan->flags) {
efb30f32
HS
4208 s = HOPBACKc(locinput, scan->flags);
4209 if (!s)
4210 goto say_no;
4211 PL_reginput = s;
a0ed51b3
LW
4212 }
4213 else
4214 PL_reginput = locinput;
4215
c277df42 4216 do_ifmatch:
c277df42
IZ
4217 inner = NEXTOPER(NEXTOPER(scan));
4218 if (regmatch(inner) != n) {
4219 say_no:
4220 if (logical) {
4221 logical = 0;
4222 sw = 0;
4223 goto do_longjump;
a0ed51b3
LW
4224 }
4225 else
c277df42
IZ
4226 sayNO;
4227 }
4228 say_yes:
4229 if (logical) {
4230 logical = 0;
4231 sw = 1;
4232 }
fe44a5e8 4233 if (OP(scan) == SUSPEND) {
3280af22 4234 locinput = PL_reginput;
565764a8 4235 nextchr = UCHARAT(locinput);
fe44a5e8 4236 }
c277df42
IZ
4237 /* FALL THROUGH. */
4238 case LONGJMP:
4239 do_longjump:
4240 next = scan + ARG(scan);
4241 if (next == scan)
4242 next = NULL;
a0d0e21e
LW
4243 break;
4244 default:
b900a521 4245 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4246 PTR2UV(scan), OP(scan));
cea2e8a9 4247 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4248 }
02db2b7b 4249 reenter:
a0d0e21e
LW
4250 scan = next;
4251 }
a687059c 4252
a0d0e21e
LW
4253 /*
4254 * We get here only if there's trouble -- normally "case END" is
4255 * the terminating point.
4256 */
cea2e8a9 4257 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4258 /*NOTREACHED*/
4633a7c4
LW
4259 sayNO;
4260
7821416a 4261yes_loud:
a3621e74 4262 DEBUG_EXECUTE_r(
7821416a
IZ
4263 PerlIO_printf(Perl_debug_log,
4264 "%*s %scould match...%s\n",
e4584336 4265 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4266 );
4267 goto yes;
4268yes_final:
a3621e74 4269 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4270 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4271yes:
4272#ifdef DEBUGGING
3280af22 4273 PL_regindent--;
4633a7c4 4274#endif
02db2b7b
IZ
4275
4276#if 0 /* Breaks $^R */
4277 if (unwind)
4278 regcpblow(firstcp);
4279#endif
4633a7c4
LW
4280 return 1;
4281
4282no:
a3621e74 4283 DEBUG_EXECUTE_r(
7821416a
IZ
4284 PerlIO_printf(Perl_debug_log,
4285 "%*s %sfailed...%s\n",
e4584336 4286 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4287 );
4288 goto do_no;
4289no_final:
4290do_no:
02db2b7b 4291 if (unwind) {
6136c704 4292 re_unwind_t * const uw = SSPTRt(unwind,re_unwind_t);
02db2b7b
IZ
4293
4294 switch (uw->type) {
4295 case RE_UNWIND_BRANCH:
4296 case RE_UNWIND_BRANCHJ:
4297 {
6136c704 4298 re_unwind_branch_t * const uwb = &(uw->branch);
a3b680e6 4299 const I32 lastparen = uwb->lastparen;
9041c2e3 4300
02db2b7b
IZ
4301 REGCP_UNWIND(uwb->lastcp);
4302 for (n = *PL_reglastparen; n > lastparen; n--)
4303 PL_regendp[n] = -1;
4304 *PL_reglastparen = n;
4305 scan = next = uwb->next;
9041c2e3
NIS
4306 if ( !scan ||
4307 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
4308 ? BRANCH : BRANCHJ) ) { /* Failure */
4309 unwind = uwb->prev;
4310#ifdef DEBUGGING
4311 PL_regindent--;
4312#endif
4313 goto do_no;
4314 }
4315 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4316 if ((n = (uwb->type == RE_UNWIND_BRANCH
4317 ? NEXT_OFF(next) : ARG(next))))
4318 next += n;
4319 else
4320 next = NULL; /* XXXX Needn't unwinding in this case... */
4321 uwb->next = next;
4322 next = NEXTOPER(scan);
4323 if (uwb->type == RE_UNWIND_BRANCHJ)
4324 next = NEXTOPER(next);
4325 locinput = uwb->locinput;
4326 nextchr = uwb->nextchr;
4327#ifdef DEBUGGING
4328 PL_regindent = uwb->regindent;
4329#endif
4330
4331 goto reenter;
4332 }
4333 /* NOT REACHED */
4334 default:
4335 Perl_croak(aTHX_ "regexp unwind memory corruption");
4336 }
4337 /* NOT REACHED */
4338 }
4633a7c4 4339#ifdef DEBUGGING
3280af22 4340 PL_regindent--;
4633a7c4 4341#endif
a0d0e21e 4342 return 0;
a687059c
LW
4343}
4344
4345/*
4346 - regrepeat - repeatedly match something simple, report how many
4347 */
4348/*
4349 * [This routine now assumes that it will only match on things of length 1.
4350 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4351 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4352 */
76e3520e 4353STATIC I32
a3b680e6 4354S_regrepeat(pTHX_ const regnode *p, I32 max)
a687059c 4355{
27da23d5 4356 dVAR;
a0d0e21e 4357 register char *scan;
a0d0e21e 4358 register I32 c;
3280af22 4359 register char *loceol = PL_regeol;
a0ed51b3 4360 register I32 hardcount = 0;
53c4c00c 4361 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4362
3280af22 4363 scan = PL_reginput;
faf11cac
HS
4364 if (max == REG_INFTY)
4365 max = I32_MAX;
4366 else if (max < loceol - scan)
a0d0e21e 4367 loceol = scan + max;
a0d0e21e 4368 switch (OP(p)) {
22c35a8c 4369 case REG_ANY:
1aa99e6b 4370 if (do_utf8) {
ffc61ed2 4371 loceol = PL_regeol;
1aa99e6b 4372 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4373 scan += UTF8SKIP(scan);
4374 hardcount++;
4375 }
4376 } else {
4377 while (scan < loceol && *scan != '\n')
4378 scan++;
a0ed51b3
LW
4379 }
4380 break;
ffc61ed2 4381 case SANY:
def8e4ea
JH
4382 if (do_utf8) {
4383 loceol = PL_regeol;
a0804c9e 4384 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4385 scan += UTF8SKIP(scan);
4386 hardcount++;
4387 }
4388 }
4389 else
4390 scan = loceol;
a0ed51b3 4391 break;
f33976b4
DB
4392 case CANY:
4393 scan = loceol;
4394 break;
090f7165
JH
4395 case EXACT: /* length of string is 1 */
4396 c = (U8)*STRING(p);
4397 while (scan < loceol && UCHARAT(scan) == c)
4398 scan++;
bbce6d69 4399 break;
4400 case EXACTF: /* length of string is 1 */
cd439c50 4401 c = (U8)*STRING(p);
bbce6d69 4402 while (scan < loceol &&
22c35a8c 4403 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4404 scan++;
4405 break;
4406 case EXACTFL: /* length of string is 1 */
3280af22 4407 PL_reg_flags |= RF_tainted;
cd439c50 4408 c = (U8)*STRING(p);
bbce6d69 4409 while (scan < loceol &&
22c35a8c 4410 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4411 scan++;
4412 break;
4413 case ANYOF:
ffc61ed2
JH
4414 if (do_utf8) {
4415 loceol = PL_regeol;
cfc92286
JH
4416 while (hardcount < max && scan < loceol &&
4417 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4418 scan += UTF8SKIP(scan);
4419 hardcount++;
4420 }
4421 } else {
7d3e948e 4422 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4423 scan++;
4424 }
a0d0e21e
LW
4425 break;
4426 case ALNUM:
1aa99e6b 4427 if (do_utf8) {
ffc61ed2 4428 loceol = PL_regeol;
1a4fad37 4429 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4430 while (hardcount < max && scan < loceol &&
3568d838 4431 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4432 scan += UTF8SKIP(scan);
4433 hardcount++;
4434 }
4435 } else {
4436 while (scan < loceol && isALNUM(*scan))
4437 scan++;
a0ed51b3
LW
4438 }
4439 break;
bbce6d69 4440 case ALNUML:
3280af22 4441 PL_reg_flags |= RF_tainted;
1aa99e6b 4442 if (do_utf8) {
ffc61ed2 4443 loceol = PL_regeol;
1aa99e6b
IH
4444 while (hardcount < max && scan < loceol &&
4445 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4446 scan += UTF8SKIP(scan);
4447 hardcount++;
4448 }
4449 } else {
4450 while (scan < loceol && isALNUM_LC(*scan))
4451 scan++;
a0ed51b3
LW
4452 }
4453 break;
a0d0e21e 4454 case NALNUM:
1aa99e6b 4455 if (do_utf8) {
ffc61ed2 4456 loceol = PL_regeol;
1a4fad37 4457 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4458 while (hardcount < max && scan < loceol &&
3568d838 4459 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4460 scan += UTF8SKIP(scan);
4461 hardcount++;
4462 }
4463 } else {
4464 while (scan < loceol && !isALNUM(*scan))
4465 scan++;
a0ed51b3
LW
4466 }
4467 break;
bbce6d69 4468 case NALNUML:
3280af22 4469 PL_reg_flags |= RF_tainted;
1aa99e6b 4470 if (do_utf8) {
ffc61ed2 4471 loceol = PL_regeol;
1aa99e6b
IH
4472 while (hardcount < max && scan < loceol &&
4473 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4474 scan += UTF8SKIP(scan);
4475 hardcount++;
4476 }
4477 } else {
4478 while (scan < loceol && !isALNUM_LC(*scan))
4479 scan++;
a0ed51b3
LW
4480 }
4481 break;
a0d0e21e 4482 case SPACE:
1aa99e6b 4483 if (do_utf8) {
ffc61ed2 4484 loceol = PL_regeol;
1a4fad37 4485 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4486 while (hardcount < max && scan < loceol &&
3568d838
JH
4487 (*scan == ' ' ||
4488 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4489 scan += UTF8SKIP(scan);
4490 hardcount++;
4491 }
4492 } else {
4493 while (scan < loceol && isSPACE(*scan))
4494 scan++;
a0ed51b3
LW
4495 }
4496 break;
bbce6d69 4497 case SPACEL:
3280af22 4498 PL_reg_flags |= RF_tainted;
1aa99e6b 4499 if (do_utf8) {
ffc61ed2 4500 loceol = PL_regeol;
1aa99e6b 4501 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4502 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4503 scan += UTF8SKIP(scan);
4504 hardcount++;
4505 }
4506 } else {
4507 while (scan < loceol && isSPACE_LC(*scan))
4508 scan++;
a0ed51b3
LW
4509 }
4510 break;
a0d0e21e 4511 case NSPACE:
1aa99e6b 4512 if (do_utf8) {
ffc61ed2 4513 loceol = PL_regeol;
1a4fad37 4514 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4515 while (hardcount < max && scan < loceol &&
3568d838
JH
4516 !(*scan == ' ' ||
4517 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4518 scan += UTF8SKIP(scan);
4519 hardcount++;
4520 }
4521 } else {
4522 while (scan < loceol && !isSPACE(*scan))
4523 scan++;
4524 break;
a0ed51b3 4525 }
bbce6d69 4526 case NSPACEL:
3280af22 4527 PL_reg_flags |= RF_tainted;
1aa99e6b 4528 if (do_utf8) {
ffc61ed2 4529 loceol = PL_regeol;
1aa99e6b 4530 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4531 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4532 scan += UTF8SKIP(scan);
4533 hardcount++;
4534 }
4535 } else {
4536 while (scan < loceol && !isSPACE_LC(*scan))
4537 scan++;
a0ed51b3
LW
4538 }
4539 break;
a0d0e21e 4540 case DIGIT:
1aa99e6b 4541 if (do_utf8) {
ffc61ed2 4542 loceol = PL_regeol;
1a4fad37 4543 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4544 while (hardcount < max && scan < loceol &&
3568d838 4545 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4546 scan += UTF8SKIP(scan);
4547 hardcount++;
4548 }
4549 } else {
4550 while (scan < loceol && isDIGIT(*scan))
4551 scan++;
a0ed51b3
LW
4552 }
4553 break;
a0d0e21e 4554 case NDIGIT:
1aa99e6b 4555 if (do_utf8) {
ffc61ed2 4556 loceol = PL_regeol;
1a4fad37 4557 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4558 while (hardcount < max && scan < loceol &&
3568d838 4559 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4560 scan += UTF8SKIP(scan);
4561 hardcount++;
4562 }
4563 } else {
4564 while (scan < loceol && !isDIGIT(*scan))
4565 scan++;
a0ed51b3
LW
4566 }
4567 break;
a0d0e21e
LW
4568 default: /* Called on something of 0 width. */
4569 break; /* So match right here or not at all. */
4570 }
a687059c 4571
a0ed51b3
LW
4572 if (hardcount)
4573 c = hardcount;
4574 else
4575 c = scan - PL_reginput;
3280af22 4576 PL_reginput = scan;
a687059c 4577
a3621e74 4578 DEBUG_r({
ab74612d 4579 SV *re_debug_flags = NULL;
6136c704 4580 SV * const prop = sv_newmortal();
a3621e74
YO
4581 GET_RE_DEBUG_FLAGS;
4582 DEBUG_EXECUTE_r({
c277df42 4583 regprop(prop, p);
9041c2e3
NIS
4584 PerlIO_printf(Perl_debug_log,
4585 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3f7c398e 4586 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
c277df42 4587 });
a3621e74 4588 });
9041c2e3 4589
a0d0e21e 4590 return(c);
a687059c
LW
4591}
4592
4593/*
c277df42 4594 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4595 *
0e788c72 4596 * The repeater is supposed to have constant non-zero length.
c277df42
IZ
4597 */
4598
76e3520e 4599STATIC I32
cea2e8a9 4600S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4601{
97aff369 4602 dVAR;
b7953727 4603 register char *scan = Nullch;
c277df42 4604 register char *start;
3280af22 4605 register char *loceol = PL_regeol;
a0ed51b3 4606 I32 l = 0;
708e3b05 4607 I32 count = 0, res = 1;
a0ed51b3
LW
4608
4609 if (!max)
4610 return 0;
c277df42 4611
3280af22 4612 start = PL_reginput;
53c4c00c 4613 if (PL_reg_match_utf8) {
708e3b05 4614 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4615 if (!count++) {
4616 l = 0;
4617 while (start < PL_reginput) {
4618 l++;
4619 start += UTF8SKIP(start);
4620 }
4621 *lp = l;
4622 if (l == 0)
4623 return max;
4624 }
4625 if (count == max)
4626 return count;
4627 }
4628 }
4629 else {
708e3b05 4630 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4631 if (!count++) {
4632 *lp = l = PL_reginput - start;
4633 if (max != REG_INFTY && l*max < loceol - scan)
4634 loceol = scan + l*max;
4635 if (l == 0)
4636 return max;
c277df42
IZ
4637 }
4638 }
4639 }
708e3b05 4640 if (!res)
3280af22 4641 PL_reginput = scan;
9041c2e3 4642
a0ed51b3 4643 return count;
c277df42
IZ
4644}
4645
4646/*
ffc61ed2
JH
4647- regclass_swash - prepare the utf8 swash
4648*/
4649
4650SV *
a3b680e6 4651Perl_regclass_swash(pTHX_ register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4652{
97aff369 4653 dVAR;
9e55ce06
JH
4654 SV *sw = NULL;
4655 SV *si = NULL;
4656 SV *alt = NULL;
ffc61ed2
JH
4657
4658 if (PL_regdata && PL_regdata->count) {
a3b680e6 4659 const U32 n = ARG(node);
ffc61ed2
JH
4660
4661 if (PL_regdata->what[n] == 's') {
890ce7af
AL
4662 SV * const rv = (SV*)PL_regdata->data[n];
4663 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4664 SV **const ary = AvARRAY(av);
9e55ce06 4665 SV **a, **b;
9041c2e3 4666
711a919c 4667 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4668 * documentation of these array elements. */
4669
b11f357e 4670 si = *ary;
8f7f7219 4671 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4672 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4673
ffc61ed2
JH
4674 if (a)
4675 sw = *a;
4676 else if (si && doinit) {
4677 sw = swash_init("utf8", "", si, 1, 0);
4678 (void)av_store(av, 1, sw);
4679 }
9e55ce06
JH
4680 if (b)
4681 alt = *b;
ffc61ed2
JH
4682 }
4683 }
4684
9e55ce06
JH
4685 if (listsvp)
4686 *listsvp = si;
4687 if (altsvp)
4688 *altsvp = alt;
ffc61ed2
JH
4689
4690 return sw;
4691}
4692
4693/*
ba7b4546 4694 - reginclass - determine if a character falls into a character class
832705d4
JH
4695
4696 The n is the ANYOF regnode, the p is the target string, lenp
4697 is pointer to the maximum length of how far to go in the p
4698 (if the lenp is zero, UTF8SKIP(p) is used),
4699 do_utf8 tells whether the target string is in UTF-8.
4700
bbce6d69 4701 */
4702
76e3520e 4703STATIC bool
a3b680e6 4704S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4705{
27da23d5 4706 dVAR;
a3b680e6 4707 const char flags = ANYOF_FLAGS(n);
bbce6d69 4708 bool match = FALSE;
cc07378b 4709 UV c = *p;
ae9ddab8 4710 STRLEN len = 0;
9e55ce06 4711 STRLEN plen;
1aa99e6b 4712
19f67299
TS
4713 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4714 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4715 ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY :
4716 UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY);
4717 if (len == (STRLEN)-1)
4718 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4719 }
bbce6d69 4720
0f0076b4 4721 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4722 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4723 if (lenp)
4724 *lenp = 0;
ffc61ed2 4725 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4726 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4727 match = TRUE;
bbce6d69 4728 }
3568d838 4729 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4730 match = TRUE;
ffc61ed2 4731 if (!match) {
9e55ce06 4732 AV *av;
890ce7af 4733 SV * const sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4734
4735 if (sw) {
3568d838 4736 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4737 match = TRUE;
4738 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4739 if (!match && lenp && av) {
4740 I32 i;
9e55ce06 4741 for (i = 0; i <= av_len(av); i++) {
890ce7af 4742 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4743 STRLEN len;
890ce7af 4744 const char * const s = SvPV_const(sv, len);
9e55ce06 4745
061b10df 4746 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4747 *lenp = len;
4748 match = TRUE;
4749 break;
4750 }
4751 }
4752 }
4753 if (!match) {
89ebb4a3 4754 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4755 STRLEN tmplen;
4756
9e55ce06
JH
4757 to_utf8_fold(p, tmpbuf, &tmplen);
4758 if (swash_fetch(sw, tmpbuf, do_utf8))
4759 match = TRUE;
4760 }
ffc61ed2
JH
4761 }
4762 }
bbce6d69 4763 }
9e55ce06 4764 if (match && lenp && *lenp == 0)
0f0076b4 4765 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4766 }
1aa99e6b 4767 if (!match && c < 256) {
ffc61ed2
JH
4768 if (ANYOF_BITMAP_TEST(n, c))
4769 match = TRUE;
4770 else if (flags & ANYOF_FOLD) {
eb160463 4771 U8 f;
a0ed51b3 4772
ffc61ed2
JH
4773 if (flags & ANYOF_LOCALE) {
4774 PL_reg_flags |= RF_tainted;
4775 f = PL_fold_locale[c];
4776 }
4777 else
4778 f = PL_fold[c];
4779 if (f != c && ANYOF_BITMAP_TEST(n, f))
4780 match = TRUE;
4781 }
4782
4783 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4784 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4785 if (
4786 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4787 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4788 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4789 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4790 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4791 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4792 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4793 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4794 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4795 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4796 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4797 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4798 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4799 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4800 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4801 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4802 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4803 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4804 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4805 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4806 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4807 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4808 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4809 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4810 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4811 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4812 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4813 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4814 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4815 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4816 ) /* How's that for a conditional? */
4817 {
4818 match = TRUE;
4819 }
a0ed51b3 4820 }
a0ed51b3
LW
4821 }
4822
a0ed51b3
LW
4823 return (flags & ANYOF_INVERT) ? !match : match;
4824}
161b471a 4825
dfe13c55 4826STATIC U8 *
cea2e8a9 4827S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4828{
97aff369 4829 dVAR;
1aa99e6b
IH
4830 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4831}
4832
4833STATIC U8 *
4834S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4835{
97aff369 4836 dVAR;
a0ed51b3 4837 if (off >= 0) {
1aa99e6b 4838 while (off-- && s < lim) {
ffc61ed2 4839 /* XXX could check well-formedness here */
a0ed51b3 4840 s += UTF8SKIP(s);
ffc61ed2 4841 }
a0ed51b3
LW
4842 }
4843 else {
4844 while (off++) {
1aa99e6b 4845 if (s > lim) {
a0ed51b3 4846 s--;
ffc61ed2 4847 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4848 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4849 s--;
ffc61ed2
JH
4850 }
4851 /* XXX could check well-formedness here */
a0ed51b3
LW
4852 }
4853 }
4854 }
4855 return s;
4856}
161b471a 4857
dfe13c55 4858STATIC U8 *
1aa99e6b 4859S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4860{
97aff369 4861 dVAR;
1aa99e6b
IH
4862 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4863}
4864
4865STATIC U8 *
4866S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3 4867{
97aff369 4868 dVAR;
a0ed51b3 4869 if (off >= 0) {
1aa99e6b 4870 while (off-- && s < lim) {
ffc61ed2 4871 /* XXX could check well-formedness here */
a0ed51b3 4872 s += UTF8SKIP(s);
ffc61ed2 4873 }
a0ed51b3
LW
4874 if (off >= 0)
4875 return 0;
4876 }
4877 else {
4878 while (off++) {
1aa99e6b 4879 if (s > lim) {
a0ed51b3 4880 s--;
ffc61ed2 4881 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4882 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4883 s--;
ffc61ed2
JH
4884 }
4885 /* XXX could check well-formedness here */
a0ed51b3
LW
4886 }
4887 else
4888 break;
4889 }
4890 if (off <= 0)
4891 return 0;
4892 }
4893 return s;
4894}
51371543 4895
51371543 4896static void
acfe0abc 4897restore_pos(pTHX_ void *arg)
51371543 4898{
97aff369 4899 dVAR;
9d4ba2ae 4900 PERL_UNUSED_ARG(arg);
51371543
GS
4901 if (PL_reg_eval_set) {
4902 if (PL_reg_oldsaved) {
4903 PL_reg_re->subbeg = PL_reg_oldsaved;
4904 PL_reg_re->sublen = PL_reg_oldsavedlen;
f8c7b90f 4905#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
4906 PL_reg_re->saved_copy = PL_nrs;
4907#endif
51371543
GS
4908 RX_MATCH_COPIED_on(PL_reg_re);
4909 }
4910 PL_reg_magic->mg_len = PL_reg_oldpos;
4911 PL_reg_eval_set = 0;
4912 PL_curpm = PL_reg_oldcurpm;
4913 }
4914}
33b8afdf
JH
4915
4916STATIC void
4917S_to_utf8_substr(pTHX_ register regexp *prog)
4918{
33b8afdf 4919 if (prog->float_substr && !prog->float_utf8) {
9d4ba2ae 4920 SV* sv;
f2b990bf 4921 prog->float_utf8 = sv = newSVsv(prog->float_substr);
33b8afdf
JH
4922 sv_utf8_upgrade(sv);
4923 if (SvTAIL(prog->float_substr))
4924 SvTAIL_on(sv);
4925 if (prog->float_substr == prog->check_substr)
4926 prog->check_utf8 = sv;
4927 }
4928 if (prog->anchored_substr && !prog->anchored_utf8) {
9d4ba2ae 4929 SV* sv;
f2b990bf 4930 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
33b8afdf
JH
4931 sv_utf8_upgrade(sv);
4932 if (SvTAIL(prog->anchored_substr))
4933 SvTAIL_on(sv);
4934 if (prog->anchored_substr == prog->check_substr)
4935 prog->check_utf8 = sv;
4936 }
4937}
4938
4939STATIC void
4940S_to_byte_substr(pTHX_ register regexp *prog)
4941{
97aff369 4942 dVAR;
33b8afdf 4943 if (prog->float_utf8 && !prog->float_substr) {
9d4ba2ae 4944 SV* sv;
f2b990bf 4945 prog->float_substr = sv = newSVsv(prog->float_utf8);
33b8afdf
JH
4946 if (sv_utf8_downgrade(sv, TRUE)) {
4947 if (SvTAIL(prog->float_utf8))
4948 SvTAIL_on(sv);
4949 } else {
4950 SvREFCNT_dec(sv);
4951 prog->float_substr = sv = &PL_sv_undef;
4952 }
4953 if (prog->float_utf8 == prog->check_utf8)
4954 prog->check_substr = sv;
4955 }
4956 if (prog->anchored_utf8 && !prog->anchored_substr) {
9d4ba2ae 4957 SV* sv;
f2b990bf 4958 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
33b8afdf
JH
4959 if (sv_utf8_downgrade(sv, TRUE)) {
4960 if (SvTAIL(prog->anchored_utf8))
4961 SvTAIL_on(sv);
4962 } else {
4963 SvREFCNT_dec(sv);
4964 prog->anchored_substr = sv = &PL_sv_undef;
4965 }
4966 if (prog->anchored_utf8 == prog->check_utf8)
4967 prog->check_substr = sv;
4968 }
4969}
66610fdd
RGS
4970
4971/*
4972 * Local variables:
4973 * c-indentation-style: bsd
4974 * c-basic-offset: 4
4975 * indent-tabs-mode: t
4976 * End:
4977 *
37442d52
RGS
4978 * ex: set ts=8 sts=4 sw=4 noet:
4979 */