This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
restore EU::MM test fixes from 26079, 26813, and 26919
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
1a4fad37
AL
142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
97aff369 181 dVAR;
a3b680e6 182 const int retval = PL_savestack_ix;
b1ce53c5 183#define REGCP_PAREN_ELEMS 4
a3b680e6 184 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
185 int p;
186
e49a9654
IH
187 if (paren_elems_to_push < 0)
188 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
189
a01268b5 190#define REGCP_OTHER_ELEMS 6
4b3c1a47 191 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 192 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 193/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
194 SSPUSHINT(PL_regendp[p]);
195 SSPUSHINT(PL_regstartp[p]);
3280af22 196 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
197 SSPUSHINT(p);
198 }
b1ce53c5 199/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
200 SSPUSHINT(PL_regsize);
201 SSPUSHINT(*PL_reglastparen);
a01268b5 202 SSPUSHINT(*PL_reglastcloseparen);
3280af22 203 SSPUSHPTR(PL_reginput);
41123dfd
JH
204#define REGCP_FRAME_ELEMS 2
205/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
206 * are needed for the regexp context stack bookkeeping. */
207 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 208 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 209
a0d0e21e
LW
210 return retval;
211}
212
c277df42 213/* These are needed since we do not localize EVAL nodes: */
a3621e74 214# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 215 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 216 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 217
a3621e74 218# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 219 PerlIO_printf(Perl_debug_log, \
faccc32b 220 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 221 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 222
76e3520e 223STATIC char *
cea2e8a9 224S_regcppop(pTHX)
a0d0e21e 225{
97aff369 226 dVAR;
b1ce53c5 227 I32 i;
a0d0e21e
LW
228 U32 paren = 0;
229 char *input;
b1ce53c5 230
a3621e74
YO
231 GET_RE_DEBUG_FLAGS_DECL;
232
b1ce53c5 233 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 234 i = SSPOPINT;
b1ce53c5
JH
235 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
236 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 237 input = (char *) SSPOPPTR;
a01268b5 238 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
239 *PL_reglastparen = SSPOPINT;
240 PL_regsize = SSPOPINT;
b1ce53c5
JH
241
242 /* Now restore the parentheses context. */
41123dfd
JH
243 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
244 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 245 I32 tmps;
a0d0e21e 246 paren = (U32)SSPOPINT;
3280af22 247 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
248 PL_regstartp[paren] = SSPOPINT;
249 tmps = SSPOPINT;
3280af22
NIS
250 if (paren <= *PL_reglastparen)
251 PL_regendp[paren] = tmps;
a3621e74 252 DEBUG_EXECUTE_r(
c3464db5 253 PerlIO_printf(Perl_debug_log,
b900a521 254 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 255 (UV)paren, (IV)PL_regstartp[paren],
b900a521 256 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 257 (IV)PL_regendp[paren],
3280af22 258 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 259 );
a0d0e21e 260 }
a3621e74 261 DEBUG_EXECUTE_r(
eb160463 262 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 263 PerlIO_printf(Perl_debug_log,
faccc32b
JH
264 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
265 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
266 }
267 );
daf18116 268#if 1
dafc8851
JH
269 /* It would seem that the similar code in regtry()
270 * already takes care of this, and in fact it is in
271 * a better location to since this code can #if 0-ed out
272 * but the code in regtry() is needed or otherwise tests
273 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
274 * (as of patchlevel 7877) will fail. Then again,
275 * this code seems to be necessary or otherwise
276 * building DynaLoader will fail:
277 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
278 * --jhi */
eb160463
GS
279 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
280 if ((I32)paren > PL_regsize)
cf93c79d
IZ
281 PL_regstartp[paren] = -1;
282 PL_regendp[paren] = -1;
a0d0e21e 283 }
dafc8851 284#endif
a0d0e21e
LW
285 return input;
286}
287
0f5d15d6
IZ
288typedef struct re_cc_state
289{
290 I32 ss;
291 regnode *node;
292 struct re_cc_state *prev;
293 CURCUR *cc;
294 regexp *re;
295} re_cc_state;
296
02db2b7b 297#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 298
29d1e993
HS
299#define TRYPAREN(paren, n, input) { \
300 if (paren) { \
301 if (n) { \
302 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
303 PL_regendp[paren] = input - PL_bostr; \
304 } \
305 else \
306 PL_regendp[paren] = -1; \
307 } \
308 if (regmatch(next)) \
309 sayYES; \
310 if (paren && n) \
311 PL_regendp[paren] = -1; \
312}
313
314
a687059c 315/*
e50aee73 316 * pregexec and friends
a687059c
LW
317 */
318
319/*
c277df42 320 - pregexec - match a regexp against a string
a687059c 321 */
c277df42 322I32
864dbfa3 323Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 324 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
325/* strend: pointer to null at end of string */
326/* strbeg: real beginning of string */
327/* minend: end of match must be >=minend after stringarg. */
328/* nosave: For optimizations. */
329{
330 return
9041c2e3 331 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
332 nosave ? 0 : REXEC_COPY_STR);
333}
0f5d15d6
IZ
334
335STATIC void
cea2e8a9 336S_cache_re(pTHX_ regexp *prog)
0f5d15d6 337{
97aff369 338 dVAR;
0f5d15d6
IZ
339 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
340#ifdef DEBUGGING
341 PL_regprogram = prog->program;
342#endif
343 PL_regnpar = prog->nparens;
9041c2e3
NIS
344 PL_regdata = prog->data;
345 PL_reg_re = prog;
0f5d15d6 346}
22e551b9 347
9041c2e3 348/*
cad2e5aa
JH
349 * Need to implement the following flags for reg_anch:
350 *
351 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
352 * USE_INTUIT_ML
353 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
354 * INTUIT_AUTORITATIVE_ML
355 * INTUIT_ONCE_NOML - Intuit can match in one location only.
356 * INTUIT_ONCE_ML
357 *
358 * Another flag for this function: SECOND_TIME (so that float substrs
359 * with giant delta may be not rechecked).
360 */
361
362/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
363
3f7c398e 364/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
365 Otherwise, only SvCUR(sv) is used to get strbeg. */
366
367/* XXXX We assume that strpos is strbeg unless sv. */
368
6eb5f6b9
JH
369/* XXXX Some places assume that there is a fixed substring.
370 An update may be needed if optimizer marks as "INTUITable"
371 RExen without fixed substrings. Similarly, it is assumed that
372 lengths of all the strings are no more than minlen, thus they
373 cannot come from lookahead.
374 (Or minlen should take into account lookahead.) */
375
2c2d71f5
JH
376/* A failure to find a constant substring means that there is no need to make
377 an expensive call to REx engine, thus we celebrate a failure. Similarly,
378 finding a substring too deep into the string means that less calls to
30944b6d
IZ
379 regtry() should be needed.
380
381 REx compiler's optimizer found 4 possible hints:
382 a) Anchored substring;
383 b) Fixed substring;
384 c) Whether we are anchored (beginning-of-line or \G);
385 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 386 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
387 string which does not contradict any of them.
388 */
2c2d71f5 389
6eb5f6b9
JH
390/* Most of decisions we do here should have been done at compile time.
391 The nodes of the REx which we used for the search should have been
392 deleted from the finite automaton. */
393
cad2e5aa
JH
394char *
395Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
396 char *strend, U32 flags, re_scream_pos_data *data)
397{
97aff369 398 dVAR;
b7953727 399 register I32 start_shift = 0;
cad2e5aa 400 /* Should be nonnegative! */
b7953727 401 register I32 end_shift = 0;
2c2d71f5
JH
402 register char *s;
403 register SV *check;
a1933d95 404 char *strbeg;
cad2e5aa 405 char *t;
a3b680e6 406 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 407 I32 ml_anch;
bd61b366
SS
408 register char *other_last = NULL; /* other substr checked before this */
409 char *check_at = NULL; /* check substr found at this pos */
1df70142 410 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 411#ifdef DEBUGGING
890ce7af
AL
412 const char * const i_strpos = strpos;
413 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 414#endif
a3621e74
YO
415
416 GET_RE_DEBUG_FLAGS_DECL;
417
a30b2f1f 418 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 419
b8d68ded 420 if (prog->reganch & ROPT_UTF8) {
a3621e74 421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
422 "UTF-8 regex...\n"));
423 PL_reg_flags |= RF_utf8;
424 }
425
a3621e74 426 DEBUG_EXECUTE_r({
1df70142 427 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
428 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
429 strpos;
1df70142 430 const int len = PL_reg_match_utf8 ?
b8d68ded 431 strlen(s) : strend - strpos;
2a782b5b
JH
432 if (!PL_colorset)
433 reginitcolors();
b8d68ded 434 if (PL_reg_match_utf8)
a3621e74 435 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 436 "UTF-8 target...\n"));
2a782b5b 437 PerlIO_printf(Perl_debug_log,
a0288114 438 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 439 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
440 prog->precomp,
441 PL_colors[1],
442 (strlen(prog->precomp) > 60 ? "..." : ""),
443 PL_colors[0],
444 (int)(len > 60 ? 60 : len),
445 s, PL_colors[1],
446 (len > 60 ? "..." : "")
447 );
448 });
cad2e5aa 449
c344f387
JH
450 /* CHR_DIST() would be more correct here but it makes things slow. */
451 if (prog->minlen > strend - strpos) {
a3621e74 452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 453 "String too short... [re_intuit_start]\n"));
cad2e5aa 454 goto fail;
2c2d71f5 455 }
a1933d95 456 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 457 PL_regeol = strend;
33b8afdf
JH
458 if (do_utf8) {
459 if (!prog->check_utf8 && prog->check_substr)
460 to_utf8_substr(prog);
461 check = prog->check_utf8;
462 } else {
463 if (!prog->check_substr && prog->check_utf8)
464 to_byte_substr(prog);
465 check = prog->check_substr;
466 }
467 if (check == &PL_sv_undef) {
a3621e74 468 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
469 "Non-utf string cannot match utf check string\n"));
470 goto fail;
471 }
2c2d71f5 472 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
473 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
474 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 475 && !multiline ) ); /* Check after \n? */
cad2e5aa 476
7e25d62c
JH
477 if (!ml_anch) {
478 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
479 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 480 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
481 && sv && !SvROK(sv)
482 && (strpos != strbeg)) {
a3621e74 483 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
484 goto fail;
485 }
486 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 487 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 488 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
489 I32 slen;
490
1aa99e6b 491 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
492 if (SvTAIL(check)) {
493 slen = SvCUR(check); /* >= 1 */
cad2e5aa 494
9041c2e3 495 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 496 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 497 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 498 goto fail_finish;
cad2e5aa
JH
499 }
500 /* Now should match s[0..slen-2] */
501 slen--;
3f7c398e 502 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 503 || (slen > 1
3f7c398e 504 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 505 report_neq:
a3621e74 506 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
507 goto fail_finish;
508 }
cad2e5aa 509 }
3f7c398e 510 else if (*SvPVX_const(check) != *s
653099ff 511 || ((slen = SvCUR(check)) > 1
3f7c398e 512 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 513 goto report_neq;
c315bfe8 514 check_at = s;
2c2d71f5 515 goto success_at_start;
7e25d62c 516 }
cad2e5aa 517 }
2c2d71f5 518 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 519 s = strpos;
2c2d71f5 520 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 521 end_shift = prog->minlen - start_shift -
653099ff 522 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 523 if (!ml_anch) {
a3b680e6 524 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 525 - (SvTAIL(check) != 0);
a3b680e6 526 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
527
528 if (end_shift < eshift)
529 end_shift = eshift;
530 }
cad2e5aa 531 }
2c2d71f5 532 else { /* Can match at random position */
cad2e5aa
JH
533 ml_anch = 0;
534 s = strpos;
2c2d71f5
JH
535 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
536 /* Should be nonnegative! */
537 end_shift = prog->minlen - start_shift -
653099ff 538 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
539 }
540
2c2d71f5 541#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 542 if (end_shift < 0)
6bbae5e6 543 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
544#endif
545
2c2d71f5
JH
546 restart:
547 /* Find a possible match in the region s..strend by looking for
548 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 549 if (flags & REXEC_SCREAM) {
cad2e5aa 550 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 551 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 552
2c2d71f5
JH
553 if (PL_screamfirst[BmRARE(check)] >= 0
554 || ( BmRARE(check) == '\n'
555 && (BmPREVIOUS(check) == SvCUR(check) - 1)
556 && SvTAIL(check) ))
9041c2e3 557 s = screaminstr(sv, check,
2c2d71f5 558 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 559 else
2c2d71f5 560 goto fail_finish;
4addbd3b
HS
561 /* we may be pointing at the wrong string */
562 if (s && RX_MATCH_COPIED(prog))
3f7c398e 563 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
564 if (data)
565 *data->scream_olds = s;
566 }
f33976b4 567 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
568 s = fbm_instr((U8*)(s + start_shift),
569 (U8*)(strend - end_shift),
7fba1cd6 570 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 571 else
1aa99e6b
IH
572 s = fbm_instr(HOP3(s, start_shift, strend),
573 HOP3(strend, -end_shift, strbeg),
7fba1cd6 574 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
575
576 /* Update the count-of-usability, remove useless subpatterns,
577 unshift s. */
2c2d71f5 578
a0288114 579 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 580 (s ? "Found" : "Did not find"),
33b8afdf 581 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 582 PL_colors[0],
7b0972df 583 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 584 SvPVX_const(check),
2c2d71f5
JH
585 PL_colors[1], (SvTAIL(check) ? "$" : ""),
586 (s ? " at offset " : "...\n") ) );
587
588 if (!s)
589 goto fail_finish;
590
6eb5f6b9
JH
591 check_at = s;
592
2c2d71f5 593 /* Finish the diagnostic message */
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
595
596 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
597 Start with the other substr.
598 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 599 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
600 *always* match. Probably should be marked during compile...
601 Probably it is right to do no SCREAM here...
602 */
603
33b8afdf 604 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 605 /* Take into account the "other" substring. */
2c2d71f5
JH
606 /* XXXX May be hopelessly wrong for UTF... */
607 if (!other_last)
6eb5f6b9 608 other_last = strpos;
33b8afdf 609 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
610 do_other_anchored:
611 {
890ce7af
AL
612 char * const last = HOP3c(s, -start_shift, strbeg);
613 char *last1, *last2;
2c2d71f5 614 char *s1 = s;
33b8afdf 615 SV* must;
2c2d71f5 616
2c2d71f5
JH
617 t = s - prog->check_offset_max;
618 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 619 && (!do_utf8
1aa99e6b 620 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 621 && t > strpos)))
30944b6d 622 /* EMPTY */;
2c2d71f5
JH
623 else
624 t = strpos;
1aa99e6b 625 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
626 if (t < other_last) /* These positions already checked */
627 t = other_last;
1aa99e6b 628 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
629 if (last < last1)
630 last1 = last;
631 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
632 /* On end-of-str: see comment below. */
33b8afdf
JH
633 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
634 if (must == &PL_sv_undef) {
635 s = (char*)NULL;
a3621e74 636 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
637 }
638 else
639 s = fbm_instr(
640 (unsigned char*)t,
641 HOP3(HOP3(last1, prog->anchored_offset, strend)
642 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
643 must,
7fba1cd6 644 multiline ? FBMrf_MULTILINE : 0
33b8afdf 645 );
a3621e74 646 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 647 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
648 (s ? "Found" : "Contradicts"),
649 PL_colors[0],
33b8afdf
JH
650 (int)(SvCUR(must)
651 - (SvTAIL(must)!=0)),
3f7c398e 652 SvPVX_const(must),
33b8afdf 653 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
654 if (!s) {
655 if (last1 >= last2) {
a3621e74 656 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
657 ", giving up...\n"));
658 goto fail_finish;
659 }
a3621e74 660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 661 ", trying floating at offset %ld...\n",
1aa99e6b
IH
662 (long)(HOP3c(s1, 1, strend) - i_strpos)));
663 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
664 s = HOP3c(last, 1, strend);
2c2d71f5
JH
665 goto restart;
666 }
667 else {
a3621e74 668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 669 (long)(s - i_strpos)));
1aa99e6b
IH
670 t = HOP3c(s, -prog->anchored_offset, strbeg);
671 other_last = HOP3c(s, 1, strend);
30944b6d 672 s = s1;
2c2d71f5
JH
673 if (t == strpos)
674 goto try_at_start;
2c2d71f5
JH
675 goto try_at_offset;
676 }
30944b6d 677 }
2c2d71f5
JH
678 }
679 else { /* Take into account the floating substring. */
33b8afdf
JH
680 char *last, *last1;
681 char *s1 = s;
682 SV* must;
683
684 t = HOP3c(s, -start_shift, strbeg);
685 last1 = last =
686 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
687 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
688 last = HOP3c(t, prog->float_max_offset, strend);
689 s = HOP3c(t, prog->float_min_offset, strend);
690 if (s < other_last)
691 s = other_last;
2c2d71f5 692 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
693 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
694 /* fbm_instr() takes into account exact value of end-of-str
695 if the check is SvTAIL(ed). Since false positives are OK,
696 and end-of-str is not later than strend we are OK. */
697 if (must == &PL_sv_undef) {
698 s = (char*)NULL;
a3621e74 699 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
700 }
701 else
2c2d71f5 702 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
703 (unsigned char*)last + SvCUR(must)
704 - (SvTAIL(must)!=0),
7fba1cd6 705 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
707 (s ? "Found" : "Contradicts"),
708 PL_colors[0],
709 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 710 SvPVX_const(must),
33b8afdf
JH
711 PL_colors[1], (SvTAIL(must) ? "$" : "")));
712 if (!s) {
713 if (last1 == last) {
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
715 ", giving up...\n"));
716 goto fail_finish;
2c2d71f5 717 }
a3621e74 718 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
719 ", trying anchored starting at offset %ld...\n",
720 (long)(s1 + 1 - i_strpos)));
721 other_last = last;
722 s = HOP3c(t, 1, strend);
723 goto restart;
724 }
725 else {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
727 (long)(s - i_strpos)));
728 other_last = s; /* Fix this later. --Hugo */
729 s = s1;
730 if (t == strpos)
731 goto try_at_start;
732 goto try_at_offset;
733 }
2c2d71f5 734 }
cad2e5aa 735 }
2c2d71f5
JH
736
737 t = s - prog->check_offset_max;
2c2d71f5 738 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 739 && (!do_utf8
1aa99e6b
IH
740 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
741 && t > strpos))) {
2c2d71f5
JH
742 /* Fixed substring is found far enough so that the match
743 cannot start at strpos. */
744 try_at_offset:
cad2e5aa 745 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
746 /* Eventually fbm_*() should handle this, but often
747 anchored_offset is not 0, so this check will not be wasted. */
748 /* XXXX In the code below we prefer to look for "^" even in
749 presence of anchored substrings. And we search even
750 beyond the found float position. These pessimizations
751 are historical artefacts only. */
752 find_anchor:
2c2d71f5 753 while (t < strend - prog->minlen) {
cad2e5aa 754 if (*t == '\n') {
4ee3650e 755 if (t < check_at - prog->check_offset_min) {
33b8afdf 756 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
757 /* Since we moved from the found position,
758 we definitely contradict the found anchored
30944b6d
IZ
759 substr. Due to the above check we do not
760 contradict "check" substr.
761 Thus we can arrive here only if check substr
762 is float. Redo checking for "other"=="fixed".
763 */
9041c2e3 764 strpos = t + 1;
a3621e74 765 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 766 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
767 goto do_other_anchored;
768 }
4ee3650e
GS
769 /* We don't contradict the found floating substring. */
770 /* XXXX Why not check for STCLASS? */
cad2e5aa 771 s = t + 1;
a3621e74 772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 773 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
774 goto set_useful;
775 }
4ee3650e
GS
776 /* Position contradicts check-string */
777 /* XXXX probably better to look for check-string
778 than for "\n", so one should lower the limit for t? */
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 780 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 781 other_last = strpos = s = t + 1;
cad2e5aa
JH
782 goto restart;
783 }
784 t++;
785 }
a3621e74 786 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 787 PL_colors[0], PL_colors[1]));
2c2d71f5 788 goto fail_finish;
cad2e5aa 789 }
f5952150 790 else {
a3621e74 791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 792 PL_colors[0], PL_colors[1]));
f5952150 793 }
cad2e5aa
JH
794 s = t;
795 set_useful:
33b8afdf 796 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
797 }
798 else {
f5952150 799 /* The found string does not prohibit matching at strpos,
2c2d71f5 800 - no optimization of calling REx engine can be performed,
f5952150
GS
801 unless it was an MBOL and we are not after MBOL,
802 or a future STCLASS check will fail this. */
2c2d71f5
JH
803 try_at_start:
804 /* Even in this situation we may use MBOL flag if strpos is offset
805 wrt the start of the string. */
05b4157f 806 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 807 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
808 /* May be due to an implicit anchor of m{.*foo} */
809 && !(prog->reganch & ROPT_IMPLICIT))
810 {
cad2e5aa
JH
811 t = strpos;
812 goto find_anchor;
813 }
a3621e74 814 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 815 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 816 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 817 );
2c2d71f5 818 success_at_start:
30944b6d 819 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
820 && (do_utf8 ? (
821 prog->check_utf8 /* Could be deleted already */
822 && --BmUSEFUL(prog->check_utf8) < 0
823 && (prog->check_utf8 == prog->float_utf8)
824 ) : (
825 prog->check_substr /* Could be deleted already */
826 && --BmUSEFUL(prog->check_substr) < 0
827 && (prog->check_substr == prog->float_substr)
828 )))
66e933ab 829 {
cad2e5aa 830 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
832 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
833 if (do_utf8 ? prog->check_substr : prog->check_utf8)
834 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
835 prog->check_substr = prog->check_utf8 = NULL; /* disable */
836 prog->float_substr = prog->float_utf8 = NULL; /* clear */
837 check = NULL; /* abort */
cad2e5aa 838 s = strpos;
3cf5c195
IZ
839 /* XXXX This is a remnant of the old implementation. It
840 looks wasteful, since now INTUIT can use many
6eb5f6b9 841 other heuristics. */
cad2e5aa
JH
842 prog->reganch &= ~RE_USE_INTUIT;
843 }
844 else
845 s = strpos;
846 }
847
6eb5f6b9
JH
848 /* Last resort... */
849 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
850 if (prog->regstclass) {
851 /* minlen == 0 is possible if regstclass is \b or \B,
852 and the fixed substr is ''$.
853 Since minlen is already taken into account, s+1 is before strend;
854 accidentally, minlen >= 1 guaranties no false positives at s + 1
855 even for \b or \B. But (minlen? 1 : 0) below assumes that
856 regstclass does not come from lookahead... */
857 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
858 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 859 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 860 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 861 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 862 : 1);
a3b680e6 863 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 864 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 865 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
866 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
867 cl_l, strend)
868 : strend);
6eb5f6b9
JH
869
870 t = s;
9926ca43 871 cache_re(prog);
06b5626a 872 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
873 if (!s) {
874#ifdef DEBUGGING
cbbf8932 875 const char *what = NULL;
6eb5f6b9
JH
876#endif
877 if (endpos == strend) {
a3621e74 878 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
879 "Could not match STCLASS...\n") );
880 goto fail;
881 }
a3621e74 882 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 883 "This position contradicts STCLASS...\n") );
653099ff
GS
884 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
885 goto fail;
6eb5f6b9 886 /* Contradict one of substrings */
33b8afdf
JH
887 if (prog->anchored_substr || prog->anchored_utf8) {
888 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 889 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 890 hop_and_restart:
1aa99e6b 891 s = HOP3c(t, 1, strend);
66e933ab
GS
892 if (s + start_shift + end_shift > strend) {
893 /* XXXX Should be taken into account earlier? */
a3621e74 894 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
895 "Could not match STCLASS...\n") );
896 goto fail;
897 }
5e39e1e5
HS
898 if (!check)
899 goto giveup;
a3621e74 900 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 901 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
902 what, (long)(s + start_shift - i_strpos)) );
903 goto restart;
904 }
66e933ab 905 /* Have both, check_string is floating */
6eb5f6b9
JH
906 if (t + start_shift >= check_at) /* Contradicts floating=check */
907 goto retry_floating_check;
908 /* Recheck anchored substring, but not floating... */
9041c2e3 909 s = check_at;
5e39e1e5
HS
910 if (!check)
911 goto giveup;
a3621e74 912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 913 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
914 (long)(other_last - i_strpos)) );
915 goto do_other_anchored;
916 }
60e71179
GS
917 /* Another way we could have checked stclass at the
918 current position only: */
919 if (ml_anch) {
920 s = t = t + 1;
5e39e1e5
HS
921 if (!check)
922 goto giveup;
a3621e74 923 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 924 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 925 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 926 goto try_at_offset;
66e933ab 927 }
33b8afdf 928 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 929 goto fail;
6eb5f6b9
JH
930 /* Check is floating subtring. */
931 retry_floating_check:
932 t = check_at - start_shift;
a3621e74 933 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
934 goto hop_and_restart;
935 }
b7953727 936 if (t != s) {
a3621e74 937 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 938 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
939 (long)(t - i_strpos), (long)(s - i_strpos))
940 );
941 }
942 else {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
944 "Does not contradict STCLASS...\n");
945 );
946 }
6eb5f6b9 947 }
5e39e1e5 948 giveup:
a3621e74 949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
950 PL_colors[4], (check ? "Guessed" : "Giving up"),
951 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 952 return s;
2c2d71f5
JH
953
954 fail_finish: /* Substring not found */
33b8afdf
JH
955 if (prog->check_substr || prog->check_utf8) /* could be removed already */
956 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 957 fail:
a3621e74 958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 959 PL_colors[4], PL_colors[5]));
bd61b366 960 return NULL;
cad2e5aa 961}
9661b544 962
6eb5f6b9 963/* We know what class REx starts with. Try to find this position... */
3c3eec57 964STATIC char *
a3b680e6 965S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 966{
27da23d5 967 dVAR;
1df70142 968 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 969 char *m;
d8093b23 970 STRLEN ln;
5dab1207 971 STRLEN lnc;
078c425b 972 register STRLEN uskip;
d8093b23 973 unsigned int c1;
974 unsigned int c2;
6eb5f6b9
JH
975 char *e;
976 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 977 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 978
6eb5f6b9
JH
979 /* We know what class it must start with. */
980 switch (OP(c)) {
6eb5f6b9 981 case ANYOF:
388cc4de 982 if (do_utf8) {
078c425b 983 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
984 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
985 !UTF8_IS_INVARIANT((U8)s[0]) ?
986 reginclass(c, (U8*)s, 0, do_utf8) :
987 REGINCLASS(c, (U8*)s)) {
988 if (tmp && (norun || regtry(prog, s)))
989 goto got_it;
990 else
991 tmp = doevery;
992 }
993 else
994 tmp = 1;
078c425b 995 s += uskip;
388cc4de
HS
996 }
997 }
998 else {
999 while (s < strend) {
1000 STRLEN skip = 1;
1001
1002 if (REGINCLASS(c, (U8*)s) ||
1003 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1004 /* The assignment of 2 is intentional:
1005 * for the folded sharp s, the skip is 2. */
1006 (skip = SHARP_S_SKIP))) {
1007 if (tmp && (norun || regtry(prog, s)))
1008 goto got_it;
1009 else
1010 tmp = doevery;
1011 }
1012 else
1013 tmp = 1;
1014 s += skip;
1015 }
a0d0e21e 1016 }
6eb5f6b9 1017 break;
f33976b4
DB
1018 case CANY:
1019 while (s < strend) {
1020 if (tmp && (norun || regtry(prog, s)))
1021 goto got_it;
1022 else
1023 tmp = doevery;
1024 s++;
1025 }
1026 break;
6eb5f6b9 1027 case EXACTF:
5dab1207
NIS
1028 m = STRING(c);
1029 ln = STR_LEN(c); /* length to match in octets/bytes */
1030 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1031 if (UTF) {
a2a2844f 1032 STRLEN ulen1, ulen2;
5dab1207 1033 U8 *sm = (U8 *) m;
89ebb4a3
JH
1034 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1035 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
041457d9 1036 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a2a2844f
JH
1037
1038 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1039 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1040
89ebb4a3 1041 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1042 0, uniflags);
89ebb4a3 1043 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1044 0, uniflags);
5dab1207
NIS
1045 lnc = 0;
1046 while (sm < ((U8 *) m + ln)) {
1047 lnc++;
1048 sm += UTF8SKIP(sm);
1049 }
1aa99e6b
IH
1050 }
1051 else {
1052 c1 = *(U8*)m;
1053 c2 = PL_fold[c1];
1054 }
6eb5f6b9
JH
1055 goto do_exactf;
1056 case EXACTFL:
5dab1207
NIS
1057 m = STRING(c);
1058 ln = STR_LEN(c);
1059 lnc = (I32) ln;
d8093b23 1060 c1 = *(U8*)m;
6eb5f6b9
JH
1061 c2 = PL_fold_locale[c1];
1062 do_exactf:
db12adc6 1063 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1064
6eb5f6b9
JH
1065 if (norun && e < s)
1066 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1067
60a8b682
JH
1068 /* The idea in the EXACTF* cases is to first find the
1069 * first character of the EXACTF* node and then, if
1070 * necessary, case-insensitively compare the full
1071 * text of the node. The c1 and c2 are the first
1072 * characters (though in Unicode it gets a bit
1073 * more complicated because there are more cases
7f16dd3d
JH
1074 * than just upper and lower: one needs to use
1075 * the so-called folding case for case-insensitive
1076 * matching (called "loose matching" in Unicode).
1077 * ibcmp_utf8() will do just that. */
60a8b682 1078
1aa99e6b 1079 if (do_utf8) {
575cac57 1080 UV c, f;
89ebb4a3 1081 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1082 STRLEN len, foldlen;
041457d9 1083 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
09091399 1084 if (c1 == c2) {
5dab1207
NIS
1085 /* Upper and lower of 1st char are equal -
1086 * probably not a "letter". */
1aa99e6b 1087 while (s <= e) {
89ebb4a3 1088 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1089 uniflags);
80aecb99
JH
1090 if ( c == c1
1091 && (ln == len ||
66423254 1092 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1093 m, (char **)0, ln, (bool)UTF))
55da9344 1094 && (norun || regtry(prog, s)) )
1aa99e6b 1095 goto got_it;
80aecb99 1096 else {
1df70142 1097 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1098 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1099 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1100 if ( f != c
1101 && (f == c1 || f == c2)
1102 && (ln == foldlen ||
66423254
JH
1103 !ibcmp_utf8((char *) foldbuf,
1104 (char **)0, foldlen, do_utf8,
d07ddd77 1105 m,
eb160463 1106 (char **)0, ln, (bool)UTF))
80aecb99
JH
1107 && (norun || regtry(prog, s)) )
1108 goto got_it;
1109 }
1aa99e6b
IH
1110 s += len;
1111 }
09091399
JH
1112 }
1113 else {
1aa99e6b 1114 while (s <= e) {
89ebb4a3 1115 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1116 uniflags);
80aecb99 1117
60a8b682 1118 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1119 * Note that not all the possible combinations
1120 * are handled here: some of them are handled
1121 * by the standard folding rules, and some of
1122 * them (the character class or ANYOF cases)
1123 * are handled during compiletime in
1124 * regexec.c:S_regclass(). */
880bd946
JH
1125 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1126 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1127 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1128
1129 if ( (c == c1 || c == c2)
1130 && (ln == len ||
66423254 1131 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1132 m, (char **)0, ln, (bool)UTF))
55da9344 1133 && (norun || regtry(prog, s)) )
1aa99e6b 1134 goto got_it;
80aecb99 1135 else {
1df70142 1136 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1137 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1138 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1139 if ( f != c
1140 && (f == c1 || f == c2)
1141 && (ln == foldlen ||
a6872d42 1142 !ibcmp_utf8((char *) foldbuf,
66423254 1143 (char **)0, foldlen, do_utf8,
d07ddd77 1144 m,
eb160463 1145 (char **)0, ln, (bool)UTF))
80aecb99
JH
1146 && (norun || regtry(prog, s)) )
1147 goto got_it;
1148 }
1aa99e6b
IH
1149 s += len;
1150 }
09091399 1151 }
1aa99e6b
IH
1152 }
1153 else {
1154 if (c1 == c2)
1155 while (s <= e) {
1156 if ( *(U8*)s == c1
1157 && (ln == 1 || !(OP(c) == EXACTF
1158 ? ibcmp(s, m, ln)
1159 : ibcmp_locale(s, m, ln)))
1160 && (norun || regtry(prog, s)) )
1161 goto got_it;
1162 s++;
1163 }
1164 else
1165 while (s <= e) {
1166 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1167 && (ln == 1 || !(OP(c) == EXACTF
1168 ? ibcmp(s, m, ln)
1169 : ibcmp_locale(s, m, ln)))
1170 && (norun || regtry(prog, s)) )
1171 goto got_it;
1172 s++;
1173 }
b3c9acc1
IZ
1174 }
1175 break;
bbce6d69 1176 case BOUNDL:
3280af22 1177 PL_reg_flags |= RF_tainted;
bbce6d69 1178 /* FALL THROUGH */
a0d0e21e 1179 case BOUND:
ffc61ed2 1180 if (do_utf8) {
12d33761 1181 if (s == PL_bostr)
ffc61ed2
JH
1182 tmp = '\n';
1183 else {
6136c704 1184 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1185 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1186 }
1187 tmp = ((OP(c) == BOUND ?
9041c2e3 1188 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1189 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1190 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1191 if (tmp == !(OP(c) == BOUND ?
3568d838 1192 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1193 isALNUM_LC_utf8((U8*)s)))
1194 {
1195 tmp = !tmp;
1196 if ((norun || regtry(prog, s)))
1197 goto got_it;
1198 }
078c425b 1199 s += uskip;
a687059c 1200 }
a0d0e21e 1201 }
667bb95a 1202 else {
12d33761 1203 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1204 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1205 while (s < strend) {
1206 if (tmp ==
1207 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1208 tmp = !tmp;
1209 if ((norun || regtry(prog, s)))
1210 goto got_it;
1211 }
1212 s++;
a0ed51b3 1213 }
a0ed51b3 1214 }
6eb5f6b9 1215 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1216 goto got_it;
1217 break;
bbce6d69 1218 case NBOUNDL:
3280af22 1219 PL_reg_flags |= RF_tainted;
bbce6d69 1220 /* FALL THROUGH */
a0d0e21e 1221 case NBOUND:
ffc61ed2 1222 if (do_utf8) {
12d33761 1223 if (s == PL_bostr)
ffc61ed2
JH
1224 tmp = '\n';
1225 else {
6136c704 1226 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1227 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1228 }
1229 tmp = ((OP(c) == NBOUND ?
9041c2e3 1230 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1231 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1232 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1233 if (tmp == !(OP(c) == NBOUND ?
3568d838 1234 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1235 isALNUM_LC_utf8((U8*)s)))
1236 tmp = !tmp;
1237 else if ((norun || regtry(prog, s)))
1238 goto got_it;
078c425b 1239 s += uskip;
ffc61ed2 1240 }
a0d0e21e 1241 }
667bb95a 1242 else {
12d33761 1243 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1244 tmp = ((OP(c) == NBOUND ?
1245 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1246 while (s < strend) {
1247 if (tmp ==
1248 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1249 tmp = !tmp;
1250 else if ((norun || regtry(prog, s)))
1251 goto got_it;
1252 s++;
1253 }
a0ed51b3 1254 }
6eb5f6b9 1255 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1256 goto got_it;
1257 break;
a0d0e21e 1258 case ALNUM:
ffc61ed2 1259 if (do_utf8) {
1a4fad37 1260 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1261 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1262 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1263 if (tmp && (norun || regtry(prog, s)))
1264 goto got_it;
1265 else
1266 tmp = doevery;
1267 }
bbce6d69 1268 else
ffc61ed2 1269 tmp = 1;
078c425b 1270 s += uskip;
bbce6d69 1271 }
bbce6d69 1272 }
ffc61ed2
JH
1273 else {
1274 while (s < strend) {
1275 if (isALNUM(*s)) {
1276 if (tmp && (norun || regtry(prog, s)))
1277 goto got_it;
1278 else
1279 tmp = doevery;
1280 }
a0ed51b3 1281 else
ffc61ed2
JH
1282 tmp = 1;
1283 s++;
a0ed51b3 1284 }
a0ed51b3
LW
1285 }
1286 break;
bbce6d69 1287 case ALNUML:
3280af22 1288 PL_reg_flags |= RF_tainted;
ffc61ed2 1289 if (do_utf8) {
078c425b 1290 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1291 if (isALNUM_LC_utf8((U8*)s)) {
1292 if (tmp && (norun || regtry(prog, s)))
1293 goto got_it;
1294 else
1295 tmp = doevery;
1296 }
a687059c 1297 else
ffc61ed2 1298 tmp = 1;
078c425b 1299 s += uskip;
a0d0e21e 1300 }
a0d0e21e 1301 }
ffc61ed2
JH
1302 else {
1303 while (s < strend) {
1304 if (isALNUM_LC(*s)) {
1305 if (tmp && (norun || regtry(prog, s)))
1306 goto got_it;
1307 else
1308 tmp = doevery;
1309 }
a0ed51b3 1310 else
ffc61ed2
JH
1311 tmp = 1;
1312 s++;
a0ed51b3 1313 }
a0ed51b3
LW
1314 }
1315 break;
a0d0e21e 1316 case NALNUM:
ffc61ed2 1317 if (do_utf8) {
1a4fad37 1318 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1319 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1320 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1321 if (tmp && (norun || regtry(prog, s)))
1322 goto got_it;
1323 else
1324 tmp = doevery;
1325 }
bbce6d69 1326 else
ffc61ed2 1327 tmp = 1;
078c425b 1328 s += uskip;
bbce6d69 1329 }
bbce6d69 1330 }
ffc61ed2
JH
1331 else {
1332 while (s < strend) {
1333 if (!isALNUM(*s)) {
1334 if (tmp && (norun || regtry(prog, s)))
1335 goto got_it;
1336 else
1337 tmp = doevery;
1338 }
a0ed51b3 1339 else
ffc61ed2
JH
1340 tmp = 1;
1341 s++;
a0ed51b3 1342 }
a0ed51b3
LW
1343 }
1344 break;
bbce6d69 1345 case NALNUML:
3280af22 1346 PL_reg_flags |= RF_tainted;
ffc61ed2 1347 if (do_utf8) {
078c425b 1348 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1349 if (!isALNUM_LC_utf8((U8*)s)) {
1350 if (tmp && (norun || regtry(prog, s)))
1351 goto got_it;
1352 else
1353 tmp = doevery;
1354 }
a687059c 1355 else
ffc61ed2 1356 tmp = 1;
078c425b 1357 s += uskip;
a687059c 1358 }
a0d0e21e 1359 }
ffc61ed2
JH
1360 else {
1361 while (s < strend) {
1362 if (!isALNUM_LC(*s)) {
1363 if (tmp && (norun || regtry(prog, s)))
1364 goto got_it;
1365 else
1366 tmp = doevery;
1367 }
a0ed51b3 1368 else
ffc61ed2
JH
1369 tmp = 1;
1370 s++;
a0ed51b3 1371 }
a0ed51b3
LW
1372 }
1373 break;
a0d0e21e 1374 case SPACE:
ffc61ed2 1375 if (do_utf8) {
1a4fad37 1376 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1377 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1378 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1379 if (tmp && (norun || regtry(prog, s)))
1380 goto got_it;
1381 else
1382 tmp = doevery;
1383 }
a0d0e21e 1384 else
ffc61ed2 1385 tmp = 1;
078c425b 1386 s += uskip;
2304df62 1387 }
a0d0e21e 1388 }
ffc61ed2
JH
1389 else {
1390 while (s < strend) {
1391 if (isSPACE(*s)) {
1392 if (tmp && (norun || regtry(prog, s)))
1393 goto got_it;
1394 else
1395 tmp = doevery;
1396 }
a0ed51b3 1397 else
ffc61ed2
JH
1398 tmp = 1;
1399 s++;
a0ed51b3 1400 }
a0ed51b3
LW
1401 }
1402 break;
bbce6d69 1403 case SPACEL:
3280af22 1404 PL_reg_flags |= RF_tainted;
ffc61ed2 1405 if (do_utf8) {
078c425b 1406 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1407 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1408 if (tmp && (norun || regtry(prog, s)))
1409 goto got_it;
1410 else
1411 tmp = doevery;
1412 }
bbce6d69 1413 else
ffc61ed2 1414 tmp = 1;
078c425b 1415 s += uskip;
bbce6d69 1416 }
bbce6d69 1417 }
ffc61ed2
JH
1418 else {
1419 while (s < strend) {
1420 if (isSPACE_LC(*s)) {
1421 if (tmp && (norun || regtry(prog, s)))
1422 goto got_it;
1423 else
1424 tmp = doevery;
1425 }
a0ed51b3 1426 else
ffc61ed2
JH
1427 tmp = 1;
1428 s++;
a0ed51b3 1429 }
a0ed51b3
LW
1430 }
1431 break;
a0d0e21e 1432 case NSPACE:
ffc61ed2 1433 if (do_utf8) {
1a4fad37 1434 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1435 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1436 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1437 if (tmp && (norun || regtry(prog, s)))
1438 goto got_it;
1439 else
1440 tmp = doevery;
1441 }
a0d0e21e 1442 else
ffc61ed2 1443 tmp = 1;
078c425b 1444 s += uskip;
a687059c 1445 }
a0d0e21e 1446 }
ffc61ed2
JH
1447 else {
1448 while (s < strend) {
1449 if (!isSPACE(*s)) {
1450 if (tmp && (norun || regtry(prog, s)))
1451 goto got_it;
1452 else
1453 tmp = doevery;
1454 }
a0ed51b3 1455 else
ffc61ed2
JH
1456 tmp = 1;
1457 s++;
a0ed51b3 1458 }
a0ed51b3
LW
1459 }
1460 break;
bbce6d69 1461 case NSPACEL:
3280af22 1462 PL_reg_flags |= RF_tainted;
ffc61ed2 1463 if (do_utf8) {
078c425b 1464 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1465 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1466 if (tmp && (norun || regtry(prog, s)))
1467 goto got_it;
1468 else
1469 tmp = doevery;
1470 }
bbce6d69 1471 else
ffc61ed2 1472 tmp = 1;
078c425b 1473 s += uskip;
bbce6d69 1474 }
bbce6d69 1475 }
ffc61ed2
JH
1476 else {
1477 while (s < strend) {
1478 if (!isSPACE_LC(*s)) {
1479 if (tmp && (norun || regtry(prog, s)))
1480 goto got_it;
1481 else
1482 tmp = doevery;
1483 }
a0ed51b3 1484 else
ffc61ed2
JH
1485 tmp = 1;
1486 s++;
a0ed51b3 1487 }
a0ed51b3
LW
1488 }
1489 break;
a0d0e21e 1490 case DIGIT:
ffc61ed2 1491 if (do_utf8) {
1a4fad37 1492 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1493 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1494 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1495 if (tmp && (norun || regtry(prog, s)))
1496 goto got_it;
1497 else
1498 tmp = doevery;
1499 }
a0d0e21e 1500 else
ffc61ed2 1501 tmp = 1;
078c425b 1502 s += uskip;
2b69d0c2 1503 }
a0d0e21e 1504 }
ffc61ed2
JH
1505 else {
1506 while (s < strend) {
1507 if (isDIGIT(*s)) {
1508 if (tmp && (norun || regtry(prog, s)))
1509 goto got_it;
1510 else
1511 tmp = doevery;
1512 }
a0ed51b3 1513 else
ffc61ed2
JH
1514 tmp = 1;
1515 s++;
a0ed51b3 1516 }
a0ed51b3
LW
1517 }
1518 break;
b8c5462f
JH
1519 case DIGITL:
1520 PL_reg_flags |= RF_tainted;
ffc61ed2 1521 if (do_utf8) {
078c425b 1522 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1523 if (isDIGIT_LC_utf8((U8*)s)) {
1524 if (tmp && (norun || regtry(prog, s)))
1525 goto got_it;
1526 else
1527 tmp = doevery;
1528 }
b8c5462f 1529 else
ffc61ed2 1530 tmp = 1;
078c425b 1531 s += uskip;
b8c5462f 1532 }
b8c5462f 1533 }
ffc61ed2
JH
1534 else {
1535 while (s < strend) {
1536 if (isDIGIT_LC(*s)) {
1537 if (tmp && (norun || regtry(prog, s)))
1538 goto got_it;
1539 else
1540 tmp = doevery;
1541 }
b8c5462f 1542 else
ffc61ed2
JH
1543 tmp = 1;
1544 s++;
b8c5462f 1545 }
b8c5462f
JH
1546 }
1547 break;
a0d0e21e 1548 case NDIGIT:
ffc61ed2 1549 if (do_utf8) {
1a4fad37 1550 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1551 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1552 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1553 if (tmp && (norun || regtry(prog, s)))
1554 goto got_it;
1555 else
1556 tmp = doevery;
1557 }
a0d0e21e 1558 else
ffc61ed2 1559 tmp = 1;
078c425b 1560 s += uskip;
a687059c 1561 }
a0d0e21e 1562 }
ffc61ed2
JH
1563 else {
1564 while (s < strend) {
1565 if (!isDIGIT(*s)) {
1566 if (tmp && (norun || regtry(prog, s)))
1567 goto got_it;
1568 else
1569 tmp = doevery;
1570 }
a0ed51b3 1571 else
ffc61ed2
JH
1572 tmp = 1;
1573 s++;
a0ed51b3 1574 }
a0ed51b3
LW
1575 }
1576 break;
b8c5462f
JH
1577 case NDIGITL:
1578 PL_reg_flags |= RF_tainted;
ffc61ed2 1579 if (do_utf8) {
078c425b 1580 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1581 if (!isDIGIT_LC_utf8((U8*)s)) {
1582 if (tmp && (norun || regtry(prog, s)))
1583 goto got_it;
1584 else
1585 tmp = doevery;
1586 }
b8c5462f 1587 else
ffc61ed2 1588 tmp = 1;
078c425b 1589 s += uskip;
b8c5462f 1590 }
a0ed51b3 1591 }
ffc61ed2
JH
1592 else {
1593 while (s < strend) {
1594 if (!isDIGIT_LC(*s)) {
1595 if (tmp && (norun || regtry(prog, s)))
1596 goto got_it;
1597 else
1598 tmp = doevery;
1599 }
cf93c79d 1600 else
ffc61ed2
JH
1601 tmp = 1;
1602 s++;
b8c5462f 1603 }
b8c5462f
JH
1604 }
1605 break;
b3c9acc1 1606 default:
3c3eec57
GS
1607 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1608 break;
d6a28714 1609 }
6eb5f6b9
JH
1610 return 0;
1611 got_it:
1612 return s;
1613}
1614
1615/*
1616 - regexec_flags - match a regexp against a string
1617 */
1618I32
1619Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1620 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1621/* strend: pointer to null at end of string */
1622/* strbeg: real beginning of string */
1623/* minend: end of match must be >=minend after stringarg. */
1624/* data: May be used for some additional optimizations. */
1625/* nosave: For optimizations. */
1626{
97aff369 1627 dVAR;
6eb5f6b9
JH
1628 register char *s;
1629 register regnode *c;
1630 register char *startpos = stringarg;
6eb5f6b9
JH
1631 I32 minlen; /* must match at least this many chars */
1632 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1633 I32 end_shift = 0; /* Same for the end. */ /* CC */
1634 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1635 char *scream_olds = NULL;
6eb5f6b9 1636 SV* oreplsv = GvSV(PL_replgv);
1df70142 1637 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1638 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1639#ifdef DEBUGGING
6136c704
AL
1640 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1641 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1642#endif
a3621e74
YO
1643
1644 GET_RE_DEBUG_FLAGS_DECL;
1645
9d4ba2ae 1646 PERL_UNUSED_ARG(data);
a30b2f1f 1647 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1648
1649 PL_regcc = 0;
1650
1651 cache_re(prog);
1652#ifdef DEBUGGING
aea4f609 1653 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1654#endif
1655
1656 /* Be paranoid... */
1657 if (prog == NULL || startpos == NULL) {
1658 Perl_croak(aTHX_ "NULL regexp parameter");
1659 return 0;
1660 }
1661
1662 minlen = prog->minlen;
61a36c01 1663 if (strend - startpos < minlen) {
a3621e74 1664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1665 "String too short [regexec_flags]...\n"));
1666 goto phooey;
1aa99e6b 1667 }
6eb5f6b9 1668
6eb5f6b9
JH
1669 /* Check validity of program. */
1670 if (UCHARAT(prog->program) != REG_MAGIC) {
1671 Perl_croak(aTHX_ "corrupted regexp program");
1672 }
1673
1674 PL_reg_flags = 0;
1675 PL_reg_eval_set = 0;
1676 PL_reg_maxiter = 0;
1677
1678 if (prog->reganch & ROPT_UTF8)
1679 PL_reg_flags |= RF_utf8;
1680
1681 /* Mark beginning of line for ^ and lookbehind. */
1682 PL_regbol = startpos;
1683 PL_bostr = strbeg;
1684 PL_reg_sv = sv;
1685
1686 /* Mark end of line for $ (and such) */
1687 PL_regeol = strend;
1688
1689 /* see how far we have to get to not match where we matched before */
1690 PL_regtill = startpos+minend;
1691
1692 /* We start without call_cc context. */
1693 PL_reg_call_cc = 0;
1694
1695 /* If there is a "must appear" string, look for it. */
1696 s = startpos;
1697
1698 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1699 MAGIC *mg;
1700
1701 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1702 PL_reg_ganch = startpos;
1703 else if (sv && SvTYPE(sv) >= SVt_PVMG
1704 && SvMAGIC(sv)
14befaf4
DM
1705 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1706 && mg->mg_len >= 0) {
6eb5f6b9
JH
1707 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1708 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1709 if (s > PL_reg_ganch)
6eb5f6b9
JH
1710 goto phooey;
1711 s = PL_reg_ganch;
1712 }
1713 }
1714 else /* pos() not defined */
1715 PL_reg_ganch = strbeg;
1716 }
1717
a0714e2c 1718 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1719 re_scream_pos_data d;
1720
1721 d.scream_olds = &scream_olds;
1722 d.scream_pos = &scream_pos;
1723 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1724 if (!s) {
a3621e74 1725 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1726 goto phooey; /* not present */
3fa9c3d7 1727 }
6eb5f6b9
JH
1728 }
1729
a3621e74 1730 DEBUG_EXECUTE_r({
1df70142
AL
1731 const char * const s0 = UTF
1732 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1733 UNI_DISPLAY_REGEX)
1734 : prog->precomp;
1735 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1736 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1737 UNI_DISPLAY_REGEX) : startpos;
1df70142 1738 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1739 if (!PL_colorset)
1740 reginitcolors();
1741 PerlIO_printf(Perl_debug_log,
a0288114 1742 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1743 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1744 len0, len0, s0,
2a782b5b 1745 PL_colors[1],
9e55ce06 1746 len0 > 60 ? "..." : "",
2a782b5b 1747 PL_colors[0],
9e55ce06
JH
1748 (int)(len1 > 60 ? 60 : len1),
1749 s1, PL_colors[1],
1750 (len1 > 60 ? "..." : "")
2a782b5b
JH
1751 );
1752 });
6eb5f6b9
JH
1753
1754 /* Simplest case: anchored match need be tried only once. */
1755 /* [unless only anchor is BOL and multiline is set] */
1756 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1757 if (s == startpos && regtry(prog, startpos))
1758 goto got_it;
7fba1cd6 1759 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1760 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1761 {
1762 char *end;
1763
1764 if (minlen)
1765 dontbother = minlen - 1;
1aa99e6b 1766 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1767 /* for multiline we only have to try after newlines */
33b8afdf 1768 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1769 if (s == startpos)
1770 goto after_try;
1771 while (1) {
1772 if (regtry(prog, s))
1773 goto got_it;
1774 after_try:
1775 if (s >= end)
1776 goto phooey;
1777 if (prog->reganch & RE_USE_INTUIT) {
1778 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1779 if (!s)
1780 goto phooey;
1781 }
1782 else
1783 s++;
1784 }
1785 } else {
1786 if (s > startpos)
1787 s--;
1788 while (s < end) {
1789 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1790 if (regtry(prog, s))
1791 goto got_it;
1792 }
1793 }
1794 }
1795 }
1796 goto phooey;
1797 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1798 if (regtry(prog, PL_reg_ganch))
1799 goto got_it;
1800 goto phooey;
1801 }
1802
1803 /* Messy cases: unanchored match. */
33b8afdf 1804 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1805 /* we have /x+whatever/ */
1806 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1807 char ch;
bf93d4cc
GS
1808#ifdef DEBUGGING
1809 int did_match = 0;
1810#endif
33b8afdf
JH
1811 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1812 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1813 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1814
1aa99e6b 1815 if (do_utf8) {
6eb5f6b9
JH
1816 while (s < strend) {
1817 if (*s == ch) {
a3621e74 1818 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1819 if (regtry(prog, s)) goto got_it;
1820 s += UTF8SKIP(s);
1821 while (s < strend && *s == ch)
1822 s += UTF8SKIP(s);
1823 }
1824 s += UTF8SKIP(s);
1825 }
1826 }
1827 else {
1828 while (s < strend) {
1829 if (*s == ch) {
a3621e74 1830 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1831 if (regtry(prog, s)) goto got_it;
1832 s++;
1833 while (s < strend && *s == ch)
1834 s++;
1835 }
1836 s++;
1837 }
1838 }
a3621e74 1839 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1840 PerlIO_printf(Perl_debug_log,
b7953727
JH
1841 "Did not find anchored character...\n")
1842 );
6eb5f6b9 1843 }
a0714e2c
SS
1844 else if (prog->anchored_substr != NULL
1845 || prog->anchored_utf8 != NULL
1846 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1847 && prog->float_max_offset < strend - s)) {
1848 SV *must;
1849 I32 back_max;
1850 I32 back_min;
1851 char *last;
6eb5f6b9 1852 char *last1; /* Last position checked before */
bf93d4cc
GS
1853#ifdef DEBUGGING
1854 int did_match = 0;
1855#endif
33b8afdf
JH
1856 if (prog->anchored_substr || prog->anchored_utf8) {
1857 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1858 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1859 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1860 back_max = back_min = prog->anchored_offset;
1861 } else {
1862 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1863 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1864 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1865 back_max = prog->float_max_offset;
1866 back_min = prog->float_min_offset;
1867 }
1868 if (must == &PL_sv_undef)
1869 /* could not downgrade utf8 check substring, so must fail */
1870 goto phooey;
1871
1872 last = HOP3c(strend, /* Cannot start after this */
1873 -(I32)(CHR_SVLEN(must)
1874 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1875
1876 if (s > PL_bostr)
1877 last1 = HOPc(s, -1);
1878 else
1879 last1 = s - 1; /* bogus */
1880
a0288114 1881 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1882 check_substr==must. */
1883 scream_pos = -1;
1884 dontbother = end_shift;
1885 strend = HOPc(strend, -dontbother);
1886 while ( (s <= last) &&
9041c2e3 1887 ((flags & REXEC_SCREAM)
1aa99e6b 1888 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1889 end_shift, &scream_pos, 0))
1aa99e6b 1890 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1891 (unsigned char*)strend, must,
7fba1cd6 1892 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1893 /* we may be pointing at the wrong string */
1894 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1895 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1896 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1897 if (HOPc(s, -back_max) > last1) {
1898 last1 = HOPc(s, -back_min);
1899 s = HOPc(s, -back_max);
1900 }
1901 else {
1902 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1903
1904 last1 = HOPc(s, -back_min);
1905 s = t;
1906 }
1aa99e6b 1907 if (do_utf8) {
6eb5f6b9
JH
1908 while (s <= last1) {
1909 if (regtry(prog, s))
1910 goto got_it;
1911 s += UTF8SKIP(s);
1912 }
1913 }
1914 else {
1915 while (s <= last1) {
1916 if (regtry(prog, s))
1917 goto got_it;
1918 s++;
1919 }
1920 }
1921 }
a3621e74 1922 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1923 PerlIO_printf(Perl_debug_log,
a0288114 1924 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1925 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1926 ? "anchored" : "floating"),
1927 PL_colors[0],
1928 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1929 SvPVX_const(must),
b7953727
JH
1930 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1931 );
6eb5f6b9
JH
1932 goto phooey;
1933 }
155aba94 1934 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1935 if (minlen) {
1936 I32 op = (U8)OP(prog->regstclass);
66e933ab 1937 /* don't bother with what can't match */
f14c76ed
RGS
1938 if (PL_regkind[op] != EXACT && op != CANY)
1939 strend = HOPc(strend, -(minlen - 1));
1940 }
a3621e74 1941 DEBUG_EXECUTE_r({
ffc61ed2 1942 SV *prop = sv_newmortal();
cfd0369c
NC
1943 const char *s0;
1944 const char *s1;
9e55ce06
JH
1945 int len0;
1946 int len1;
1947
ffc61ed2 1948 regprop(prop, c);
9e55ce06 1949 s0 = UTF ?
3f7c398e 1950 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1951 UNI_DISPLAY_REGEX) :
cfd0369c 1952 SvPVX_const(prop);
9e55ce06
JH
1953 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1954 s1 = UTF ?
c728cb41 1955 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1956 len1 = UTF ? SvCUR(dsv1) : strend - s;
1957 PerlIO_printf(Perl_debug_log,
a0288114 1958 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1959 len0, len0, s0,
1960 len1, len1, s1);
ffc61ed2 1961 });
06b5626a 1962 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1963 goto got_it;
a3621e74 1964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1965 }
1966 else {
1967 dontbother = 0;
a0714e2c 1968 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1969 /* Trim the end. */
d6a28714 1970 char *last;
33b8afdf
JH
1971 SV* float_real;
1972
1973 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1974 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1975 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1976
1977 if (flags & REXEC_SCREAM) {
33b8afdf 1978 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1979 end_shift, &scream_pos, 1); /* last one */
1980 if (!last)
ffc61ed2 1981 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1982 /* we may be pointing at the wrong string */
1983 else if (RX_MATCH_COPIED(prog))
3f7c398e 1984 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1985 }
d6a28714
JH
1986 else {
1987 STRLEN len;
cfd0369c 1988 const char * const little = SvPV_const(float_real, len);
d6a28714 1989
33b8afdf 1990 if (SvTAIL(float_real)) {
d6a28714
JH
1991 if (memEQ(strend - len + 1, little, len - 1))
1992 last = strend - len + 1;
7fba1cd6 1993 else if (!multiline)
9041c2e3 1994 last = memEQ(strend - len, little, len)
bd61b366 1995 ? strend - len : NULL;
b8c5462f 1996 else
d6a28714
JH
1997 goto find_last;
1998 } else {
1999 find_last:
9041c2e3 2000 if (len)
d6a28714 2001 last = rninstr(s, strend, little, little + len);
b8c5462f 2002 else
a0288114 2003 last = strend; /* matching "$" */
b8c5462f 2004 }
b8c5462f 2005 }
bf93d4cc 2006 if (last == NULL) {
a3621e74 2007 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2008 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2009 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2010 goto phooey; /* Should not happen! */
2011 }
d6a28714
JH
2012 dontbother = strend - last + prog->float_min_offset;
2013 }
2014 if (minlen && (dontbother < minlen))
2015 dontbother = minlen - 1;
2016 strend -= dontbother; /* this one's always in bytes! */
2017 /* We don't know much -- general case. */
1aa99e6b 2018 if (do_utf8) {
d6a28714
JH
2019 for (;;) {
2020 if (regtry(prog, s))
2021 goto got_it;
2022 if (s >= strend)
2023 break;
b8c5462f 2024 s += UTF8SKIP(s);
d6a28714
JH
2025 };
2026 }
2027 else {
2028 do {
2029 if (regtry(prog, s))
2030 goto got_it;
2031 } while (s++ < strend);
2032 }
2033 }
2034
2035 /* Failure. */
2036 goto phooey;
2037
2038got_it:
2039 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2040
2041 if (PL_reg_eval_set) {
2042 /* Preserve the current value of $^R */
2043 if (oreplsv != GvSV(PL_replgv))
2044 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2045 restored, the value remains
2046 the same. */
acfe0abc 2047 restore_pos(aTHX_ 0);
d6a28714
JH
2048 }
2049
2050 /* make sure $`, $&, $', and $digit will work later */
2051 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2052 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2053 if (flags & REXEC_COPY_STR) {
2054 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2055#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2056 if ((SvIsCOW(sv)
2057 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2058 if (DEBUG_C_TEST) {
2059 PerlIO_printf(Perl_debug_log,
2060 "Copy on write: regexp capture, type %d\n",
2061 (int) SvTYPE(sv));
2062 }
2063 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2064 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2065 assert (SvPOKp(prog->saved_copy));
2066 } else
2067#endif
2068 {
2069 RX_MATCH_COPIED_on(prog);
2070 s = savepvn(strbeg, i);
2071 prog->subbeg = s;
2072 }
d6a28714 2073 prog->sublen = i;
d6a28714
JH
2074 }
2075 else {
2076 prog->subbeg = strbeg;
2077 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2078 }
2079 }
9041c2e3 2080
d6a28714
JH
2081 return 1;
2082
2083phooey:
a3621e74 2084 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2085 PL_colors[4], PL_colors[5]));
d6a28714 2086 if (PL_reg_eval_set)
acfe0abc 2087 restore_pos(aTHX_ 0);
d6a28714
JH
2088 return 0;
2089}
2090
2091/*
2092 - regtry - try match at specific point
2093 */
2094STATIC I32 /* 0 failure, 1 success */
2095S_regtry(pTHX_ regexp *prog, char *startpos)
2096{
97aff369 2097 dVAR;
d6a28714
JH
2098 register I32 i;
2099 register I32 *sp;
2100 register I32 *ep;
2101 CHECKPOINT lastcp;
a3621e74 2102 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2103
02db2b7b
IZ
2104#ifdef DEBUGGING
2105 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2106#endif
d6a28714
JH
2107 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2108 MAGIC *mg;
2109
2110 PL_reg_eval_set = RS_init;
a3621e74 2111 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2112 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2113 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2114 ));
e8347627 2115 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2116 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2117 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2118 SAVETMPS;
2119 /* Apparently this is not needed, judging by wantarray. */
e8347627 2120 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2121 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2122
2123 if (PL_reg_sv) {
2124 /* Make $_ available to executed code. */
2125 if (PL_reg_sv != DEFSV) {
59f00321 2126 SAVE_DEFSV;
d6a28714 2127 DEFSV = PL_reg_sv;
b8c5462f 2128 }
d6a28714 2129
9041c2e3 2130 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2131 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2132 /* prepare for quick setting of pos */
d300d9fa
NC
2133#ifdef PERL_OLD_COPY_ON_WRITE
2134 if (SvIsCOW(sv))
2135 sv_force_normal_flags(sv, 0);
2136#endif
2137 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
2138 &PL_vtbl_mglob, NULL, 0);
d6a28714 2139 mg->mg_len = -1;
b8c5462f 2140 }
d6a28714
JH
2141 PL_reg_magic = mg;
2142 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2143 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2144 }
09687e5a 2145 if (!PL_reg_curpm) {
a02a5408 2146 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2147#ifdef USE_ITHREADS
2148 {
2149 SV* repointer = newSViv(0);
577e12cc 2150 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2151 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2152 av_push(PL_regex_padav,repointer);
2153 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2154 PL_regex_pad = AvARRAY(PL_regex_padav);
2155 }
2156#endif
2157 }
aaa362c4 2158 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2159 PL_reg_oldcurpm = PL_curpm;
2160 PL_curpm = PL_reg_curpm;
2161 if (RX_MATCH_COPIED(prog)) {
2162 /* Here is a serious problem: we cannot rewrite subbeg,
2163 since it may be needed if this match fails. Thus
2164 $` inside (?{}) could fail... */
2165 PL_reg_oldsaved = prog->subbeg;
2166 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2167#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2168 PL_nrs = prog->saved_copy;
2169#endif
d6a28714
JH
2170 RX_MATCH_COPIED_off(prog);
2171 }
2172 else
bd61b366 2173 PL_reg_oldsaved = NULL;
d6a28714
JH
2174 prog->subbeg = PL_bostr;
2175 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2176 }
973dddac 2177 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2178 PL_reginput = startpos;
2179 PL_regstartp = prog->startp;
2180 PL_regendp = prog->endp;
2181 PL_reglastparen = &prog->lastparen;
a01268b5 2182 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2183 prog->lastparen = 0;
03994de8 2184 prog->lastcloseparen = 0;
d6a28714 2185 PL_regsize = 0;
a3621e74 2186 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2187 if (PL_reg_start_tmpl <= prog->nparens) {
2188 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2189 if(PL_reg_start_tmp)
2190 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2191 else
a02a5408 2192 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2193 }
2194
2195 /* XXXX What this code is doing here?!!! There should be no need
2196 to do this again and again, PL_reglastparen should take care of
3dd2943c 2197 this! --ilya*/
dafc8851
JH
2198
2199 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2200 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2201 * PL_reglastparen), is not needed at all by the test suite
2202 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2203 * enough, for building DynaLoader, or otherwise this
2204 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2205 * will happen. Meanwhile, this code *is* needed for the
2206 * above-mentioned test suite tests to succeed. The common theme
2207 * on those tests seems to be returning null fields from matches.
2208 * --jhi */
dafc8851 2209#if 1
d6a28714
JH
2210 sp = prog->startp;
2211 ep = prog->endp;
2212 if (prog->nparens) {
eb160463 2213 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2214 *++sp = -1;
2215 *++ep = -1;
2216 }
2217 }
dafc8851 2218#endif
02db2b7b 2219 REGCP_SET(lastcp);
d6a28714
JH
2220 if (regmatch(prog->program + 1)) {
2221 prog->endp[0] = PL_reginput - PL_bostr;
2222 return 1;
2223 }
02db2b7b 2224 REGCP_UNWIND(lastcp);
d6a28714
JH
2225 return 0;
2226}
2227
02db2b7b
IZ
2228#define RE_UNWIND_BRANCH 1
2229#define RE_UNWIND_BRANCHJ 2
2230
2231union re_unwind_t;
2232
2233typedef struct { /* XX: makes sense to enlarge it... */
2234 I32 type;
2235 I32 prev;
2236 CHECKPOINT lastcp;
2237} re_unwind_generic_t;
2238
2239typedef struct {
2240 I32 type;
2241 I32 prev;
2242 CHECKPOINT lastcp;
2243 I32 lastparen;
2244 regnode *next;
2245 char *locinput;
2246 I32 nextchr;
2247#ifdef DEBUGGING
2248 int regindent;
2249#endif
2250} re_unwind_branch_t;
2251
2252typedef union re_unwind_t {
2253 I32 type;
2254 re_unwind_generic_t generic;
2255 re_unwind_branch_t branch;
2256} re_unwind_t;
2257
8ba1375e
MJD
2258#define sayYES goto yes
2259#define sayNO goto no
e0f9d4a8 2260#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2261#define sayYES_FINAL goto yes_final
2262#define sayYES_LOUD goto yes_loud
2263#define sayNO_FINAL goto no_final
2264#define sayNO_SILENT goto do_no
2265#define saySAME(x) if (x) goto yes; else goto no
2266
3ab3c9b4
HS
2267#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2268#define POSCACHE_SEEN 1 /* we know what we're caching */
2269#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2270#define CACHEsayYES STMT_START { \
2271 if (cache_offset | cache_bit) { \
2272 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2273 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2274 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2275 /* cache records failure, but this is success */ \
2276 DEBUG_r( \
2277 PerlIO_printf(Perl_debug_log, \
2278 "%*s (remove success from failure cache)\n", \
2279 REPORT_CODE_OFF+PL_regindent*2, "") \
2280 ); \
2281 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2282 } \
2283 } \
2284 sayYES; \
2285} STMT_END
2286#define CACHEsayNO STMT_START { \
2287 if (cache_offset | cache_bit) { \
2288 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2289 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2290 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2291 /* cache records success, but this is failure */ \
2292 DEBUG_r( \
2293 PerlIO_printf(Perl_debug_log, \
2294 "%*s (remove failure from success cache)\n", \
2295 REPORT_CODE_OFF+PL_regindent*2, "") \
2296 ); \
2297 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2298 } \
2299 } \
2300 sayNO; \
2301} STMT_END
2302
a3621e74
YO
2303/* this is used to determine how far from the left messages like
2304 'failed...' are printed. Currently 29 makes these messages line
2305 up with the opcode they refer to. Earlier perls used 25 which
2306 left these messages outdented making reviewing a debug output
2307 quite difficult.
2308*/
2309#define REPORT_CODE_OFF 29
2310
2311
2312/* Make sure there is a test for this +1 options in re_tests */
2313#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2314
5b47454d 2315
8ba1375e 2316
d6a28714
JH
2317/*
2318 - regmatch - main matching routine
2319 *
2320 * Conceptually the strategy is simple: check to see whether the current
2321 * node matches, call self recursively to see whether the rest matches,
2322 * and then act accordingly. In practice we make some effort to avoid
2323 * recursion, in particular by going through "ordinary" nodes (that don't
2324 * need to know whether the rest of the match failed) by a loop instead of
2325 * by recursion.
2326 */
2327/* [lwall] I've hoisted the register declarations to the outer block in order to
2328 * maybe save a little bit of pushing and popping on the stack. It also takes
2329 * advantage of machines that use a register save mask on subroutine entry.
2330 */
2331STATIC I32 /* 0 failure, 1 success */
2332S_regmatch(pTHX_ regnode *prog)
2333{
27da23d5 2334 dVAR;
d6a28714
JH
2335 register regnode *scan; /* Current node. */
2336 regnode *next; /* Next node. */
2337 regnode *inner; /* Next node in internal branch. */
2338 register I32 nextchr; /* renamed nextchr - nextchar colides with
2339 function of same name */
2340 register I32 n; /* no or next */
b7953727 2341 register I32 ln = 0; /* len or last */
bd61b366 2342 register char *s = NULL; /* operand or save */
d6a28714 2343 register char *locinput = PL_reginput;
b7953727 2344 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2345 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2346 I32 unwind = 0;
a3621e74 2347
b7953727 2348#if 0
02db2b7b 2349 I32 firstcp = PL_savestack_ix;
b7953727 2350#endif
0d46e09a 2351 register const bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2352#ifdef DEBUGGING
6136c704
AL
2353 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
2354 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2355 SV * const dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2356
ab74612d 2357 SV *re_debug_flags = NULL;
2a782b5b 2358#endif
041457d9 2359 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
02db2b7b 2360
a3621e74
YO
2361 GET_RE_DEBUG_FLAGS;
2362
d6a28714
JH
2363#ifdef DEBUGGING
2364 PL_regindent++;
2365#endif
2366
a3621e74 2367
d6a28714
JH
2368 /* Note that nextchr is a byte even in UTF */
2369 nextchr = UCHARAT(locinput);
2370 scan = prog;
2371 while (scan != NULL) {
8ba1375e 2372
a3621e74 2373 DEBUG_EXECUTE_r( {
6136c704 2374 SV * const prop = sv_newmortal();
1df70142
AL
2375 const int docolor = *PL_colors[0];
2376 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2377 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2378 /* The part of the string before starttry has one color
2379 (pref0_len chars), between starttry and current
2380 position another one (pref_len - pref0_len chars),
2381 after the current position the third one.
2382 We assume that pref0_len <= pref_len, otherwise we
2383 decrease pref0_len. */
9041c2e3 2384 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2385 ? (5 + taill) - l : locinput - PL_bostr;
2386 int pref0_len;
d6a28714 2387
df1ffd02 2388 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2389 pref_len++;
2390 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2391 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2392 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2393 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2394 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2395 l--;
d6a28714
JH
2396 if (pref0_len < 0)
2397 pref0_len = 0;
2398 if (pref0_len > pref_len)
2399 pref0_len = pref_len;
2400 regprop(prop, scan);
2a782b5b 2401 {
1df70142 2402 const char * const s0 =
f14c76ed 2403 do_utf8 && OP(scan) != CANY ?
2a782b5b 2404 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2405 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2406 locinput - pref_len;
1df70142
AL
2407 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2408 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2409 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2410 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2411 locinput - pref_len + pref0_len;
1df70142
AL
2412 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2413 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2414 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2415 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2416 locinput;
1df70142 2417 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2418 PerlIO_printf(Perl_debug_log,
2419 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2420 (IV)(locinput - PL_bostr),
2421 PL_colors[4],
2422 len0, s0,
2423 PL_colors[5],
2424 PL_colors[2],
2425 len1, s1,
2426 PL_colors[3],
2427 (docolor ? "" : "> <"),
2428 PL_colors[0],
2429 len2, s2,
2430 PL_colors[1],
2431 15 - l - pref_len + 1,
2432 "",
2433 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2434 SvPVX_const(prop));
2a782b5b
JH
2435 }
2436 });
d6a28714
JH
2437
2438 next = scan + NEXT_OFF(scan);
2439 if (next == scan)
2440 next = NULL;
2441
2442 switch (OP(scan)) {
2443 case BOL:
7fba1cd6 2444 if (locinput == PL_bostr)
d6a28714
JH
2445 {
2446 /* regtill = regbol; */
b8c5462f
JH
2447 break;
2448 }
d6a28714
JH
2449 sayNO;
2450 case MBOL:
12d33761
HS
2451 if (locinput == PL_bostr ||
2452 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2453 {
b8c5462f
JH
2454 break;
2455 }
d6a28714
JH
2456 sayNO;
2457 case SBOL:
c2a73568 2458 if (locinput == PL_bostr)
b8c5462f 2459 break;
d6a28714
JH
2460 sayNO;
2461 case GPOS:
2462 if (locinput == PL_reg_ganch)
2463 break;
2464 sayNO;
2465 case EOL:
d6a28714
JH
2466 goto seol;
2467 case MEOL:
d6a28714 2468 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2469 sayNO;
b8c5462f 2470 break;
d6a28714
JH
2471 case SEOL:
2472 seol:
2473 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2474 sayNO;
d6a28714 2475 if (PL_regeol - locinput > 1)
b8c5462f 2476 sayNO;
b8c5462f 2477 break;
d6a28714
JH
2478 case EOS:
2479 if (PL_regeol != locinput)
b8c5462f 2480 sayNO;
d6a28714 2481 break;
ffc61ed2 2482 case SANY:
d6a28714 2483 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2484 sayNO;
f33976b4
DB
2485 if (do_utf8) {
2486 locinput += PL_utf8skip[nextchr];
2487 if (locinput > PL_regeol)
2488 sayNO;
2489 nextchr = UCHARAT(locinput);
2490 }
2491 else
2492 nextchr = UCHARAT(++locinput);
2493 break;
2494 case CANY:
2495 if (!nextchr && locinput >= PL_regeol)
2496 sayNO;
b8c5462f 2497 nextchr = UCHARAT(++locinput);
a0d0e21e 2498 break;
ffc61ed2 2499 case REG_ANY:
1aa99e6b
IH
2500 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2501 sayNO;
2502 if (do_utf8) {
b8c5462f 2503 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2504 if (locinput > PL_regeol)
2505 sayNO;
a0ed51b3 2506 nextchr = UCHARAT(locinput);
a0ed51b3 2507 }
1aa99e6b
IH
2508 else
2509 nextchr = UCHARAT(++locinput);
a0ed51b3 2510 break;
a3621e74
YO
2511
2512
2513
2514 /*
2515 traverse the TRIE keeping track of all accepting states
2516 we transition through until we get to a failing node.
2517
a3621e74
YO
2518
2519 */
5b47454d 2520 case TRIE:
a3621e74
YO
2521 case TRIEF:
2522 case TRIEFL:
2523 {
a3621e74
YO
2524 U8 *uc = ( U8* )locinput;
2525 U32 state = 1;
2526 U16 charid = 0;
2527 U32 base = 0;
2528 UV uvc = 0;
2529 STRLEN len = 0;
2530 STRLEN foldlen = 0;
a3621e74
YO
2531 U8 *uscan = (U8*)NULL;
2532 STRLEN bufflen=0;
5b47454d
DM
2533 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2534 trie_type = do_utf8 ?
2535 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2536 : trie_plain;
2537
2538 int gotit = 0;
2539 /* accepting states we have traversed */
2540 SV *sv_accept_buff = NULL;
2541 reg_trie_accepted *accept_buff = NULL;
2542 reg_trie_data *trie; /* what trie are we using right now */
2543 U32 accepted = 0; /* how many accepting states we have seen */
a3621e74
YO
2544
2545 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2546
2547 while ( state && uc <= (U8*)PL_regeol ) {
2548
5b47454d
DM
2549 if (trie->states[ state ].wordnum) {
2550 if (!accepted ) {
2551 ENTER;
2552 SAVETMPS;
2553 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2554 sv_accept_buff=newSV(bufflen *
2555 sizeof(reg_trie_accepted) - 1);
2556 SvCUR_set(sv_accept_buff,
2557 sizeof(reg_trie_accepted));
2558 SvPOK_on(sv_accept_buff);
2559 sv_2mortal(sv_accept_buff);
2560 accept_buff =
2561 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2562 }
2563 else {
2564 if (accepted >= bufflen) {
2565 bufflen *= 2;
2566 accept_buff =(reg_trie_accepted*)
2567 SvGROW(sv_accept_buff,
2568 bufflen * sizeof(reg_trie_accepted));
2569 }
2570 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2571 + sizeof(reg_trie_accepted));
2572 }
2573 accept_buff[accepted].wordnum = trie->states[state].wordnum;
2574 accept_buff[accepted].endpos = uc;
2575 ++accepted;
2576 }
a3621e74
YO
2577
2578 base = trie->states[ state ].trans.base;
2579
2580 DEBUG_TRIE_EXECUTE_r(
2581 PerlIO_printf( Perl_debug_log,
e4584336 2582 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2583 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2584 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2585 );
2586
2587 if ( base ) {
5b47454d
DM
2588 switch (trie_type) {
2589 case trie_uft8_fold:
a3621e74
YO
2590 if ( foldlen>0 ) {
2591 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2592 foldlen -= len;
2593 uscan += len;
2594 len=0;
2595 } else {
1df70142 2596 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2597 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2598 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2599 foldlen -= UNISKIP( uvc );
2600 uscan = foldbuf + UNISKIP( uvc );
2601 }
5b47454d
DM
2602 break;
2603 case trie_utf8:
2604 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2605 &len, uniflags );
2606 break;
2607 case trie_plain:
e4584336 2608 uvc = (UV)*uc;
a3621e74
YO
2609 len = 1;
2610 }
2611
5b47454d
DM
2612 if (uvc < 256) {
2613 charid = trie->charmap[ uvc ];
2614 }
2615 else {
2616 charid = 0;
2617 if (trie->widecharmap) {
2618 SV** svpp = (SV**)NULL;
2619 svpp = hv_fetch(trie->widecharmap,
2620 (char*)&uvc, sizeof(UV), 0);
2621 if (svpp)
2622 charid = (U16)SvIV(*svpp);
2623 }
2624 }
a3621e74 2625
5b47454d
DM
2626 if (charid &&
2627 (base + charid > trie->uniquecharcount )
2628 && (base + charid - 1 - trie->uniquecharcount
2629 < trie->lasttrans)
2630 && trie->trans[base + charid - 1 -
2631 trie->uniquecharcount].check == state)
2632 {
2633 state = trie->trans[base + charid - 1 -
2634 trie->uniquecharcount ].next;
2635 }
2636 else {
2637 state = 0;
2638 }
2639 uc += len;
2640
2641 }
2642 else {
a3621e74
YO
2643 state = 0;
2644 }
2645 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2646 PerlIO_printf( Perl_debug_log,
2647 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2648 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2649 );
2650 }
5b47454d 2651 if (!accepted )
a3621e74 2652 sayNO;
a3621e74
YO
2653
2654 /*
2655 There was at least one accepting state that we
2656 transitioned through. Presumably the number of accepting
2657 states is going to be low, typically one or two. So we
2658 simply scan through to find the one with lowest wordnum.
2659 Once we find it, we swap the last state into its place
2660 and decrement the size. We then try to match the rest of
2661 the pattern at the point where the word ends, if we
2662 succeed then we end the loop, otherwise the loop
2663 eventually terminates once all of the accepting states
2664 have been tried.
2665 */
a3621e74
YO
2666
2667 if ( accepted == 1 ) {
2668 DEBUG_EXECUTE_r({
2669 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2670 PerlIO_printf( Perl_debug_log,
2671 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2672 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2673 accept_buff[ 0 ].wordnum,
cfd0369c 2674 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2675 PL_colors[5] );
2676 });
cc601c31 2677 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2678 /* in this case we free tmps/leave before we call regmatch
2679 as we wont be using accept_buff again. */
2680 FREETMPS;
2681 LEAVE;
2682 gotit = regmatch( scan + NEXT_OFF( scan ) );
2683 } else {
2684 DEBUG_EXECUTE_r(
e4584336
RB
2685 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2686 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2687 PL_colors[5] );
2688 );
2689 while ( !gotit && accepted-- ) {
2690 U32 best = 0;
2691 U32 cur;
2692 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2693 DEBUG_TRIE_EXECUTE_r(
2694 PerlIO_printf( Perl_debug_log,
2695 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2696 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2697 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2698 accept_buff[ cur ].wordnum, PL_colors[5] );
2699 );
a3621e74
YO
2700
2701 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2702 best = cur;
2703 }
2704 DEBUG_EXECUTE_r({
6136c704 2705 SV ** const tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2706 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2707 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2708 accept_buff[best].wordnum,
cfd0369c 2709 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2710 PL_colors[5] );
2711 });
2712 if ( best<accepted ) {
2713 reg_trie_accepted tmp = accept_buff[ best ];
2714 accept_buff[ best ] = accept_buff[ accepted ];
2715 accept_buff[ accepted ] = tmp;
2716 best = accepted;
2717 }
cc601c31 2718 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2719
2720 /*
2721 as far as I can tell we only need the SAVETMPS/FREETMPS
2722 for re's with EVAL in them but I'm leaving them in for
2723 all until I can be sure.
2724 */
2725 SAVETMPS;
2726 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2727 FREETMPS;
2728 }
2729 FREETMPS;
2730 LEAVE;
2731 }
2732
2733 if ( gotit ) {
2734 sayYES;
2735 } else {
2736 sayNO;
2737 }
2738 }
2739 /* unreached codepoint */
d6a28714 2740 case EXACT:
cd439c50
IZ
2741 s = STRING(scan);
2742 ln = STR_LEN(scan);
eb160463 2743 if (do_utf8 != UTF) {
bc517b45 2744 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2745 char *l = locinput;
a3b680e6 2746 const char *e = s + ln;
a72c7584 2747
5ff6fc6d
JH
2748 if (do_utf8) {
2749 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2750 while (s < e) {
a3b680e6 2751 STRLEN ulen;
1aa99e6b 2752 if (l >= PL_regeol)
5ff6fc6d
JH
2753 sayNO;
2754 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2755 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2756 uniflags))
5ff6fc6d 2757 sayNO;
bc517b45 2758 l += ulen;
5ff6fc6d 2759 s ++;
1aa99e6b 2760 }
5ff6fc6d
JH
2761 }
2762 else {
2763 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2764 while (s < e) {
a3b680e6 2765 STRLEN ulen;
1aa99e6b
IH
2766 if (l >= PL_regeol)
2767 sayNO;
5ff6fc6d 2768 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2769 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2770 uniflags))
1aa99e6b 2771 sayNO;
bc517b45 2772 s += ulen;
a72c7584 2773 l ++;
1aa99e6b 2774 }
5ff6fc6d 2775 }
1aa99e6b
IH
2776 locinput = l;
2777 nextchr = UCHARAT(locinput);
2778 break;
2779 }
bc517b45 2780 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2781 /* Inline the first character, for speed. */
2782 if (UCHARAT(s) != nextchr)
2783 sayNO;
2784 if (PL_regeol - locinput < ln)
2785 sayNO;
2786 if (ln > 1 && memNE(s, locinput, ln))
2787 sayNO;
2788 locinput += ln;
2789 nextchr = UCHARAT(locinput);
2790 break;
2791 case EXACTFL:
b8c5462f
JH
2792 PL_reg_flags |= RF_tainted;
2793 /* FALL THROUGH */
d6a28714 2794 case EXACTF:
cd439c50
IZ
2795 s = STRING(scan);
2796 ln = STR_LEN(scan);
d6a28714 2797
d07ddd77
JH
2798 if (do_utf8 || UTF) {
2799 /* Either target or the pattern are utf8. */
d6a28714 2800 char *l = locinput;
d07ddd77 2801 char *e = PL_regeol;
bc517b45 2802
eb160463 2803 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2804 l, &e, 0, do_utf8)) {
5486206c
JH
2805 /* One more case for the sharp s:
2806 * pack("U0U*", 0xDF) =~ /ss/i,
2807 * the 0xC3 0x9F are the UTF-8
2808 * byte sequence for the U+00DF. */
2809 if (!(do_utf8 &&
2810 toLOWER(s[0]) == 's' &&
2811 ln >= 2 &&
2812 toLOWER(s[1]) == 's' &&
2813 (U8)l[0] == 0xC3 &&
2814 e - l >= 2 &&
2815 (U8)l[1] == 0x9F))
2816 sayNO;
2817 }
d07ddd77
JH
2818 locinput = e;
2819 nextchr = UCHARAT(locinput);
2820 break;
a0ed51b3 2821 }
d6a28714 2822
bc517b45
JH
2823 /* Neither the target and the pattern are utf8. */
2824
d6a28714
JH
2825 /* Inline the first character, for speed. */
2826 if (UCHARAT(s) != nextchr &&
2827 UCHARAT(s) != ((OP(scan) == EXACTF)
2828 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2829 sayNO;
d6a28714 2830 if (PL_regeol - locinput < ln)
b8c5462f 2831 sayNO;
d6a28714
JH
2832 if (ln > 1 && (OP(scan) == EXACTF
2833 ? ibcmp(s, locinput, ln)
2834 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2835 sayNO;
d6a28714
JH
2836 locinput += ln;
2837 nextchr = UCHARAT(locinput);
a0d0e21e 2838 break;
d6a28714 2839 case ANYOF:
ffc61ed2 2840 if (do_utf8) {
9e55ce06
JH
2841 STRLEN inclasslen = PL_regeol - locinput;
2842
ba7b4546 2843 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2844 sayNO_ANYOF;
ffc61ed2
JH
2845 if (locinput >= PL_regeol)
2846 sayNO;
0f0076b4 2847 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2848 nextchr = UCHARAT(locinput);
e0f9d4a8 2849 break;
ffc61ed2
JH
2850 }
2851 else {
2852 if (nextchr < 0)
2853 nextchr = UCHARAT(locinput);
7d3e948e 2854 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2855 sayNO_ANYOF;
ffc61ed2
JH
2856 if (!nextchr && locinput >= PL_regeol)
2857 sayNO;
2858 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2859 break;
2860 }
2861 no_anyof:
2862 /* If we might have the case of the German sharp s
2863 * in a casefolding Unicode character class. */
2864
ebc501f0
JH
2865 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2866 locinput += SHARP_S_SKIP;
e0f9d4a8 2867 nextchr = UCHARAT(locinput);
ffc61ed2 2868 }
e0f9d4a8
JH
2869 else
2870 sayNO;
b8c5462f 2871 break;
d6a28714 2872 case ALNUML:
b8c5462f
JH
2873 PL_reg_flags |= RF_tainted;
2874 /* FALL THROUGH */
d6a28714 2875 case ALNUM:
b8c5462f 2876 if (!nextchr)
4633a7c4 2877 sayNO;
ffc61ed2 2878 if (do_utf8) {
1a4fad37 2879 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2880 if (!(OP(scan) == ALNUM
3568d838 2881 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2882 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2883 {
2884 sayNO;
a0ed51b3 2885 }
b8c5462f 2886 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2887 nextchr = UCHARAT(locinput);
2888 break;
2889 }
ffc61ed2 2890 if (!(OP(scan) == ALNUM
d6a28714 2891 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2892 sayNO;
b8c5462f 2893 nextchr = UCHARAT(++locinput);
a0d0e21e 2894 break;
d6a28714 2895 case NALNUML:
b8c5462f
JH
2896 PL_reg_flags |= RF_tainted;
2897 /* FALL THROUGH */
d6a28714
JH
2898 case NALNUM:
2899 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2900 sayNO;
ffc61ed2 2901 if (do_utf8) {
1a4fad37 2902 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2903 if (OP(scan) == NALNUM
3568d838 2904 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2905 : isALNUM_LC_utf8((U8*)locinput))
2906 {
b8c5462f 2907 sayNO;
d6a28714 2908 }
b8c5462f
JH
2909 locinput += PL_utf8skip[nextchr];
2910 nextchr = UCHARAT(locinput);
2911 break;
2912 }
ffc61ed2 2913 if (OP(scan) == NALNUM
d6a28714 2914 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2915 sayNO;
76e3520e 2916 nextchr = UCHARAT(++locinput);
a0d0e21e 2917 break;
d6a28714
JH
2918 case BOUNDL:
2919 case NBOUNDL:
3280af22 2920 PL_reg_flags |= RF_tainted;
bbce6d69 2921 /* FALL THROUGH */
d6a28714
JH
2922 case BOUND:
2923 case NBOUND:
2924 /* was last char in word? */
ffc61ed2 2925 if (do_utf8) {
12d33761
HS
2926 if (locinput == PL_bostr)
2927 ln = '\n';
ffc61ed2 2928 else {
a3b680e6 2929 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2930
b4f7163a 2931 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2932 }
2933 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2934 ln = isALNUM_uni(ln);
1a4fad37 2935 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2936 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2937 }
2938 else {
9041c2e3 2939 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2940 n = isALNUM_LC_utf8((U8*)locinput);
2941 }
a0ed51b3 2942 }
d6a28714 2943 else {
12d33761
HS
2944 ln = (locinput != PL_bostr) ?
2945 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2946 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2947 ln = isALNUM(ln);
2948 n = isALNUM(nextchr);
2949 }
2950 else {
2951 ln = isALNUM_LC(ln);
2952 n = isALNUM_LC(nextchr);
2953 }
d6a28714 2954 }
ffc61ed2
JH
2955 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2956 OP(scan) == BOUNDL))
2957 sayNO;
a0ed51b3 2958 break;
d6a28714 2959 case SPACEL:
3280af22 2960 PL_reg_flags |= RF_tainted;
bbce6d69 2961 /* FALL THROUGH */
d6a28714 2962 case SPACE:
9442cb0e 2963 if (!nextchr)
4633a7c4 2964 sayNO;
1aa99e6b 2965 if (do_utf8) {
fd400ab9 2966 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 2967 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 2968 if (!(OP(scan) == SPACE
3568d838 2969 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2970 : isSPACE_LC_utf8((U8*)locinput)))
2971 {
2972 sayNO;
2973 }
2974 locinput += PL_utf8skip[nextchr];
2975 nextchr = UCHARAT(locinput);
2976 break;
d6a28714 2977 }
ffc61ed2
JH
2978 if (!(OP(scan) == SPACE
2979 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2980 sayNO;
2981 nextchr = UCHARAT(++locinput);
2982 }
2983 else {
2984 if (!(OP(scan) == SPACE
2985 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2986 sayNO;
2987 nextchr = UCHARAT(++locinput);
a0ed51b3 2988 }
a0ed51b3 2989 break;
d6a28714 2990 case NSPACEL:
3280af22 2991 PL_reg_flags |= RF_tainted;
bbce6d69 2992 /* FALL THROUGH */
d6a28714 2993 case NSPACE:
9442cb0e 2994 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2995 sayNO;
1aa99e6b 2996 if (do_utf8) {
1a4fad37 2997 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 2998 if (OP(scan) == NSPACE
3568d838 2999 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3000 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3001 {
3002 sayNO;
3003 }
3004 locinput += PL_utf8skip[nextchr];
3005 nextchr = UCHARAT(locinput);
3006 break;
a0ed51b3 3007 }
ffc61ed2 3008 if (OP(scan) == NSPACE
d6a28714 3009 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3010 sayNO;
76e3520e 3011 nextchr = UCHARAT(++locinput);
a0d0e21e 3012 break;
d6a28714 3013 case DIGITL:
a0ed51b3
LW
3014 PL_reg_flags |= RF_tainted;
3015 /* FALL THROUGH */
d6a28714 3016 case DIGIT:
9442cb0e 3017 if (!nextchr)
a0ed51b3 3018 sayNO;
1aa99e6b 3019 if (do_utf8) {
1a4fad37 3020 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3021 if (!(OP(scan) == DIGIT
3568d838 3022 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3023 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3024 {
a0ed51b3 3025 sayNO;
dfe13c55 3026 }
6f06b55f 3027 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3028 nextchr = UCHARAT(locinput);
3029 break;
3030 }
ffc61ed2 3031 if (!(OP(scan) == DIGIT
9442cb0e 3032 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3033 sayNO;
3034 nextchr = UCHARAT(++locinput);
3035 break;
d6a28714 3036 case NDIGITL:
b8c5462f
JH
3037 PL_reg_flags |= RF_tainted;
3038 /* FALL THROUGH */
d6a28714 3039 case NDIGIT:
9442cb0e 3040 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3041 sayNO;
1aa99e6b 3042 if (do_utf8) {
1a4fad37 3043 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3044 if (OP(scan) == NDIGIT
3568d838 3045 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3046 : isDIGIT_LC_utf8((U8*)locinput))
3047 {
a0ed51b3 3048 sayNO;
9442cb0e 3049 }
6f06b55f 3050 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3051 nextchr = UCHARAT(locinput);
3052 break;
3053 }
ffc61ed2 3054 if (OP(scan) == NDIGIT
9442cb0e 3055 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3056 sayNO;
3057 nextchr = UCHARAT(++locinput);
3058 break;
3059 case CLUMP:
b7c83a7e 3060 if (locinput >= PL_regeol)
a0ed51b3 3061 sayNO;
b7c83a7e 3062 if (do_utf8) {
1a4fad37 3063 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3064 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3065 sayNO;
3066 locinput += PL_utf8skip[nextchr];
3067 while (locinput < PL_regeol &&
3068 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3069 locinput += UTF8SKIP(locinput);
3070 if (locinput > PL_regeol)
3071 sayNO;
eb08e2da
JH
3072 }
3073 else
3074 locinput++;
a0ed51b3
LW
3075 nextchr = UCHARAT(locinput);
3076 break;
c8756f30 3077 case REFFL:
3280af22 3078 PL_reg_flags |= RF_tainted;
c8756f30 3079 /* FALL THROUGH */
c277df42 3080 case REF:
c8756f30 3081 case REFF:
c277df42 3082 n = ARG(scan); /* which paren pair */
cf93c79d 3083 ln = PL_regstartp[n];
2c2d71f5 3084 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3085 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3086 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3087 if (ln == PL_regendp[n])
a0d0e21e 3088 break;
a0ed51b3 3089
cf93c79d 3090 s = PL_bostr + ln;
1aa99e6b 3091 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3092 char *l = locinput;
a3b680e6 3093 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3094 /*
3095 * Note that we can't do the "other character" lookup trick as
3096 * in the 8-bit case (no pun intended) because in Unicode we
3097 * have to map both upper and title case to lower case.
3098 */
3099 if (OP(scan) == REFF) {
3100 while (s < e) {
a3b680e6
AL
3101 STRLEN ulen1, ulen2;
3102 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3103 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3104
a0ed51b3
LW
3105 if (l >= PL_regeol)
3106 sayNO;
a2a2844f
JH
3107 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3108 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3109 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3110 sayNO;
a2a2844f
JH
3111 s += ulen1;
3112 l += ulen2;
a0ed51b3
LW
3113 }
3114 }
3115 locinput = l;
3116 nextchr = UCHARAT(locinput);
3117 break;
3118 }
3119
a0d0e21e 3120 /* Inline the first character, for speed. */
76e3520e 3121 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3122 (OP(scan) == REF ||
3123 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3124 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3125 sayNO;
cf93c79d 3126 ln = PL_regendp[n] - ln;
3280af22 3127 if (locinput + ln > PL_regeol)
4633a7c4 3128 sayNO;
c8756f30
AK
3129 if (ln > 1 && (OP(scan) == REF
3130 ? memNE(s, locinput, ln)
3131 : (OP(scan) == REFF
3132 ? ibcmp(s, locinput, ln)
3133 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3134 sayNO;
a0d0e21e 3135 locinput += ln;
76e3520e 3136 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3137 break;
3138
3139 case NOTHING:
c277df42 3140 case TAIL:
a0d0e21e
LW
3141 break;
3142 case BACK:
3143 break;
c277df42
IZ
3144 case EVAL:
3145 {
3146 dSP;
6136c704
AL
3147 OP_4tree * const oop = PL_op;
3148 COP * const ocurcop = PL_curcop;
f3548bdc 3149 PAD *old_comppad;
c277df42 3150 SV *ret;
6136c704 3151 struct regexp * const oreg = PL_reg_re;
9041c2e3 3152
c277df42 3153 n = ARG(scan);
533c011a 3154 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3155 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3156 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3157 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3158
8e5e9ebe 3159 {
6136c704 3160 SV ** const before = SP;
8e5e9ebe
RGS
3161 CALLRUNOPS(aTHX); /* Scalar context. */
3162 SPAGAIN;
3163 if (SP == before)
075aa684 3164 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3165 else {
3166 ret = POPs;
3167 PUTBACK;
3168 }
3169 }
3170
0f5d15d6 3171 PL_op = oop;
f3548bdc 3172 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3173 PL_curcop = ocurcop;
c277df42 3174 if (logical) {
0f5d15d6
IZ
3175 if (logical == 2) { /* Postponed subexpression. */
3176 regexp *re;
6136c704 3177 MAGIC *mg = NULL;
0f5d15d6 3178 re_cc_state state;
0f5d15d6 3179 CHECKPOINT cp, lastcp;
cb50f42d 3180 int toggleutf;
faf82a0b 3181 register SV *sv;
0f5d15d6 3182
faf82a0b
AE
3183 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3184 mg = mg_find(sv, PERL_MAGIC_qr);
3185 else if (SvSMAGICAL(ret)) {
3186 if (SvGMAGICAL(ret))
3187 sv_unmagic(ret, PERL_MAGIC_qr);
3188 else
3189 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3190 }
faf82a0b 3191
0f5d15d6
IZ
3192 if (mg) {
3193 re = (regexp *)mg->mg_obj;
df0003d4 3194 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3195 }
3196 else {
3197 STRLEN len;
6136c704 3198 const char * const t = SvPV_const(ret, len);
0f5d15d6 3199 PMOP pm;
a3b680e6
AL
3200 char * const oprecomp = PL_regprecomp;
3201 const I32 osize = PL_regsize;
3202 const I32 onpar = PL_regnpar;
0f5d15d6 3203
5fcd1c1b 3204 Zero(&pm, 1, PMOP);
cb50f42d 3205 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3206 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3207 if (!(SvFLAGS(ret)
faf82a0b
AE
3208 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3209 | SVs_GMG)))
14befaf4
DM
3210 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3211 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3212 PL_regprecomp = oprecomp;
3213 PL_regsize = osize;
3214 PL_regnpar = onpar;
3215 }
a3621e74 3216 DEBUG_EXECUTE_r(
9041c2e3 3217 PerlIO_printf(Perl_debug_log,
a0288114 3218 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3219 PL_colors[0],
3220 re->precomp,
3221 PL_colors[1],
3222 (strlen(re->precomp) > 60 ? "..." : ""))
3223 );
3224 state.node = next;
3225 state.prev = PL_reg_call_cc;
3226 state.cc = PL_regcc;
3227 state.re = PL_reg_re;
3228
2ab05381 3229 PL_regcc = 0;
9041c2e3 3230
0f5d15d6 3231 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3232 REGCP_SET(lastcp);
0f5d15d6
IZ
3233 cache_re(re);
3234 state.ss = PL_savestack_ix;
3235 *PL_reglastparen = 0;
a01268b5 3236 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3237 PL_reg_call_cc = &state;
3238 PL_reginput = locinput;
cb50f42d
YST
3239 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3240 ((re->reganch & ROPT_UTF8) != 0);
3241 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3242
3243 /* XXXX This is too dramatic a measure... */
3244 PL_reg_maxiter = 0;
3245
0f5d15d6 3246 if (regmatch(re->program + 1)) {
2c914db6
IZ
3247 /* Even though we succeeded, we need to restore
3248 global variables, since we may be wrapped inside
3249 SUSPEND, thus the match may be not finished yet. */
3250
3251 /* XXXX Do this only if SUSPENDed? */
3252 PL_reg_call_cc = state.prev;
3253 PL_regcc = state.cc;
3254 PL_reg_re = state.re;
3255 cache_re(PL_reg_re);
cb50f42d 3256 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3257
3258 /* XXXX This is too dramatic a measure... */
3259 PL_reg_maxiter = 0;
3260
3261 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3262 ReREFCNT_dec(re);
3263 regcpblow(cp);
3264 sayYES;
3265 }
0f5d15d6 3266 ReREFCNT_dec(re);
02db2b7b 3267 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3268 regcppop();
3269 PL_reg_call_cc = state.prev;
3270 PL_regcc = state.cc;
3271 PL_reg_re = state.re;
d3790889 3272 cache_re(PL_reg_re);
cb50f42d 3273 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3274
3275 /* XXXX This is too dramatic a measure... */
3276 PL_reg_maxiter = 0;
3277
8e514ae6 3278 logical = 0;
0f5d15d6
IZ
3279 sayNO;
3280 }
c277df42 3281 sw = SvTRUE(ret);
0f5d15d6 3282 logical = 0;
a0ed51b3 3283 }
080c2dec 3284 else {
3280af22 3285 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3286 cache_re(oreg);
3287 }
c277df42
IZ
3288 break;
3289 }
a0d0e21e 3290 case OPEN:
c277df42 3291 n = ARG(scan); /* which paren pair */
3280af22
NIS
3292 PL_reg_start_tmp[n] = locinput;
3293 if (n > PL_regsize)
3294 PL_regsize = n;
a0d0e21e
LW
3295 break;
3296 case CLOSE:
c277df42 3297 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3298 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3299 PL_regendp[n] = locinput - PL_bostr;
eb160463 3300 if (n > (I32)*PL_reglastparen)
3280af22 3301 *PL_reglastparen = n;
a01268b5 3302 *PL_reglastcloseparen = n;
a0d0e21e 3303 break;
c277df42
IZ
3304 case GROUPP:
3305 n = ARG(scan); /* which paren pair */
eb160463 3306 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3307 break;
3308 case IFTHEN:
2c2d71f5 3309 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3310 if (sw)
3311 next = NEXTOPER(NEXTOPER(scan));
3312 else {
3313 next = scan + ARG(scan);
3314 if (OP(next) == IFTHEN) /* Fake one. */
3315 next = NEXTOPER(NEXTOPER(next));
3316 }
3317 break;
3318 case LOGICAL:
0f5d15d6 3319 logical = scan->flags;
c277df42 3320 break;
2ab05381
IZ
3321/*******************************************************************
3322 PL_regcc contains infoblock about the innermost (...)* loop, and
3323 a pointer to the next outer infoblock.
3324
3325 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3326
3327 1) After matching X, regnode for CURLYX is processed;
3328
9041c2e3 3329 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3330 regmatch() recursively with the starting point at WHILEM node;
3331
3332 3) Each hit of WHILEM node tries to match A and Z (in the order
3333 depending on the current iteration, min/max of {min,max} and
3334 greediness). The information about where are nodes for "A"
3335 and "Z" is read from the infoblock, as is info on how many times "A"
3336 was already matched, and greediness.
3337
3338 4) After A matches, the same WHILEM node is hit again.
3339
3340 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3341 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3342 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3343 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3344 of the external loop.
3345
3346 Currently present infoblocks form a tree with a stem formed by PL_curcc
3347 and whatever it mentions via ->next, and additional attached trees
3348 corresponding to temporarily unset infoblocks as in "5" above.
3349
9041c2e3 3350 In the following picture infoblocks for outer loop of
2ab05381
IZ
3351 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3352 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3353 infoblocks are drawn below the "reset" infoblock.
3354
3355 In fact in the picture below we do not show failed matches for Z and T
3356 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3357 more obvious *why* one needs to *temporary* unset infoblocks.]
3358
3359 Matched REx position InfoBlocks Comment
3360 (Y(A)*?Z)*?T x
3361 Y(A)*?Z)*?T x <- O
3362 Y (A)*?Z)*?T x <- O
3363 Y A)*?Z)*?T x <- O <- I
3364 YA )*?Z)*?T x <- O <- I
3365 YA A)*?Z)*?T x <- O <- I
3366 YAA )*?Z)*?T x <- O <- I
3367 YAA Z)*?T x <- O # Temporary unset I
3368 I
3369
3370 YAAZ Y(A)*?Z)*?T x <- O
3371 I
3372
3373 YAAZY (A)*?Z)*?T x <- O
3374 I
3375
3376 YAAZY A)*?Z)*?T x <- O <- I
3377 I
3378
3379 YAAZYA )*?Z)*?T x <- O <- I
3380 I
3381
3382 YAAZYA Z)*?T x <- O # Temporary unset I
3383 I,I
3384
3385 YAAZYAZ )*?T x <- O
3386 I,I
3387
3388 YAAZYAZ T x # Temporary unset O
3389 O
3390 I,I
3391
3392 YAAZYAZT x
3393 O
3394 I,I
3395 *******************************************************************/
a0d0e21e
LW
3396 case CURLYX: {
3397 CURCUR cc;
3280af22 3398 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3399 /* No need to save/restore up to this paren */
3400 I32 parenfloor = scan->flags;
c277df42
IZ
3401
3402 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3403 next += ARG(next);
3280af22
NIS
3404 cc.oldcc = PL_regcc;
3405 PL_regcc = &cc;
cb434fcc
IZ
3406 /* XXXX Probably it is better to teach regpush to support
3407 parenfloor > PL_regsize... */
eb160463 3408 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3409 parenfloor = *PL_reglastparen; /* Pessimization... */
3410 cc.parenfloor = parenfloor;
a0d0e21e
LW
3411 cc.cur = -1;
3412 cc.min = ARG1(scan);
3413 cc.max = ARG2(scan);
c277df42 3414 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3415 cc.next = next;
3416 cc.minmod = minmod;
3417 cc.lastloc = 0;
3280af22 3418 PL_reginput = locinput;
a0d0e21e
LW
3419 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3420 regcpblow(cp);
3280af22 3421 PL_regcc = cc.oldcc;
4633a7c4 3422 saySAME(n);
a0d0e21e 3423 }
5f66b61c 3424 /* NOTREACHED */
a0d0e21e
LW
3425 case WHILEM: {
3426 /*
3427 * This is really hard to understand, because after we match
3428 * what we're trying to match, we must make sure the rest of
2c2d71f5 3429 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3430 * to go back UP the parse tree by recursing ever deeper. And
3431 * if it fails, we have to reset our parent's current state
3432 * that we can try again after backing off.
3433 */
3434
c277df42 3435 CHECKPOINT cp, lastcp;
3280af22 3436 CURCUR* cc = PL_regcc;
6136c704 3437 char * const lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3438 I32 cache_offset = 0, cache_bit = 0;
c277df42 3439
4633a7c4 3440 n = cc->cur + 1; /* how many we know we matched */
3280af22 3441 PL_reginput = locinput;
a0d0e21e 3442
a3621e74 3443 DEBUG_EXECUTE_r(
9041c2e3 3444 PerlIO_printf(Perl_debug_log,
91f3b821 3445 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3446 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3447 (long)n, (long)cc->min,
2797576d 3448 (long)cc->max, PTR2UV(cc))
c277df42 3449 );
4633a7c4 3450
a0d0e21e
LW
3451 /* If degenerate scan matches "", assume scan done. */
3452
579cf2c3 3453 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3454 PL_regcc = cc->oldcc;
2ab05381
IZ
3455 if (PL_regcc)
3456 ln = PL_regcc->cur;
a3621e74 3457 DEBUG_EXECUTE_r(
c3464db5
DD
3458 PerlIO_printf(Perl_debug_log,
3459 "%*s empty match detected, try continuation...\n",
3280af22 3460 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3461 );
a0d0e21e 3462 if (regmatch(cc->next))
4633a7c4 3463 sayYES;
2ab05381
IZ
3464 if (PL_regcc)
3465 PL_regcc->cur = ln;
3280af22 3466 PL_regcc = cc;
4633a7c4 3467 sayNO;
a0d0e21e
LW
3468 }
3469
3470 /* First just match a string of min scans. */
3471
3472 if (n < cc->min) {
3473 cc->cur = n;
3474 cc->lastloc = locinput;
4633a7c4
LW
3475 if (regmatch(cc->scan))
3476 sayYES;
3477 cc->cur = n - 1;
c277df42 3478 cc->lastloc = lastloc;