This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Devel::PPPort 3.08_05
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
a01268b5 169#define REGCP_OTHER_ELEMS 6
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
a3621e74 199# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
a3621e74 203# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
097eb12c 209S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 210{
97aff369 211 dVAR;
b1ce53c5 212 I32 i;
a0d0e21e 213 char *input;
b1ce53c5 214
a3621e74
YO
215 GET_RE_DEBUG_FLAGS_DECL;
216
b1ce53c5 217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 218 i = SSPOPINT;
b1ce53c5
JH
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 221 input = (char *) SSPOPPTR;
a01268b5 222 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
b1ce53c5
JH
225
226 /* Now restore the parentheses context. */
41123dfd
JH
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 229 I32 tmps;
097eb12c 230 U32 paren = (U32)SSPOPINT;
3280af22 231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
232 PL_regstartp[paren] = SSPOPINT;
233 tmps = SSPOPINT;
3280af22
NIS
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
a3621e74 236 DEBUG_EXECUTE_r(
c3464db5 237 PerlIO_printf(Perl_debug_log,
b900a521 238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 239 (UV)paren, (IV)PL_regstartp[paren],
b900a521 240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 241 (IV)PL_regendp[paren],
3280af22 242 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 243 );
a0d0e21e 244 }
a3621e74 245 DEBUG_EXECUTE_r(
bb7a0f54 246 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 247 PerlIO_printf(Perl_debug_log,
faccc32b 248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
250 }
251 );
daf18116 252#if 1
dafc8851
JH
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 * --jhi */
bb7a0f54 263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
264 if (i > PL_regsize)
265 PL_regstartp[i] = -1;
266 PL_regendp[i] = -1;
a0d0e21e 267 }
dafc8851 268#endif
a0d0e21e
LW
269 return input;
270}
271
02db2b7b 272#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 273
95b24440 274#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
275 if (paren) { \
276 if (n) { \
277 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
278 PL_regendp[paren] = input - PL_bostr; \
279 } \
280 else \
281 PL_regendp[paren] = -1; \
282 } \
95b24440
DM
283 REGMATCH(next, where); \
284 if (result) \
29d1e993
HS
285 sayYES; \
286 if (paren && n) \
287 PL_regendp[paren] = -1; \
288}
289
290
a687059c 291/*
e50aee73 292 * pregexec and friends
a687059c
LW
293 */
294
76234dfb 295#ifndef PERL_IN_XSUB_RE
a687059c 296/*
c277df42 297 - pregexec - match a regexp against a string
a687059c 298 */
c277df42 299I32
864dbfa3 300Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 301 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
302/* strend: pointer to null at end of string */
303/* strbeg: real beginning of string */
304/* minend: end of match must be >=minend after stringarg. */
305/* nosave: For optimizations. */
306{
307 return
9041c2e3 308 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
309 nosave ? 0 : REXEC_COPY_STR);
310}
76234dfb 311#endif
22e551b9 312
9041c2e3 313/*
cad2e5aa
JH
314 * Need to implement the following flags for reg_anch:
315 *
316 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
317 * USE_INTUIT_ML
318 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
319 * INTUIT_AUTORITATIVE_ML
320 * INTUIT_ONCE_NOML - Intuit can match in one location only.
321 * INTUIT_ONCE_ML
322 *
323 * Another flag for this function: SECOND_TIME (so that float substrs
324 * with giant delta may be not rechecked).
325 */
326
327/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
328
3f7c398e 329/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
330 Otherwise, only SvCUR(sv) is used to get strbeg. */
331
332/* XXXX We assume that strpos is strbeg unless sv. */
333
6eb5f6b9
JH
334/* XXXX Some places assume that there is a fixed substring.
335 An update may be needed if optimizer marks as "INTUITable"
336 RExen without fixed substrings. Similarly, it is assumed that
337 lengths of all the strings are no more than minlen, thus they
338 cannot come from lookahead.
339 (Or minlen should take into account lookahead.) */
340
2c2d71f5
JH
341/* A failure to find a constant substring means that there is no need to make
342 an expensive call to REx engine, thus we celebrate a failure. Similarly,
343 finding a substring too deep into the string means that less calls to
30944b6d
IZ
344 regtry() should be needed.
345
346 REx compiler's optimizer found 4 possible hints:
347 a) Anchored substring;
348 b) Fixed substring;
349 c) Whether we are anchored (beginning-of-line or \G);
350 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 351 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
352 string which does not contradict any of them.
353 */
2c2d71f5 354
6eb5f6b9
JH
355/* Most of decisions we do here should have been done at compile time.
356 The nodes of the REx which we used for the search should have been
357 deleted from the finite automaton. */
358
cad2e5aa
JH
359char *
360Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
361 char *strend, U32 flags, re_scream_pos_data *data)
362{
97aff369 363 dVAR;
b7953727 364 register I32 start_shift = 0;
cad2e5aa 365 /* Should be nonnegative! */
b7953727 366 register I32 end_shift = 0;
2c2d71f5
JH
367 register char *s;
368 register SV *check;
a1933d95 369 char *strbeg;
cad2e5aa 370 char *t;
a3b680e6 371 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 372 I32 ml_anch;
bd61b366
SS
373 register char *other_last = NULL; /* other substr checked before this */
374 char *check_at = NULL; /* check substr found at this pos */
1df70142 375 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 376#ifdef DEBUGGING
890ce7af
AL
377 const char * const i_strpos = strpos;
378 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 379#endif
a3621e74
YO
380
381 GET_RE_DEBUG_FLAGS_DECL;
382
a30b2f1f 383 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 384
b8d68ded 385 if (prog->reganch & ROPT_UTF8) {
a3621e74 386 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
387 "UTF-8 regex...\n"));
388 PL_reg_flags |= RF_utf8;
389 }
390
a3621e74 391 DEBUG_EXECUTE_r({
1df70142 392 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
393 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
394 strpos;
1df70142 395 const int len = PL_reg_match_utf8 ?
bb7a0f54 396 (int)strlen(s) : strend - strpos;
2a782b5b
JH
397 if (!PL_colorset)
398 reginitcolors();
b8d68ded 399 if (PL_reg_match_utf8)
a3621e74 400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 401 "UTF-8 target...\n"));
2a782b5b 402 PerlIO_printf(Perl_debug_log,
a0288114 403 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 404 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
405 prog->precomp,
406 PL_colors[1],
407 (strlen(prog->precomp) > 60 ? "..." : ""),
408 PL_colors[0],
409 (int)(len > 60 ? 60 : len),
410 s, PL_colors[1],
411 (len > 60 ? "..." : "")
412 );
413 });
cad2e5aa 414
c344f387
JH
415 /* CHR_DIST() would be more correct here but it makes things slow. */
416 if (prog->minlen > strend - strpos) {
a3621e74 417 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 418 "String too short... [re_intuit_start]\n"));
cad2e5aa 419 goto fail;
2c2d71f5 420 }
a1933d95 421 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 422 PL_regeol = strend;
33b8afdf
JH
423 if (do_utf8) {
424 if (!prog->check_utf8 && prog->check_substr)
425 to_utf8_substr(prog);
426 check = prog->check_utf8;
427 } else {
428 if (!prog->check_substr && prog->check_utf8)
429 to_byte_substr(prog);
430 check = prog->check_substr;
431 }
432 if (check == &PL_sv_undef) {
a3621e74 433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
434 "Non-utf string cannot match utf check string\n"));
435 goto fail;
436 }
2c2d71f5 437 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
438 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
439 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 440 && !multiline ) ); /* Check after \n? */
cad2e5aa 441
7e25d62c
JH
442 if (!ml_anch) {
443 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
444 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 445 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
446 && sv && !SvROK(sv)
447 && (strpos != strbeg)) {
a3621e74 448 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
449 goto fail;
450 }
451 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 452 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 453 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
454 I32 slen;
455
1aa99e6b 456 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
457 if (SvTAIL(check)) {
458 slen = SvCUR(check); /* >= 1 */
cad2e5aa 459
9041c2e3 460 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 461 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 463 goto fail_finish;
cad2e5aa
JH
464 }
465 /* Now should match s[0..slen-2] */
466 slen--;
3f7c398e 467 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 468 || (slen > 1
3f7c398e 469 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 470 report_neq:
a3621e74 471 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
472 goto fail_finish;
473 }
cad2e5aa 474 }
3f7c398e 475 else if (*SvPVX_const(check) != *s
653099ff 476 || ((slen = SvCUR(check)) > 1
3f7c398e 477 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 478 goto report_neq;
c315bfe8 479 check_at = s;
2c2d71f5 480 goto success_at_start;
7e25d62c 481 }
cad2e5aa 482 }
2c2d71f5 483 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 484 s = strpos;
2c2d71f5 485 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 486 end_shift = prog->minlen - start_shift -
653099ff 487 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 488 if (!ml_anch) {
a3b680e6 489 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 490 - (SvTAIL(check) != 0);
a3b680e6 491 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
492
493 if (end_shift < eshift)
494 end_shift = eshift;
495 }
cad2e5aa 496 }
2c2d71f5 497 else { /* Can match at random position */
cad2e5aa
JH
498 ml_anch = 0;
499 s = strpos;
2c2d71f5
JH
500 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
501 /* Should be nonnegative! */
502 end_shift = prog->minlen - start_shift -
653099ff 503 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
504 }
505
2c2d71f5 506#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 507 if (end_shift < 0)
6bbae5e6 508 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
509#endif
510
2c2d71f5
JH
511 restart:
512 /* Find a possible match in the region s..strend by looking for
513 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 514 if (flags & REXEC_SCREAM) {
cad2e5aa 515 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 516 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 517
2c2d71f5
JH
518 if (PL_screamfirst[BmRARE(check)] >= 0
519 || ( BmRARE(check) == '\n'
520 && (BmPREVIOUS(check) == SvCUR(check) - 1)
521 && SvTAIL(check) ))
9041c2e3 522 s = screaminstr(sv, check,
2c2d71f5 523 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 524 else
2c2d71f5 525 goto fail_finish;
4addbd3b
HS
526 /* we may be pointing at the wrong string */
527 if (s && RX_MATCH_COPIED(prog))
3f7c398e 528 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
529 if (data)
530 *data->scream_olds = s;
531 }
f33976b4 532 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
533 s = fbm_instr((U8*)(s + start_shift),
534 (U8*)(strend - end_shift),
7fba1cd6 535 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 536 else
1aa99e6b
IH
537 s = fbm_instr(HOP3(s, start_shift, strend),
538 HOP3(strend, -end_shift, strbeg),
7fba1cd6 539 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
540
541 /* Update the count-of-usability, remove useless subpatterns,
542 unshift s. */
2c2d71f5 543
a0288114 544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 545 (s ? "Found" : "Did not find"),
33b8afdf 546 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 547 PL_colors[0],
7b0972df 548 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 549 SvPVX_const(check),
2c2d71f5
JH
550 PL_colors[1], (SvTAIL(check) ? "$" : ""),
551 (s ? " at offset " : "...\n") ) );
552
553 if (!s)
554 goto fail_finish;
555
6eb5f6b9
JH
556 check_at = s;
557
2c2d71f5 558 /* Finish the diagnostic message */
a3621e74 559 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
560
561 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
562 Start with the other substr.
563 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 564 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
565 *always* match. Probably should be marked during compile...
566 Probably it is right to do no SCREAM here...
567 */
568
33b8afdf 569 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 570 /* Take into account the "other" substring. */
2c2d71f5
JH
571 /* XXXX May be hopelessly wrong for UTF... */
572 if (!other_last)
6eb5f6b9 573 other_last = strpos;
33b8afdf 574 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
575 do_other_anchored:
576 {
890ce7af
AL
577 char * const last = HOP3c(s, -start_shift, strbeg);
578 char *last1, *last2;
be8e71aa 579 char * const saved_s = s;
33b8afdf 580 SV* must;
2c2d71f5 581
2c2d71f5
JH
582 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 584 && (!do_utf8
0ce71af7 585 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 586 && t > strpos)))
6f207bd3 587 NOOP;
2c2d71f5
JH
588 else
589 t = strpos;
1aa99e6b 590 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
591 if (t < other_last) /* These positions already checked */
592 t = other_last;
1aa99e6b 593 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
594 if (last < last1)
595 last1 = last;
596 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
597 /* On end-of-str: see comment below. */
33b8afdf
JH
598 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
599 if (must == &PL_sv_undef) {
600 s = (char*)NULL;
a3621e74 601 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
602 }
603 else
604 s = fbm_instr(
605 (unsigned char*)t,
606 HOP3(HOP3(last1, prog->anchored_offset, strend)
607 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
608 must,
7fba1cd6 609 multiline ? FBMrf_MULTILINE : 0
33b8afdf 610 );
a3621e74 611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 612 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
613 (s ? "Found" : "Contradicts"),
614 PL_colors[0],
33b8afdf
JH
615 (int)(SvCUR(must)
616 - (SvTAIL(must)!=0)),
3f7c398e 617 SvPVX_const(must),
33b8afdf 618 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
619 if (!s) {
620 if (last1 >= last2) {
a3621e74 621 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
622 ", giving up...\n"));
623 goto fail_finish;
624 }
a3621e74 625 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 626 ", trying floating at offset %ld...\n",
be8e71aa 627 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
628 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
629 s = HOP3c(last, 1, strend);
2c2d71f5
JH
630 goto restart;
631 }
632 else {
a3621e74 633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 634 (long)(s - i_strpos)));
1aa99e6b
IH
635 t = HOP3c(s, -prog->anchored_offset, strbeg);
636 other_last = HOP3c(s, 1, strend);
be8e71aa 637 s = saved_s;
2c2d71f5
JH
638 if (t == strpos)
639 goto try_at_start;
2c2d71f5
JH
640 goto try_at_offset;
641 }
30944b6d 642 }
2c2d71f5
JH
643 }
644 else { /* Take into account the floating substring. */
33b8afdf 645 char *last, *last1;
be8e71aa 646 char * const saved_s = s;
33b8afdf
JH
647 SV* must;
648
649 t = HOP3c(s, -start_shift, strbeg);
650 last1 = last =
651 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
652 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
653 last = HOP3c(t, prog->float_max_offset, strend);
654 s = HOP3c(t, prog->float_min_offset, strend);
655 if (s < other_last)
656 s = other_last;
2c2d71f5 657 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
658 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
659 /* fbm_instr() takes into account exact value of end-of-str
660 if the check is SvTAIL(ed). Since false positives are OK,
661 and end-of-str is not later than strend we are OK. */
662 if (must == &PL_sv_undef) {
663 s = (char*)NULL;
a3621e74 664 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
665 }
666 else
2c2d71f5 667 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
668 (unsigned char*)last + SvCUR(must)
669 - (SvTAIL(must)!=0),
7fba1cd6 670 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
672 (s ? "Found" : "Contradicts"),
673 PL_colors[0],
674 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 675 SvPVX_const(must),
33b8afdf
JH
676 PL_colors[1], (SvTAIL(must) ? "$" : "")));
677 if (!s) {
678 if (last1 == last) {
a3621e74 679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
680 ", giving up...\n"));
681 goto fail_finish;
2c2d71f5 682 }
a3621e74 683 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 684 ", trying anchored starting at offset %ld...\n",
be8e71aa 685 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
686 other_last = last;
687 s = HOP3c(t, 1, strend);
688 goto restart;
689 }
690 else {
a3621e74 691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
692 (long)(s - i_strpos)));
693 other_last = s; /* Fix this later. --Hugo */
be8e71aa 694 s = saved_s;
33b8afdf
JH
695 if (t == strpos)
696 goto try_at_start;
697 goto try_at_offset;
698 }
2c2d71f5 699 }
cad2e5aa 700 }
2c2d71f5
JH
701
702 t = s - prog->check_offset_max;
2c2d71f5 703 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 704 && (!do_utf8
0ce71af7 705 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 706 && t > strpos))) {
2c2d71f5
JH
707 /* Fixed substring is found far enough so that the match
708 cannot start at strpos. */
709 try_at_offset:
cad2e5aa 710 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
711 /* Eventually fbm_*() should handle this, but often
712 anchored_offset is not 0, so this check will not be wasted. */
713 /* XXXX In the code below we prefer to look for "^" even in
714 presence of anchored substrings. And we search even
715 beyond the found float position. These pessimizations
716 are historical artefacts only. */
717 find_anchor:
2c2d71f5 718 while (t < strend - prog->minlen) {
cad2e5aa 719 if (*t == '\n') {
4ee3650e 720 if (t < check_at - prog->check_offset_min) {
33b8afdf 721 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
722 /* Since we moved from the found position,
723 we definitely contradict the found anchored
30944b6d
IZ
724 substr. Due to the above check we do not
725 contradict "check" substr.
726 Thus we can arrive here only if check substr
727 is float. Redo checking for "other"=="fixed".
728 */
9041c2e3 729 strpos = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
732 goto do_other_anchored;
733 }
4ee3650e
GS
734 /* We don't contradict the found floating substring. */
735 /* XXXX Why not check for STCLASS? */
cad2e5aa 736 s = t + 1;
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
739 goto set_useful;
740 }
4ee3650e
GS
741 /* Position contradicts check-string */
742 /* XXXX probably better to look for check-string
743 than for "\n", so one should lower the limit for t? */
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 745 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 746 other_last = strpos = s = t + 1;
cad2e5aa
JH
747 goto restart;
748 }
749 t++;
750 }
a3621e74 751 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 752 PL_colors[0], PL_colors[1]));
2c2d71f5 753 goto fail_finish;
cad2e5aa 754 }
f5952150 755 else {
a3621e74 756 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 757 PL_colors[0], PL_colors[1]));
f5952150 758 }
cad2e5aa
JH
759 s = t;
760 set_useful:
33b8afdf 761 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
762 }
763 else {
f5952150 764 /* The found string does not prohibit matching at strpos,
2c2d71f5 765 - no optimization of calling REx engine can be performed,
f5952150
GS
766 unless it was an MBOL and we are not after MBOL,
767 or a future STCLASS check will fail this. */
2c2d71f5
JH
768 try_at_start:
769 /* Even in this situation we may use MBOL flag if strpos is offset
770 wrt the start of the string. */
05b4157f 771 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 772 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
773 /* May be due to an implicit anchor of m{.*foo} */
774 && !(prog->reganch & ROPT_IMPLICIT))
775 {
cad2e5aa
JH
776 t = strpos;
777 goto find_anchor;
778 }
a3621e74 779 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 780 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 781 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 782 );
2c2d71f5 783 success_at_start:
30944b6d 784 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
785 && (do_utf8 ? (
786 prog->check_utf8 /* Could be deleted already */
787 && --BmUSEFUL(prog->check_utf8) < 0
788 && (prog->check_utf8 == prog->float_utf8)
789 ) : (
790 prog->check_substr /* Could be deleted already */
791 && --BmUSEFUL(prog->check_substr) < 0
792 && (prog->check_substr == prog->float_substr)
793 )))
66e933ab 794 {
cad2e5aa 795 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
797 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
798 if (do_utf8 ? prog->check_substr : prog->check_utf8)
799 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
800 prog->check_substr = prog->check_utf8 = NULL; /* disable */
801 prog->float_substr = prog->float_utf8 = NULL; /* clear */
802 check = NULL; /* abort */
cad2e5aa 803 s = strpos;
3cf5c195
IZ
804 /* XXXX This is a remnant of the old implementation. It
805 looks wasteful, since now INTUIT can use many
6eb5f6b9 806 other heuristics. */
cad2e5aa
JH
807 prog->reganch &= ~RE_USE_INTUIT;
808 }
809 else
810 s = strpos;
811 }
812
6eb5f6b9
JH
813 /* Last resort... */
814 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 815 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
816 /* minlen == 0 is possible if regstclass is \b or \B,
817 and the fixed substr is ''$.
818 Since minlen is already taken into account, s+1 is before strend;
819 accidentally, minlen >= 1 guaranties no false positives at s + 1
820 even for \b or \B. But (minlen? 1 : 0) below assumes that
821 regstclass does not come from lookahead... */
822 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
823 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 824 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 825 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 826 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 827 : 1);
07be1b83 828 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 829 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 830 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
831 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
832 cl_l, strend)
833 : strend);
07be1b83
YO
834 /*if (OP(prog->regstclass) == TRIE)
835 endpos++;*/
6eb5f6b9 836 t = s;
3b0527fe 837 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
838 if (!s) {
839#ifdef DEBUGGING
cbbf8932 840 const char *what = NULL;
6eb5f6b9
JH
841#endif
842 if (endpos == strend) {
a3621e74 843 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
844 "Could not match STCLASS...\n") );
845 goto fail;
846 }
a3621e74 847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 848 "This position contradicts STCLASS...\n") );
653099ff
GS
849 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
850 goto fail;
6eb5f6b9 851 /* Contradict one of substrings */
33b8afdf
JH
852 if (prog->anchored_substr || prog->anchored_utf8) {
853 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 854 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 855 hop_and_restart:
1aa99e6b 856 s = HOP3c(t, 1, strend);
66e933ab
GS
857 if (s + start_shift + end_shift > strend) {
858 /* XXXX Should be taken into account earlier? */
a3621e74 859 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
860 "Could not match STCLASS...\n") );
861 goto fail;
862 }
5e39e1e5
HS
863 if (!check)
864 goto giveup;
a3621e74 865 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 866 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
867 what, (long)(s + start_shift - i_strpos)) );
868 goto restart;
869 }
66e933ab 870 /* Have both, check_string is floating */
6eb5f6b9
JH
871 if (t + start_shift >= check_at) /* Contradicts floating=check */
872 goto retry_floating_check;
873 /* Recheck anchored substring, but not floating... */
9041c2e3 874 s = check_at;
5e39e1e5
HS
875 if (!check)
876 goto giveup;
a3621e74 877 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 878 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
879 (long)(other_last - i_strpos)) );
880 goto do_other_anchored;
881 }
60e71179
GS
882 /* Another way we could have checked stclass at the
883 current position only: */
884 if (ml_anch) {
885 s = t = t + 1;
5e39e1e5
HS
886 if (!check)
887 goto giveup;
a3621e74 888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 889 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 890 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 891 goto try_at_offset;
66e933ab 892 }
33b8afdf 893 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 894 goto fail;
6eb5f6b9
JH
895 /* Check is floating subtring. */
896 retry_floating_check:
897 t = check_at - start_shift;
a3621e74 898 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
899 goto hop_and_restart;
900 }
b7953727 901 if (t != s) {
a3621e74 902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 903 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
904 (long)(t - i_strpos), (long)(s - i_strpos))
905 );
906 }
907 else {
a3621e74 908 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
909 "Does not contradict STCLASS...\n");
910 );
911 }
6eb5f6b9 912 }
5e39e1e5 913 giveup:
a3621e74 914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
915 PL_colors[4], (check ? "Guessed" : "Giving up"),
916 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 917 return s;
2c2d71f5
JH
918
919 fail_finish: /* Substring not found */
33b8afdf
JH
920 if (prog->check_substr || prog->check_utf8) /* could be removed already */
921 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 922 fail:
a3621e74 923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 924 PL_colors[4], PL_colors[5]));
bd61b366 925 return NULL;
cad2e5aa 926}
9661b544 927
6eb5f6b9 928/* We know what class REx starts with. Try to find this position... */
3b0527fe 929/* if reginfo is NULL, its a dryrun */
07be1b83
YO
930/* annoyingly all the vars in this routine have different names from their counterparts
931 in regmatch. /grrr */
3b0527fe 932
3c3eec57 933STATIC char *
07be1b83
YO
934S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
935 const char *strend, const regmatch_info *reginfo)
a687059c 936{
27da23d5 937 dVAR;
1df70142 938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 939 char *m;
d8093b23 940 STRLEN ln;
5dab1207 941 STRLEN lnc;
078c425b 942 register STRLEN uskip;
d8093b23
G
943 unsigned int c1;
944 unsigned int c2;
6eb5f6b9
JH
945 char *e;
946 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 947 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 948
6eb5f6b9
JH
949 /* We know what class it must start with. */
950 switch (OP(c)) {
6eb5f6b9 951 case ANYOF:
388cc4de 952 if (do_utf8) {
078c425b 953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 958 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
959 goto got_it;
960 else
961 tmp = doevery;
962 }
963 else
964 tmp = 1;
078c425b 965 s += uskip;
388cc4de
HS
966 }
967 }
968 else {
969 while (s < strend) {
970 STRLEN skip = 1;
971
32fc9b6a 972 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
3b0527fe 977 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
978 goto got_it;
979 else
980 tmp = doevery;
981 }
982 else
983 tmp = 1;
984 s += skip;
985 }
a0d0e21e 986 }
6eb5f6b9 987 break;
f33976b4
DB
988 case CANY:
989 while (s < strend) {
3b0527fe 990 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
991 goto got_it;
992 else
993 tmp = doevery;
994 s++;
995 }
996 break;
6eb5f6b9 997 case EXACTF:
5dab1207
NIS
998 m = STRING(c);
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1001 if (UTF) {
a2a2844f 1002 STRLEN ulen1, ulen2;
5dab1207 1003 U8 *sm = (U8 *) m;
89ebb4a3
JH
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1007
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1010
89ebb4a3 1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1012 0, uniflags);
89ebb4a3 1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1014 0, uniflags);
5dab1207
NIS
1015 lnc = 0;
1016 while (sm < ((U8 *) m + ln)) {
1017 lnc++;
1018 sm += UTF8SKIP(sm);
1019 }
1aa99e6b
IH
1020 }
1021 else {
1022 c1 = *(U8*)m;
1023 c2 = PL_fold[c1];
1024 }
6eb5f6b9
JH
1025 goto do_exactf;
1026 case EXACTFL:
5dab1207
NIS
1027 m = STRING(c);
1028 ln = STR_LEN(c);
1029 lnc = (I32) ln;
d8093b23 1030 c1 = *(U8*)m;
6eb5f6b9
JH
1031 c2 = PL_fold_locale[c1];
1032 do_exactf:
db12adc6 1033 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1034
3b0527fe 1035 if (!reginfo && e < s)
6eb5f6b9 1036 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1037
60a8b682
JH
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
7f16dd3d
JH
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
60a8b682 1048
1aa99e6b 1049 if (do_utf8) {
575cac57 1050 UV c, f;
89ebb4a3 1051 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1052 STRLEN len, foldlen;
4ad0818d 1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1054 if (c1 == c2) {
5dab1207
NIS
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1aa99e6b 1057 while (s <= e) {
89ebb4a3 1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1059 uniflags);
80aecb99
JH
1060 if ( c == c1
1061 && (ln == len ||
3dab1dad
YO
1062 ibcmp_utf8(s, NULL, 0, do_utf8,
1063 m, NULL, ln, (bool)UTF))
3b0527fe 1064 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1065 goto got_it;
80aecb99 1066 else {
1df70142 1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1068 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1070 if ( f != c
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
66423254 1073 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1074 NULL, foldlen, do_utf8,
d07ddd77 1075 m,
3dab1dad 1076 NULL, ln, (bool)UTF))
3b0527fe 1077 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1078 goto got_it;
1079 }
1aa99e6b
IH
1080 s += len;
1081 }
09091399
JH
1082 }
1083 else {
1aa99e6b 1084 while (s <= e) {
89ebb4a3 1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1086 uniflags);
80aecb99 1087
60a8b682 1088 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
880bd946
JH
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1098
1099 if ( (c == c1 || c == c2)
1100 && (ln == len ||
3dab1dad
YO
1101 ibcmp_utf8(s, NULL, 0, do_utf8,
1102 m, NULL, ln, (bool)UTF))
3b0527fe 1103 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1104 goto got_it;
80aecb99 1105 else {
1df70142 1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1107 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1109 if ( f != c
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
a6872d42 1112 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1113 NULL, foldlen, do_utf8,
d07ddd77 1114 m,
3dab1dad 1115 NULL, ln, (bool)UTF))
3b0527fe 1116 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1117 goto got_it;
1118 }
1aa99e6b
IH
1119 s += len;
1120 }
09091399 1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 if (c1 == c2)
1125 while (s <= e) {
1126 if ( *(U8*)s == c1
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
3b0527fe 1130 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1131 goto got_it;
1132 s++;
1133 }
1134 else
1135 while (s <= e) {
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1138 ? ibcmp(s, m, ln)
1139 : ibcmp_locale(s, m, ln)))
3b0527fe 1140 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1141 goto got_it;
1142 s++;
1143 }
b3c9acc1
IZ
1144 }
1145 break;
bbce6d69 1146 case BOUNDL:
3280af22 1147 PL_reg_flags |= RF_tainted;
bbce6d69 1148 /* FALL THROUGH */
a0d0e21e 1149 case BOUND:
ffc61ed2 1150 if (do_utf8) {
12d33761 1151 if (s == PL_bostr)
ffc61ed2
JH
1152 tmp = '\n';
1153 else {
6136c704 1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1156 }
1157 tmp = ((OP(c) == BOUND ?
9041c2e3 1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1159 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1161 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1162 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1163 isALNUM_LC_utf8((U8*)s)))
1164 {
1165 tmp = !tmp;
3b0527fe 1166 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1167 goto got_it;
1168 }
078c425b 1169 s += uskip;
a687059c 1170 }
a0d0e21e 1171 }
667bb95a 1172 else {
12d33761 1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1176 if (tmp ==
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1178 tmp = !tmp;
3b0527fe 1179 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1180 goto got_it;
1181 }
1182 s++;
a0ed51b3 1183 }
a0ed51b3 1184 }
3b0527fe 1185 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1186 goto got_it;
1187 break;
bbce6d69 1188 case NBOUNDL:
3280af22 1189 PL_reg_flags |= RF_tainted;
bbce6d69 1190 /* FALL THROUGH */
a0d0e21e 1191 case NBOUND:
ffc61ed2 1192 if (do_utf8) {
12d33761 1193 if (s == PL_bostr)
ffc61ed2
JH
1194 tmp = '\n';
1195 else {
6136c704 1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1198 }
1199 tmp = ((OP(c) == NBOUND ?
9041c2e3 1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1201 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1203 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1204 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1205 isALNUM_LC_utf8((U8*)s)))
1206 tmp = !tmp;
3b0527fe 1207 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1208 goto got_it;
078c425b 1209 s += uskip;
ffc61ed2 1210 }
a0d0e21e 1211 }
667bb95a 1212 else {
12d33761 1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1217 if (tmp ==
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1219 tmp = !tmp;
3b0527fe 1220 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1221 goto got_it;
1222 s++;
1223 }
a0ed51b3 1224 }
3b0527fe 1225 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1226 goto got_it;
1227 break;
a0d0e21e 1228 case ALNUM:
ffc61ed2 1229 if (do_utf8) {
1a4fad37 1230 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1233 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
bbce6d69 1238 else
ffc61ed2 1239 tmp = 1;
078c425b 1240 s += uskip;
bbce6d69 1241 }
bbce6d69 1242 }
ffc61ed2
JH
1243 else {
1244 while (s < strend) {
1245 if (isALNUM(*s)) {
3b0527fe 1246 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1247 goto got_it;
1248 else
1249 tmp = doevery;
1250 }
a0ed51b3 1251 else
ffc61ed2
JH
1252 tmp = 1;
1253 s++;
a0ed51b3 1254 }
a0ed51b3
LW
1255 }
1256 break;
bbce6d69 1257 case ALNUML:
3280af22 1258 PL_reg_flags |= RF_tainted;
ffc61ed2 1259 if (do_utf8) {
078c425b 1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1261 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1262 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a687059c 1267 else
ffc61ed2 1268 tmp = 1;
078c425b 1269 s += uskip;
a0d0e21e 1270 }
a0d0e21e 1271 }
ffc61ed2
JH
1272 else {
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
3b0527fe 1275 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1276 goto got_it;
1277 else
1278 tmp = doevery;
1279 }
a0ed51b3 1280 else
ffc61ed2
JH
1281 tmp = 1;
1282 s++;
a0ed51b3 1283 }
a0ed51b3
LW
1284 }
1285 break;
a0d0e21e 1286 case NALNUM:
ffc61ed2 1287 if (do_utf8) {
1a4fad37 1288 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1291 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
bbce6d69 1296 else
ffc61ed2 1297 tmp = 1;
078c425b 1298 s += uskip;
bbce6d69 1299 }
bbce6d69 1300 }
ffc61ed2
JH
1301 else {
1302 while (s < strend) {
1303 if (!isALNUM(*s)) {
3b0527fe 1304 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1305 goto got_it;
1306 else
1307 tmp = doevery;
1308 }
a0ed51b3 1309 else
ffc61ed2
JH
1310 tmp = 1;
1311 s++;
a0ed51b3 1312 }
a0ed51b3
LW
1313 }
1314 break;
bbce6d69 1315 case NALNUML:
3280af22 1316 PL_reg_flags |= RF_tainted;
ffc61ed2 1317 if (do_utf8) {
078c425b 1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1319 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1320 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a687059c 1325 else
ffc61ed2 1326 tmp = 1;
078c425b 1327 s += uskip;
a687059c 1328 }
a0d0e21e 1329 }
ffc61ed2
JH
1330 else {
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
3b0527fe 1333 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1334 goto got_it;
1335 else
1336 tmp = doevery;
1337 }
a0ed51b3 1338 else
ffc61ed2
JH
1339 tmp = 1;
1340 s++;
a0ed51b3 1341 }
a0ed51b3
LW
1342 }
1343 break;
a0d0e21e 1344 case SPACE:
ffc61ed2 1345 if (do_utf8) {
1a4fad37 1346 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1349 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0d0e21e 1354 else
ffc61ed2 1355 tmp = 1;
078c425b 1356 s += uskip;
2304df62 1357 }
a0d0e21e 1358 }
ffc61ed2
JH
1359 else {
1360 while (s < strend) {
1361 if (isSPACE(*s)) {
3b0527fe 1362 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1363 goto got_it;
1364 else
1365 tmp = doevery;
1366 }
a0ed51b3 1367 else
ffc61ed2
JH
1368 tmp = 1;
1369 s++;
a0ed51b3 1370 }
a0ed51b3
LW
1371 }
1372 break;
bbce6d69 1373 case SPACEL:
3280af22 1374 PL_reg_flags |= RF_tainted;
ffc61ed2 1375 if (do_utf8) {
078c425b 1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1378 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
bbce6d69 1383 else
ffc61ed2 1384 tmp = 1;
078c425b 1385 s += uskip;
bbce6d69 1386 }
bbce6d69 1387 }
ffc61ed2
JH
1388 else {
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
3b0527fe 1391 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1392 goto got_it;
1393 else
1394 tmp = doevery;
1395 }
a0ed51b3 1396 else
ffc61ed2
JH
1397 tmp = 1;
1398 s++;
a0ed51b3 1399 }
a0ed51b3
LW
1400 }
1401 break;
a0d0e21e 1402 case NSPACE:
ffc61ed2 1403 if (do_utf8) {
1a4fad37 1404 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1407 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0d0e21e 1412 else
ffc61ed2 1413 tmp = 1;
078c425b 1414 s += uskip;
a687059c 1415 }
a0d0e21e 1416 }
ffc61ed2
JH
1417 else {
1418 while (s < strend) {
1419 if (!isSPACE(*s)) {
3b0527fe 1420 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1421 goto got_it;
1422 else
1423 tmp = doevery;
1424 }
a0ed51b3 1425 else
ffc61ed2
JH
1426 tmp = 1;
1427 s++;
a0ed51b3 1428 }
a0ed51b3
LW
1429 }
1430 break;
bbce6d69 1431 case NSPACEL:
3280af22 1432 PL_reg_flags |= RF_tainted;
ffc61ed2 1433 if (do_utf8) {
078c425b 1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1436 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
bbce6d69 1441 else
ffc61ed2 1442 tmp = 1;
078c425b 1443 s += uskip;
bbce6d69 1444 }
bbce6d69 1445 }
ffc61ed2
JH
1446 else {
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
3b0527fe 1449 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1450 goto got_it;
1451 else
1452 tmp = doevery;
1453 }
a0ed51b3 1454 else
ffc61ed2
JH
1455 tmp = 1;
1456 s++;
a0ed51b3 1457 }
a0ed51b3
LW
1458 }
1459 break;
a0d0e21e 1460 case DIGIT:
ffc61ed2 1461 if (do_utf8) {
1a4fad37 1462 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1465 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0d0e21e 1470 else
ffc61ed2 1471 tmp = 1;
078c425b 1472 s += uskip;
2b69d0c2 1473 }
a0d0e21e 1474 }
ffc61ed2
JH
1475 else {
1476 while (s < strend) {
1477 if (isDIGIT(*s)) {
3b0527fe 1478 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1479 goto got_it;
1480 else
1481 tmp = doevery;
1482 }
a0ed51b3 1483 else
ffc61ed2
JH
1484 tmp = 1;
1485 s++;
a0ed51b3 1486 }
a0ed51b3
LW
1487 }
1488 break;
b8c5462f
JH
1489 case DIGITL:
1490 PL_reg_flags |= RF_tainted;
ffc61ed2 1491 if (do_utf8) {
078c425b 1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1493 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1494 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2 1500 tmp = 1;
078c425b 1501 s += uskip;
b8c5462f 1502 }
b8c5462f 1503 }
ffc61ed2
JH
1504 else {
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
3b0527fe 1507 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1508 goto got_it;
1509 else
1510 tmp = doevery;
1511 }
b8c5462f 1512 else
ffc61ed2
JH
1513 tmp = 1;
1514 s++;
b8c5462f 1515 }
b8c5462f
JH
1516 }
1517 break;
a0d0e21e 1518 case NDIGIT:
ffc61ed2 1519 if (do_utf8) {
1a4fad37 1520 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1523 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0d0e21e 1528 else
ffc61ed2 1529 tmp = 1;
078c425b 1530 s += uskip;
a687059c 1531 }
a0d0e21e 1532 }
ffc61ed2
JH
1533 else {
1534 while (s < strend) {
1535 if (!isDIGIT(*s)) {
3b0527fe 1536 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1537 goto got_it;
1538 else
1539 tmp = doevery;
1540 }
a0ed51b3 1541 else
ffc61ed2
JH
1542 tmp = 1;
1543 s++;
a0ed51b3 1544 }
a0ed51b3
LW
1545 }
1546 break;
b8c5462f
JH
1547 case NDIGITL:
1548 PL_reg_flags |= RF_tainted;
ffc61ed2 1549 if (do_utf8) {
078c425b 1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1551 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1552 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
b8c5462f 1557 else
ffc61ed2 1558 tmp = 1;
078c425b 1559 s += uskip;
b8c5462f 1560 }
a0ed51b3 1561 }
ffc61ed2
JH
1562 else {
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
3b0527fe 1565 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1566 goto got_it;
1567 else
1568 tmp = doevery;
1569 }
cf93c79d 1570 else
ffc61ed2
JH
1571 tmp = 1;
1572 s++;
b8c5462f 1573 }
b8c5462f
JH
1574 }
1575 break;
07be1b83
YO
1576 case TRIE:
1577 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1578 {
1579 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1580 trie_type = do_utf8 ?
1581 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1582 : trie_plain;
1583 /* what trie are we using right now */
1584 reg_ac_data *aho
1585 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1586 reg_trie_data *trie=aho->trie;
1587
1588 const char *last_start = strend - trie->minlen;
1589 const char *real_start = s;
1590 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1591 SV *sv_points;
1592 U8 **points; /* map of where we were in the input string
1593 when reading a given string. For ASCII this
1594 is unnecessary overhead as the relationship
1595 is always 1:1, but for unicode, especially
1596 case folded unicode this is not true. */
f9e705e8 1597 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1598
1599 GET_RE_DEBUG_FLAGS_DECL;
1600
be8e71aa
YO
1601 /* We can't just allocate points here. We need to wrap it in
1602 * an SV so it gets freed properly if there is a croak while
1603 * running the match */
1604 ENTER;
1605 SAVETMPS;
1606 sv_points=newSV(maxlen * sizeof(U8 *));
1607 SvCUR_set(sv_points,
1608 maxlen * sizeof(U8 *));
1609 SvPOK_on(sv_points);
1610 sv_2mortal(sv_points);
1611 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1612
1613 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1614 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1615 s++;
1616 }
1617 }
1618
1619 while (s <= last_start) {
1620 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1621 U8 *uc = (U8*)s;
1622 U16 charid = 0;
1623 U32 base = 1;
1624 U32 state = 1;
1625 UV uvc = 0;
1626 STRLEN len = 0;
1627 STRLEN foldlen = 0;
1628 U8 *uscan = (U8*)NULL;
1629 U8 *leftmost = NULL;
1630
1631 U32 pointpos = 0;
1632
1633 while ( state && uc <= (U8*)strend ) {
1634 int failed=0;
1635 if (aho->states[ state ].wordnum) {
1636 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1637 if (!leftmost || lpos < leftmost)
1638 leftmost= lpos;
1639 if (base==0) break;
1640 }
1641 points[pointpos++ % maxlen]= uc;
1642 switch (trie_type) {
1643 case trie_utf8_fold:
1644 if ( foldlen>0 ) {
1645 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1646 foldlen -= len;
1647 uscan += len;
1648 len=0;
1649 } else {
07be1b83
YO
1650 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1651 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1652 foldlen -= UNISKIP( uvc );
1653 uscan = foldbuf + UNISKIP( uvc );
1654 }
1655 break;
1656 case trie_utf8:
1657 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1658 &len, uniflags );
1659 break;
1660 case trie_plain:
1661 uvc = (UV)*uc;
1662 len = 1;
1663 }
1664
1665 if (uvc < 256) {
1666 charid = trie->charmap[ uvc ];
1667 }
1668 else {
1669 charid = 0;
1670 if (trie->widecharmap) {
1671 SV** const svpp = hv_fetch(trie->widecharmap,
1672 (char*)&uvc, sizeof(UV), 0);
1673 if (svpp)
1674 charid = (U16)SvIV(*svpp);
1675 }
1676 }
1677 DEBUG_TRIE_EXECUTE_r(
1678 PerlIO_printf(Perl_debug_log,
1679 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1680 (int)((const char*)uc - real_start), charid, uvc)
1681 );
1682 uc += len;
1683
1684 do {
1685 U32 word = aho->states[ state ].wordnum;
1686 base = aho->states[ state ].trans.base;
1687
1688 DEBUG_TRIE_EXECUTE_r(
1689 PerlIO_printf( Perl_debug_log,
1690 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1691 failed ? "Fail transition to " : "",
1692 state, base, uvc, word)
1693 );
1694 if ( base ) {
1695 U32 tmp;
1696 if (charid &&
1697 (base + charid > trie->uniquecharcount )
1698 && (base + charid - 1 - trie->uniquecharcount
1699 < trie->lasttrans)
1700 && trie->trans[base + charid - 1 -
1701 trie->uniquecharcount].check == state
1702 && (tmp=trie->trans[base + charid - 1 -
1703 trie->uniquecharcount ].next))
1704 {
1705 state = tmp;
1706 break;
1707 }
1708 else {
1709 failed++;
1710 if ( state == 1 )
1711 break;
1712 else
1713 state = aho->fail[state];
1714 }
1715 }
1716 else {
1717 /* we must be accepting here */
1718 failed++;
1719 break;
1720 }
1721 } while(state);
1722 if (failed) {
1723 if (leftmost)
1724 break;
1725 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1726 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1727 uc++;
1728 }
1729 }
1730 }
1731 }
1732 if ( aho->states[ state ].wordnum ) {
1733 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1734 if (!leftmost || lpos < leftmost)
1735 leftmost = lpos;
1736 }
1737 DEBUG_TRIE_EXECUTE_r(
1738 PerlIO_printf( Perl_debug_log,
1739 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1740 "All done: ",
1741 state, base, uvc)
1742 );
1743 if (leftmost) {
1744 s = (char*)leftmost;
be8e71aa
YO
1745 if (!reginfo || regtry(reginfo, s)) {
1746 FREETMPS;
1747 LEAVE;
07be1b83 1748 goto got_it;
be8e71aa 1749 }
07be1b83
YO
1750 s = HOPc(s,1);
1751 } else {
1752 break;
1753 }
1754 }
be8e71aa
YO
1755 FREETMPS;
1756 LEAVE;
07be1b83
YO
1757 }
1758 break;
b3c9acc1 1759 default:
3c3eec57
GS
1760 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1761 break;
d6a28714 1762 }
6eb5f6b9
JH
1763 return 0;
1764 got_it:
1765 return s;
1766}
1767
1768/*
1769 - regexec_flags - match a regexp against a string
1770 */
1771I32
1772Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1773 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1774/* strend: pointer to null at end of string */
1775/* strbeg: real beginning of string */
1776/* minend: end of match must be >=minend after stringarg. */
1777/* data: May be used for some additional optimizations. */
1778/* nosave: For optimizations. */
1779{
97aff369 1780 dVAR;
6eb5f6b9
JH
1781 register char *s;
1782 register regnode *c;
1783 register char *startpos = stringarg;
6eb5f6b9
JH
1784 I32 minlen; /* must match at least this many chars */
1785 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1786 I32 end_shift = 0; /* Same for the end. */ /* CC */
1787 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1788 char *scream_olds = NULL;
3dab1dad 1789 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1790 const bool do_utf8 = DO_UTF8(sv);
2757e526 1791 I32 multiline;
2a782b5b 1792#ifdef DEBUGGING
2757e526
JH
1793 SV* dsv0;
1794 SV* dsv1;
2a782b5b 1795#endif
3b0527fe 1796 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1797
1798 GET_RE_DEBUG_FLAGS_DECL;
1799
9d4ba2ae 1800 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1801
1802 /* Be paranoid... */
1803 if (prog == NULL || startpos == NULL) {
1804 Perl_croak(aTHX_ "NULL regexp parameter");
1805 return 0;
1806 }
1807
2757e526 1808 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1809 reginfo.prog = prog;
2757e526
JH
1810
1811#ifdef DEBUGGING
1812 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1813 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1814#endif
1815
bac06658
JH
1816 RX_MATCH_UTF8_set(prog, do_utf8);
1817
6eb5f6b9 1818 minlen = prog->minlen;
61a36c01 1819 if (strend - startpos < minlen) {
a3621e74 1820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1821 "String too short [regexec_flags]...\n"));
1822 goto phooey;
1aa99e6b 1823 }
6eb5f6b9 1824
6eb5f6b9
JH
1825 /* Check validity of program. */
1826 if (UCHARAT(prog->program) != REG_MAGIC) {
1827 Perl_croak(aTHX_ "corrupted regexp program");
1828 }
1829
1830 PL_reg_flags = 0;
1831 PL_reg_eval_set = 0;
1832 PL_reg_maxiter = 0;
1833
1834 if (prog->reganch & ROPT_UTF8)
1835 PL_reg_flags |= RF_utf8;
1836
1837 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1838 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1839 PL_bostr = strbeg;
3b0527fe 1840 reginfo.sv = sv;
6eb5f6b9
JH
1841
1842 /* Mark end of line for $ (and such) */
1843 PL_regeol = strend;
1844
1845 /* see how far we have to get to not match where we matched before */
3b0527fe 1846 reginfo.till = startpos+minend;
6eb5f6b9 1847
6eb5f6b9
JH
1848 /* If there is a "must appear" string, look for it. */
1849 s = startpos;
1850
3b0527fe 1851 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1852 MAGIC *mg;
1853
1854 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1855 reginfo.ganch = startpos;
6eb5f6b9
JH
1856 else if (sv && SvTYPE(sv) >= SVt_PVMG
1857 && SvMAGIC(sv)
14befaf4
DM
1858 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1859 && mg->mg_len >= 0) {
3b0527fe 1860 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1861 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1862 if (s > reginfo.ganch)
6eb5f6b9 1863 goto phooey;
3b0527fe 1864 s = reginfo.ganch;
6eb5f6b9
JH
1865 }
1866 }
1867 else /* pos() not defined */
3b0527fe 1868 reginfo.ganch = strbeg;
6eb5f6b9
JH
1869 }
1870
a0714e2c 1871 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1872 re_scream_pos_data d;
1873
1874 d.scream_olds = &scream_olds;
1875 d.scream_pos = &scream_pos;
1876 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1877 if (!s) {
a3621e74 1878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1879 goto phooey; /* not present */
3fa9c3d7 1880 }
6eb5f6b9
JH
1881 }
1882
a3621e74 1883 DEBUG_EXECUTE_r({
1df70142
AL
1884 const char * const s0 = UTF
1885 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1886 UNI_DISPLAY_REGEX)
1887 : prog->precomp;
bb7a0f54 1888 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1889 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1890 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1891 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1892 if (!PL_colorset)
1893 reginitcolors();
1894 PerlIO_printf(Perl_debug_log,
a0288114 1895 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1896 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1897 len0, len0, s0,
2a782b5b 1898 PL_colors[1],
9e55ce06 1899 len0 > 60 ? "..." : "",
2a782b5b 1900 PL_colors[0],
9e55ce06
JH
1901 (int)(len1 > 60 ? 60 : len1),
1902 s1, PL_colors[1],
1903 (len1 > 60 ? "..." : "")
2a782b5b
JH
1904 );
1905 });
6eb5f6b9
JH
1906
1907 /* Simplest case: anchored match need be tried only once. */
1908 /* [unless only anchor is BOL and multiline is set] */
1909 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1910 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1911 goto got_it;
7fba1cd6 1912 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1913 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1914 {
1915 char *end;
1916
1917 if (minlen)
1918 dontbother = minlen - 1;
1aa99e6b 1919 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1920 /* for multiline we only have to try after newlines */
33b8afdf 1921 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1922 if (s == startpos)
1923 goto after_try;
1924 while (1) {
3b0527fe 1925 if (regtry(&reginfo, s))
6eb5f6b9
JH
1926 goto got_it;
1927 after_try:
1928 if (s >= end)
1929 goto phooey;
1930 if (prog->reganch & RE_USE_INTUIT) {
1931 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1932 if (!s)
1933 goto phooey;
1934 }
1935 else
1936 s++;
1937 }
1938 } else {
1939 if (s > startpos)
1940 s--;
1941 while (s < end) {
1942 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1943 if (regtry(&reginfo, s))
6eb5f6b9
JH
1944 goto got_it;
1945 }
1946 }
1947 }
1948 }
1949 goto phooey;
1950 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1951 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1952 goto got_it;
1953 goto phooey;
1954 }
1955
1956 /* Messy cases: unanchored match. */
33b8afdf 1957 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1958 /* we have /x+whatever/ */
1959 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1960 char ch;
bf93d4cc
GS
1961#ifdef DEBUGGING
1962 int did_match = 0;
1963#endif
33b8afdf
JH
1964 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1965 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1966 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1967
1aa99e6b 1968 if (do_utf8) {
6eb5f6b9
JH
1969 while (s < strend) {
1970 if (*s == ch) {
a3621e74 1971 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1972 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1973 s += UTF8SKIP(s);
1974 while (s < strend && *s == ch)
1975 s += UTF8SKIP(s);
1976 }
1977 s += UTF8SKIP(s);
1978 }
1979 }
1980 else {
1981 while (s < strend) {
1982 if (*s == ch) {
a3621e74 1983 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1984 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1985 s++;
1986 while (s < strend && *s == ch)
1987 s++;
1988 }
1989 s++;
1990 }
1991 }
a3621e74 1992 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1993 PerlIO_printf(Perl_debug_log,
b7953727
JH
1994 "Did not find anchored character...\n")
1995 );
6eb5f6b9 1996 }
a0714e2c
SS
1997 else if (prog->anchored_substr != NULL
1998 || prog->anchored_utf8 != NULL
1999 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2000 && prog->float_max_offset < strend - s)) {
2001 SV *must;
2002 I32 back_max;
2003 I32 back_min;
2004 char *last;
6eb5f6b9 2005 char *last1; /* Last position checked before */
bf93d4cc
GS
2006#ifdef DEBUGGING
2007 int did_match = 0;
2008#endif
33b8afdf
JH
2009 if (prog->anchored_substr || prog->anchored_utf8) {
2010 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2011 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2012 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2013 back_max = back_min = prog->anchored_offset;
2014 } else {
2015 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2016 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2017 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2018 back_max = prog->float_max_offset;
2019 back_min = prog->float_min_offset;
2020 }
2021 if (must == &PL_sv_undef)
2022 /* could not downgrade utf8 check substring, so must fail */
2023 goto phooey;
2024
2025 last = HOP3c(strend, /* Cannot start after this */
2026 -(I32)(CHR_SVLEN(must)
2027 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2028
2029 if (s > PL_bostr)
2030 last1 = HOPc(s, -1);
2031 else
2032 last1 = s - 1; /* bogus */
2033
a0288114 2034 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2035 check_substr==must. */
2036 scream_pos = -1;
2037 dontbother = end_shift;
2038 strend = HOPc(strend, -dontbother);
2039 while ( (s <= last) &&
9041c2e3 2040 ((flags & REXEC_SCREAM)
1aa99e6b 2041 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2042 end_shift, &scream_pos, 0))
1aa99e6b 2043 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2044 (unsigned char*)strend, must,
7fba1cd6 2045 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2046 /* we may be pointing at the wrong string */
2047 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2048 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2049 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2050 if (HOPc(s, -back_max) > last1) {
2051 last1 = HOPc(s, -back_min);
2052 s = HOPc(s, -back_max);
2053 }
2054 else {
52657f30 2055 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2056
2057 last1 = HOPc(s, -back_min);
52657f30 2058 s = t;
6eb5f6b9 2059 }
1aa99e6b 2060 if (do_utf8) {
6eb5f6b9 2061 while (s <= last1) {
3b0527fe 2062 if (regtry(&reginfo, s))
6eb5f6b9
JH
2063 goto got_it;
2064 s += UTF8SKIP(s);
2065 }
2066 }
2067 else {
2068 while (s <= last1) {
3b0527fe 2069 if (regtry(&reginfo, s))
6eb5f6b9
JH
2070 goto got_it;
2071 s++;
2072 }
2073 }
2074 }
a3621e74 2075 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2076 PerlIO_printf(Perl_debug_log,
a0288114 2077 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2078 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2079 ? "anchored" : "floating"),
2080 PL_colors[0],
2081 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2082 SvPVX_const(must),
b7953727
JH
2083 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2084 );
6eb5f6b9
JH
2085 goto phooey;
2086 }
155aba94 2087 else if ((c = prog->regstclass)) {
f14c76ed 2088 if (minlen) {
be8e71aa 2089 const OPCODE op = OP(prog->regstclass);
66e933ab 2090 /* don't bother with what can't match */
07be1b83 2091 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2092 strend = HOPc(strend, -(minlen - 1));
2093 }
a3621e74 2094 DEBUG_EXECUTE_r({
be8e71aa 2095 SV * const prop = sv_newmortal();
cfd0369c
NC
2096 const char *s0;
2097 const char *s1;
9e55ce06
JH
2098 int len0;
2099 int len1;
2100
32fc9b6a 2101 regprop(prog, prop, c);
9e55ce06 2102 s0 = UTF ?
3f7c398e 2103 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2104 UNI_DISPLAY_REGEX) :
cfd0369c 2105 SvPVX_const(prop);
9e55ce06
JH
2106 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2107 s1 = UTF ?
c728cb41 2108 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2109 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2110 PerlIO_printf(Perl_debug_log,
07be1b83 2111 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2112 len0, len0, s0,
07be1b83 2113 len1, len1, s1, (int)(strend - s));
ffc61ed2 2114 });
3b0527fe 2115 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2116 goto got_it;
07be1b83 2117 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2118 }
2119 else {
2120 dontbother = 0;
a0714e2c 2121 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2122 /* Trim the end. */
d6a28714 2123 char *last;
33b8afdf
JH
2124 SV* float_real;
2125
2126 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2127 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2128 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2129
2130 if (flags & REXEC_SCREAM) {
33b8afdf 2131 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2132 end_shift, &scream_pos, 1); /* last one */
2133 if (!last)
ffc61ed2 2134 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2135 /* we may be pointing at the wrong string */
2136 else if (RX_MATCH_COPIED(prog))
3f7c398e 2137 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2138 }
d6a28714
JH
2139 else {
2140 STRLEN len;
cfd0369c 2141 const char * const little = SvPV_const(float_real, len);
d6a28714 2142
33b8afdf 2143 if (SvTAIL(float_real)) {
d6a28714
JH
2144 if (memEQ(strend - len + 1, little, len - 1))
2145 last = strend - len + 1;
7fba1cd6 2146 else if (!multiline)
9041c2e3 2147 last = memEQ(strend - len, little, len)
bd61b366 2148 ? strend - len : NULL;
b8c5462f 2149 else
d6a28714
JH
2150 goto find_last;
2151 } else {
2152 find_last:
9041c2e3 2153 if (len)
d6a28714 2154 last = rninstr(s, strend, little, little + len);
b8c5462f 2155 else
a0288114 2156 last = strend; /* matching "$" */
b8c5462f 2157 }
b8c5462f 2158 }
bf93d4cc 2159 if (last == NULL) {
a3621e74 2160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2161 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2162 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2163 goto phooey; /* Should not happen! */
2164 }
d6a28714
JH
2165 dontbother = strend - last + prog->float_min_offset;
2166 }
2167 if (minlen && (dontbother < minlen))
2168 dontbother = minlen - 1;
2169 strend -= dontbother; /* this one's always in bytes! */
2170 /* We don't know much -- general case. */
1aa99e6b 2171 if (do_utf8) {
d6a28714 2172 for (;;) {
3b0527fe 2173 if (regtry(&reginfo, s))
d6a28714
JH
2174 goto got_it;
2175 if (s >= strend)
2176 break;
b8c5462f 2177 s += UTF8SKIP(s);
d6a28714
JH
2178 };
2179 }
2180 else {
2181 do {
3b0527fe 2182 if (regtry(&reginfo, s))
d6a28714
JH
2183 goto got_it;
2184 } while (s++ < strend);
2185 }
2186 }
2187
2188 /* Failure. */
2189 goto phooey;
2190
2191got_it:
2192 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2193
2194 if (PL_reg_eval_set) {
2195 /* Preserve the current value of $^R */
2196 if (oreplsv != GvSV(PL_replgv))
2197 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2198 restored, the value remains
2199 the same. */
4f639d21 2200 restore_pos(aTHX_ prog);
d6a28714
JH
2201 }
2202
2203 /* make sure $`, $&, $', and $digit will work later */
2204 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2205 RX_MATCH_COPY_FREE(prog);
d6a28714 2206 if (flags & REXEC_COPY_STR) {
be8e71aa 2207 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2208#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2209 if ((SvIsCOW(sv)
2210 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2211 if (DEBUG_C_TEST) {
2212 PerlIO_printf(Perl_debug_log,
2213 "Copy on write: regexp capture, type %d\n",
2214 (int) SvTYPE(sv));
2215 }
2216 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2217 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2218 assert (SvPOKp(prog->saved_copy));
2219 } else
2220#endif
2221 {
2222 RX_MATCH_COPIED_on(prog);
2223 s = savepvn(strbeg, i);
2224 prog->subbeg = s;
2225 }
d6a28714 2226 prog->sublen = i;
d6a28714
JH
2227 }
2228 else {
2229 prog->subbeg = strbeg;
2230 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2231 }
2232 }
9041c2e3 2233
d6a28714
JH
2234 return 1;
2235
2236phooey:
a3621e74 2237 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2238 PL_colors[4], PL_colors[5]));
d6a28714 2239 if (PL_reg_eval_set)
4f639d21 2240 restore_pos(aTHX_ prog);
d6a28714
JH
2241 return 0;
2242}
2243
2244/*
2245 - regtry - try match at specific point
2246 */
2247STATIC I32 /* 0 failure, 1 success */
3b0527fe 2248S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2249{
97aff369 2250 dVAR;
d6a28714
JH
2251 register I32 *sp;
2252 register I32 *ep;
2253 CHECKPOINT lastcp;
3b0527fe 2254 regexp *prog = reginfo->prog;
a3621e74 2255 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2256
02db2b7b
IZ
2257#ifdef DEBUGGING
2258 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2259#endif
d6a28714
JH
2260 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2261 MAGIC *mg;
2262
2263 PL_reg_eval_set = RS_init;
a3621e74 2264 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2265 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2266 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2267 ));
e8347627 2268 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2269 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2270 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2271 SAVETMPS;
2272 /* Apparently this is not needed, judging by wantarray. */
e8347627 2273 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2274 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2275
3b0527fe 2276 if (reginfo->sv) {
d6a28714 2277 /* Make $_ available to executed code. */
3b0527fe 2278 if (reginfo->sv != DEFSV) {
59f00321 2279 SAVE_DEFSV;
3b0527fe 2280 DEFSV = reginfo->sv;
b8c5462f 2281 }
d6a28714 2282
3b0527fe
DM
2283 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2284 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2285 /* prepare for quick setting of pos */
d300d9fa
NC
2286#ifdef PERL_OLD_COPY_ON_WRITE
2287 if (SvIsCOW(sv))
2288 sv_force_normal_flags(sv, 0);
2289#endif
3dab1dad 2290 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2291 &PL_vtbl_mglob, NULL, 0);
d6a28714 2292 mg->mg_len = -1;
b8c5462f 2293 }
d6a28714
JH
2294 PL_reg_magic = mg;
2295 PL_reg_oldpos = mg->mg_len;
4f639d21 2296 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2297 }
09687e5a 2298 if (!PL_reg_curpm) {
a02a5408 2299 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2300#ifdef USE_ITHREADS
2301 {
be8e71aa 2302 SV* const repointer = newSViv(0);
577e12cc 2303 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2304 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2305 av_push(PL_regex_padav,repointer);
2306 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2307 PL_regex_pad = AvARRAY(PL_regex_padav);
2308 }
2309#endif
2310 }
aaa362c4 2311 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2312 PL_reg_oldcurpm = PL_curpm;
2313 PL_curpm = PL_reg_curpm;
2314 if (RX_MATCH_COPIED(prog)) {
2315 /* Here is a serious problem: we cannot rewrite subbeg,
2316 since it may be needed if this match fails. Thus
2317 $` inside (?{}) could fail... */
2318 PL_reg_oldsaved = prog->subbeg;
2319 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2320#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2321 PL_nrs = prog->saved_copy;
2322#endif
d6a28714
JH
2323 RX_MATCH_COPIED_off(prog);
2324 }
2325 else
bd61b366 2326 PL_reg_oldsaved = NULL;
d6a28714
JH
2327 prog->subbeg = PL_bostr;
2328 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2329 }
973dddac 2330 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2331 PL_reginput = startpos;
2332 PL_regstartp = prog->startp;
2333 PL_regendp = prog->endp;
2334 PL_reglastparen = &prog->lastparen;
a01268b5 2335 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2336 prog->lastparen = 0;
03994de8 2337 prog->lastcloseparen = 0;
d6a28714 2338 PL_regsize = 0;
a3621e74 2339 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2340 if (PL_reg_start_tmpl <= prog->nparens) {
2341 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2342 if(PL_reg_start_tmp)
2343 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2344 else
a02a5408 2345 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2346 }
2347
2348 /* XXXX What this code is doing here?!!! There should be no need
2349 to do this again and again, PL_reglastparen should take care of
3dd2943c 2350 this! --ilya*/
dafc8851
JH
2351
2352 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2353 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2354 * PL_reglastparen), is not needed at all by the test suite
2355 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2356 * enough, for building DynaLoader, or otherwise this
2357 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2358 * will happen. Meanwhile, this code *is* needed for the
2359 * above-mentioned test suite tests to succeed. The common theme
2360 * on those tests seems to be returning null fields from matches.
2361 * --jhi */
dafc8851 2362#if 1
d6a28714
JH
2363 sp = prog->startp;
2364 ep = prog->endp;
2365 if (prog->nparens) {
097eb12c 2366 register I32 i;
eb160463 2367 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2368 *++sp = -1;
2369 *++ep = -1;
2370 }
2371 }
dafc8851 2372#endif
02db2b7b 2373 REGCP_SET(lastcp);
3b0527fe 2374 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2375 prog->endp[0] = PL_reginput - PL_bostr;
2376 return 1;
2377 }
02db2b7b 2378 REGCP_UNWIND(lastcp);
d6a28714
JH
2379 return 0;
2380}
2381
02db2b7b 2382
8ba1375e
MJD
2383#define sayYES goto yes
2384#define sayNO goto no
e0f9d4a8 2385#define sayNO_ANYOF goto no_anyof
8ba1375e 2386#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2387#define sayNO_FINAL goto no_final
2388#define sayNO_SILENT goto do_no
2389#define saySAME(x) if (x) goto yes; else goto no
2390
3ab3c9b4
HS
2391#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2392#define POSCACHE_SEEN 1 /* we know what we're caching */
2393#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2394
3ab3c9b4 2395#define CACHEsayYES STMT_START { \
d8319b27 2396 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2397 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2398 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2399 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2400 } \
2401 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2402 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2403 } \
2404 else { \
3ab3c9b4
HS
2405 /* cache records failure, but this is success */ \
2406 DEBUG_r( \
2407 PerlIO_printf(Perl_debug_log, \
2408 "%*s (remove success from failure cache)\n", \
2409 REPORT_CODE_OFF+PL_regindent*2, "") \
2410 ); \
d8319b27 2411 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2412 } \
2413 } \
2414 sayYES; \
2415} STMT_END
7409bbd3 2416
3ab3c9b4 2417#define CACHEsayNO STMT_START { \
d8319b27 2418 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2419 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2420 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2421 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2422 } \
2423 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2424 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2425 } \
2426 else { \
3ab3c9b4
HS
2427 /* cache records success, but this is failure */ \
2428 DEBUG_r( \
2429 PerlIO_printf(Perl_debug_log, \
2430 "%*s (remove failure from success cache)\n", \
2431 REPORT_CODE_OFF+PL_regindent*2, "") \
2432 ); \
d8319b27 2433 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2434 } \
2435 } \
2436 sayNO; \
2437} STMT_END
2438
a3621e74
YO
2439/* this is used to determine how far from the left messages like
2440 'failed...' are printed. Currently 29 makes these messages line
2441 up with the opcode they refer to. Earlier perls used 25 which
2442 left these messages outdented making reviewing a debug output
2443 quite difficult.
2444*/
2445#define REPORT_CODE_OFF 29
2446
2447
2448/* Make sure there is a test for this +1 options in re_tests */
2449#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2450
40a82448
DM
2451#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2452#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2453
86545054
DM
2454#define SLAB_FIRST(s) (&(s)->states[0])
2455#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2456
5d9a96ca
DM
2457/* grab a new slab and return the first slot in it */
2458
2459STATIC regmatch_state *
2460S_push_slab(pTHX)
2461{
54df2634
NC
2462#if PERL_VERSION < 9
2463 dMY_CXT;
2464#endif
5d9a96ca
DM
2465 regmatch_slab *s = PL_regmatch_slab->next;
2466 if (!s) {
2467 Newx(s, 1, regmatch_slab);
2468 s->prev = PL_regmatch_slab;
2469 s->next = NULL;
2470 PL_regmatch_slab->next = s;
2471 }
2472 PL_regmatch_slab = s;
86545054 2473 return SLAB_FIRST(s);
5d9a96ca 2474}
5b47454d 2475
95b24440
DM
2476/* simulate a recursive call to regmatch */
2477
2478#define REGMATCH(ns, where) \
5d9a96ca
DM
2479 st->scan = scan; \
2480 scan = (ns); \
2481 st->resume_state = resume_##where; \
95b24440
DM
2482 goto start_recurse; \
2483 resume_point_##where:
2484
40a82448
DM
2485/* push a new state then goto it */
2486
2487#define PUSH_STATE_GOTO(state, node) \
2488 scan = node; \
2489 st->resume_state = state; \
2490 goto push_state;
2491
2492/* push a new state with success backtracking, then goto it */
2493
2494#define PUSH_YES_STATE_GOTO(state, node) \
2495 scan = node; \
2496 st->resume_state = state; \
2497 goto push_yes_state;
2498
aa283a38 2499
aa283a38 2500
d6a28714
JH
2501/*
2502 - regmatch - main matching routine
2503 *
2504 * Conceptually the strategy is simple: check to see whether the current
2505 * node matches, call self recursively to see whether the rest matches,
2506 * and then act accordingly. In practice we make some effort to avoid
2507 * recursion, in particular by going through "ordinary" nodes (that don't
2508 * need to know whether the rest of the match failed) by a loop instead of
2509 * by recursion.
2510 */
2511/* [lwall] I've hoisted the register declarations to the outer block in order to
2512 * maybe save a little bit of pushing and popping on the stack. It also takes
2513 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2514 *
2515 * This function used to be heavily recursive, but since this had the
2516 * effect of blowing the CPU stack on complex regexes, it has been
2517 * restructured to be iterative, and to save state onto the heap rather
2518 * than the stack. Essentially whereever regmatch() used to be called, it
2519 * pushes the current state, notes where to return, then jumps back into
2520 * the main loop.
2521 *
2522 * Originally the structure of this function used to look something like
2523
2524 S_regmatch() {
2525 int a = 1, b = 2;
2526 ...
2527 while (scan != NULL) {
5d9a96ca 2528 a++; // do stuff with a and b
95b24440
DM
2529 ...
2530 switch (OP(scan)) {
2531 case FOO: {
2532 int local = 3;
2533 ...
2534 if (regmatch(...)) // recurse
2535 goto yes;
2536 }
2537 ...
2538 }
2539 }
2540 yes:
2541 return 1;
2542 }
2543
2544 * Now it looks something like this:
2545
5d9a96ca 2546 typedef struct {
95b24440
DM
2547 int a, b, local;
2548 int resume_state;
5d9a96ca 2549 } regmatch_state;
95b24440
DM
2550
2551 S_regmatch() {
5d9a96ca
DM
2552 regmatch_state *st = new();
2553 int depth=0;
2554 st->a++; // do stuff with a and b
95b24440
DM
2555 ...
2556 while (scan != NULL) {
2557 ...
2558 switch (OP(scan)) {
2559 case FOO: {
5d9a96ca 2560 st->local = 3;
95b24440 2561 ...
5d9a96ca
DM
2562 st->scan = scan;
2563 scan = ...;
2564 st->resume_state = resume_FOO;
2565 goto start_recurse; // recurse
95b24440 2566
5d9a96ca
DM
2567 resume_point_FOO:
2568 if (result)
95b24440
DM
2569 goto yes;
2570 }
2571 ...
2572 }
5d9a96ca
DM
2573 start_recurse:
2574 st = new(); push a new state
2575 st->a = 1; st->b = 2;
2576 depth++;
95b24440 2577 }
5d9a96ca 2578 yes:
95b24440 2579 result = 1;
5d9a96ca
DM
2580 if (depth--) {
2581 st = pop();
95b24440
DM
2582 switch (resume_state) {
2583 case resume_FOO:
2584 goto resume_point_FOO;
2585 ...
2586 }
2587 }
2588 return result
2589 }
2590
2591 * WARNING: this means that any line in this function that contains a
2592 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2593 * regmatch() using gotos instead. Thus the values of any local variables
2594 * not saved in the regmatch_state structure will have been lost when
2595 * execution resumes on the next line .
5d9a96ca
DM
2596 *
2597 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2598 * PL_regmatch_state always points to the currently active state, and
2599 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2600 * The first time regmatch is called, the first slab is allocated, and is
2601 * never freed until interpreter desctruction. When the slab is full,
2602 * a new one is allocated chained to the end. At exit from regmatch, slabs
2603 * allocated since entry are freed.
d6a28714 2604 */
95b24440 2605
40a82448
DM
2606/* *** every FOO_fail should = FOO+1 */
2607#define resume_TRIE1 (REGNODE_MAX+1)
2608#define resume_TRIE2 (REGNODE_MAX+2)
2609#define EVAL_A (REGNODE_MAX+3)
2610#define EVAL_A_fail (REGNODE_MAX+4)
2611#define resume_CURLYX (REGNODE_MAX+5)
2612#define resume_WHILEM1 (REGNODE_MAX+6)
2613#define resume_WHILEM2 (REGNODE_MAX+7)
2614#define resume_WHILEM3 (REGNODE_MAX+8)
2615#define resume_WHILEM4 (REGNODE_MAX+9)
2616#define resume_WHILEM5 (REGNODE_MAX+10)
2617#define resume_WHILEM6 (REGNODE_MAX+11)
2618#define BRANCH_next (REGNODE_MAX+12)
2619#define BRANCH_next_fail (REGNODE_MAX+13)
2620#define CURLYM_A (REGNODE_MAX+14)
2621#define CURLYM_A_fail (REGNODE_MAX+15)
2622#define CURLYM_B (REGNODE_MAX+16)
2623#define CURLYM_B_fail (REGNODE_MAX+17)
2624#define IFMATCH_A (REGNODE_MAX+18)
2625#define IFMATCH_A_fail (REGNODE_MAX+19)
2626#define resume_PLUS1 (REGNODE_MAX+20)
2627#define resume_PLUS2 (REGNODE_MAX+21)
2628#define resume_PLUS3 (REGNODE_MAX+22)
2629#define resume_PLUS4 (REGNODE_MAX+23)
2630
2631
2632
3dab1dad 2633#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2634
07be1b83
YO
2635#ifdef DEBUGGING
2636STATIC void
2637S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2638{
2639 const int docolor = *PL_colors[0];
2640 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2641 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2642 /* The part of the string before starttry has one color
2643 (pref0_len chars), between starttry and current
2644 position another one (pref_len - pref0_len chars),
2645 after the current position the third one.
2646 We assume that pref0_len <= pref_len, otherwise we
2647 decrease pref0_len. */
2648 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2649 ? (5 + taill) - l : locinput - PL_bostr;
2650 int pref0_len;
2651
2652 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2653 pref_len++;
2654 pref0_len = pref_len - (locinput - PL_reg_starttry);
2655 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2656 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2657 ? (5 + taill) - pref_len : PL_regeol - locinput);
2658 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2659 l--;
2660 if (pref0_len < 0)
2661 pref0_len = 0;
2662 if (pref0_len > pref_len)
2663 pref0_len = pref_len;
2664 {
2665 const char * const s0 =
2666 do_utf8 && OP(scan) != CANY ?
2667 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2668 pref0_len, 60, UNI_DISPLAY_REGEX) :
2669 locinput - pref_len;
2670 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2671 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2672 pv_uni_display(PERL_DEBUG_PAD(1),
2673 (U8*)(locinput - pref_len + pref0_len),
2674 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2675 locinput - pref_len + pref0_len;
2676 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2677 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2678 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2679 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2680 locinput;
2681 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2682 PerlIO_printf(Perl_debug_log,
2683 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2684 (IV)(locinput - PL_bostr),
2685 PL_colors[4],
2686 len0, s0,
2687 PL_colors[5],
2688 PL_colors[2],
2689 len1, s1,
2690 PL_colors[3],
2691 (docolor ? "" : "> <"),
2692 PL_colors[0],
2693 len2, s2,
2694 PL_colors[1],
2695 15 - l - pref_len + 1,
2696 "");
2697 }
2698}
2699#endif
2700
d6a28714 2701STATIC I32 /* 0 failure, 1 success */
3b0527fe 2702S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2703{
54df2634
NC
2704#if PERL_VERSION < 9
2705 dMY_CXT;
2706#endif
27da23d5 2707 dVAR;
95b24440 2708 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2709 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2710
3b0527fe
DM
2711 regexp *rex = reginfo->prog;
2712
5d9a96ca
DM
2713 regmatch_slab *orig_slab;
2714 regmatch_state *orig_state;
a3621e74 2715
5d9a96ca
DM
2716 /* the current state. This is a cached copy of PL_regmatch_state */
2717 register regmatch_state *st;
95b24440 2718
5d9a96ca
DM
2719 /* cache heavy used fields of st in registers */
2720 register regnode *scan;
2721 register regnode *next;
2722 register I32 n = 0; /* initialize to shut up compiler warning */
2723 register char *locinput = PL_reginput;
95b24440 2724
5d9a96ca
DM
2725 /* these variables are NOT saved during a recusive RFEGMATCH: */
2726 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2727 bool result; /* return value of S_regmatch */
5d9a96ca 2728 int depth = 0; /* depth of recursion */
77cb431f
DM
2729 regmatch_state *yes_state = NULL; /* state to pop to on success of
2730 subpattern */
40a82448 2731 U32 state_num;
95b24440
DM
2732
2733#ifdef DEBUGGING
e68ec53f 2734 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2735 PL_regindent++;
2736#endif
2737
5d9a96ca
DM
2738 /* on first ever call to regmatch, allocate first slab */
2739 if (!PL_regmatch_slab) {
2740 Newx(PL_regmatch_slab, 1, regmatch_slab);
2741 PL_regmatch_slab->prev = NULL;
2742 PL_regmatch_slab->next = NULL;
86545054 2743 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2744 }
2745
2746 /* remember current high-water mark for exit */
2747 /* XXX this should be done with SAVE* instead */
2748 orig_slab = PL_regmatch_slab;
2749 orig_state = PL_regmatch_state;
2750
2751 /* grab next free state slot */
2752 st = ++PL_regmatch_state;
86545054 2753 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2754 st = PL_regmatch_state = S_push_slab(aTHX);
2755
2756 st->minmod = 0;
2757 st->sw = 0;
2758 st->logical = 0;
5d9a96ca 2759 st->cc = NULL;
d6a28714
JH
2760 /* Note that nextchr is a byte even in UTF */
2761 nextchr = UCHARAT(locinput);
2762 scan = prog;
2763 while (scan != NULL) {
8ba1375e 2764
a3621e74 2765 DEBUG_EXECUTE_r( {
6136c704 2766 SV * const prop = sv_newmortal();
07be1b83 2767 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2768 regprop(rex, prop, scan);
07be1b83
YO
2769
2770 PerlIO_printf(Perl_debug_log,
2771 "%3"IVdf":%*s%s(%"IVdf")\n",
2772 (IV)(scan - rex->program), PL_regindent*2, "",
2773 SvPVX_const(prop),
2774 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2775 });
d6a28714
JH
2776
2777 next = scan + NEXT_OFF(scan);
2778 if (next == scan)
2779 next = NULL;
40a82448 2780 state_num = OP(scan);
d6a28714 2781
40a82448
DM
2782 reenter_switch:
2783 switch (state_num) {
d6a28714 2784 case BOL:
7fba1cd6 2785 if (locinput == PL_bostr)
d6a28714 2786 {
3b0527fe 2787 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2788 break;
2789 }
d6a28714
JH
2790 sayNO;
2791 case MBOL:
12d33761
HS
2792 if (locinput == PL_bostr ||
2793 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2794 {
b8c5462f
JH
2795 break;
2796 }
d6a28714
JH
2797 sayNO;
2798 case SBOL:
c2a73568 2799 if (locinput == PL_bostr)
b8c5462f 2800 break;
d6a28714
JH
2801 sayNO;
2802 case GPOS:
3b0527fe 2803 if (locinput == reginfo->ganch)
d6a28714
JH
2804 break;
2805 sayNO;
2806 case EOL:
d6a28714
JH
2807 goto seol;
2808 case MEOL:
d6a28714 2809 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2810 sayNO;
b8c5462f 2811 break;
d6a28714
JH
2812 case SEOL:
2813 seol:
2814 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2815 sayNO;
d6a28714 2816 if (PL_regeol - locinput > 1)
b8c5462f 2817 sayNO;
b8c5462f 2818 break;
d6a28714
JH
2819 case EOS:
2820 if (PL_regeol != locinput)
b8c5462f 2821 sayNO;
d6a28714 2822 break;
ffc61ed2 2823 case SANY:
d6a28714 2824 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2825 sayNO;
f33976b4
DB
2826 if (do_utf8) {
2827 locinput += PL_utf8skip[nextchr];
2828 if (locinput > PL_regeol)
2829 sayNO;
2830 nextchr = UCHARAT(locinput);
2831 }
2832 else
2833 nextchr = UCHARAT(++locinput);
2834 break;
2835 case CANY:
2836 if (!nextchr && locinput >= PL_regeol)
2837 sayNO;
b8c5462f 2838 nextchr = UCHARAT(++locinput);
a0d0e21e 2839 break;
ffc61ed2 2840 case REG_ANY:
1aa99e6b
IH
2841 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2842 sayNO;
2843 if (do_utf8) {
b8c5462f 2844 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2845 if (locinput > PL_regeol)
2846 sayNO;
a0ed51b3 2847 nextchr = UCHARAT(locinput);
a0ed51b3 2848 }
1aa99e6b
IH
2849 else
2850 nextchr = UCHARAT(++locinput);
a0ed51b3 2851 break;
5b47454d 2852 case TRIE:
3dab1dad 2853 {
07be1b83 2854 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2855 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2856 trie_type = do_utf8 ?
2857 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2858 : trie_plain;
2859
2860 /* what trie are we using right now */
be8e71aa 2861 reg_trie_data * const trie
3dab1dad
YO
2862 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2863 U32 state = trie->startstate;
2864
2865 if (trie->bitmap && trie_type != trie_utf8_fold &&
2866 !TRIE_BITMAP_TEST(trie,*locinput)
2867 ) {
2868 if (trie->states[ state ].wordnum) {
2869 DEBUG_EXECUTE_r(
2870 PerlIO_printf(Perl_debug_log,
2871 "%*s %smatched empty string...%s\n",
2872 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2873 );
2874 break;
2875 } else {
2876 DEBUG_EXECUTE_r(
2877 PerlIO_printf(Perl_debug_log,
2878 "%*s %sfailed to match start class...%s\n",
2879 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2880 );
2881 sayNO_SILENT;
2882 }
2883 }
a3621e74 2884 {
07be1b83
YO
2885 /*
2886 traverse the TRIE keeping track of all accepting states
2887 we transition through until we get to a failing node.
2888 */
2889
a3621e74 2890 U8 *uc = ( U8* )locinput;
a3621e74
YO
2891 U16 charid = 0;
2892 U32 base = 0;
2893 UV uvc = 0;
2894 STRLEN len = 0;
2895 STRLEN foldlen = 0;
a3621e74
YO
2896 U8 *uscan = (U8*)NULL;
2897 STRLEN bufflen=0;
95b24440 2898 SV *sv_accept_buff = NULL;
f9e705e8 2899 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5b47454d 2900
d8319b27 2901 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2902 result = 0;
a3621e74
YO
2903
2904 while ( state && uc <= (U8*)PL_regeol ) {
2905
5b47454d 2906 if (trie->states[ state ].wordnum) {
d8319b27 2907 if (!st->u.trie.accepted ) {
5b47454d
DM
2908 ENTER;
2909 SAVETMPS;
2910 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2911 sv_accept_buff=newSV(bufflen *
2912 sizeof(reg_trie_accepted) - 1);
2913 SvCUR_set(sv_accept_buff,
2914 sizeof(reg_trie_accepted));
2915 SvPOK_on(sv_accept_buff);
2916 sv_2mortal(sv_accept_buff);
d8319b27 2917 st->u.trie.accept_buff =
5b47454d
DM
2918 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2919 }
2920 else {
d8319b27 2921 if (st->u.trie.accepted >= bufflen) {
5b47454d 2922 bufflen *= 2;
d8319b27 2923 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2924 SvGROW(sv_accept_buff,
2925 bufflen * sizeof(reg_trie_accepted));
2926 }
2927 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2928 + sizeof(reg_trie_accepted));
2929 }
d8319b27
DM
2930 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2931 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2932 ++st->u.trie.accepted;
5b47454d 2933 }
a3621e74
YO
2934
2935 base = trie->states[ state ].trans.base;
2936
07be1b83
YO
2937 DEBUG_TRIE_EXECUTE_r({
2938 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2939 PerlIO_printf( Perl_debug_log,
e4584336 2940 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2941 2+PL_regindent * 2, "", PL_colors[4],
d8319b27 2942 (UV)state, (UV)base, (UV)st->u.trie.accepted );
07be1b83 2943 });
a3621e74
YO
2944
2945 if ( base ) {
5b47454d 2946 switch (trie_type) {
3dab1dad 2947 case trie_utf8_fold:
a3621e74
YO
2948 if ( foldlen>0 ) {
2949 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2950 foldlen -= len;
2951 uscan += len;
2952 len=0;
2953 } else {
2954 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2955 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2956 foldlen -= UNISKIP( uvc );
2957 uscan = foldbuf + UNISKIP( uvc );
2958 }
5b47454d
DM
2959 break;
2960 case trie_utf8:
2961 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2962 &len, uniflags );
2963 break;
2964 case trie_plain:
e4584336 2965 uvc = (UV)*uc;
a3621e74
YO
2966 len = 1;
2967 }
2968
5b47454d
DM
2969 if (uvc < 256) {
2970 charid = trie->charmap[ uvc ];
2971 }
2972 else {
2973 charid = 0;
2974 if (trie->widecharmap) {
3dab1dad 2975 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2976 (char*)&uvc, sizeof(UV), 0);
2977 if (svpp)
2978 charid = (U16)SvIV(*svpp);
2979 }
2980 }
a3621e74 2981
5b47454d
DM
2982 if (charid &&
2983 (base + charid > trie->uniquecharcount )
2984 && (base + charid - 1 - trie->uniquecharcount
2985 < trie->lasttrans)
2986 && trie->trans[base + charid - 1 -
2987 trie->uniquecharcount].check == state)
2988 {
2989 state = trie->trans[base + charid - 1 -
2990 trie->uniquecharcount ].next;
2991 }
2992 else {
2993 state = 0;
2994 }
2995 uc += len;
2996
2997 }
2998 else {
a3621e74
YO
2999 state = 0;
3000 }
3001 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3002 PerlIO_printf( Perl_debug_log,
3003 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3004 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3005 );
3006 }
d8319b27 3007 if (!st->u.trie.accepted )
a3621e74 3008 sayNO;
a3621e74
YO
3009
3010 /*
3011 There was at least one accepting state that we
3012 transitioned through. Presumably the number of accepting
3013 states is going to be low, typically one or two. So we
3014 simply scan through to find the one with lowest wordnum.
3015 Once we find it, we swap the last state into its place
3016 and decrement the size. We then try to match the rest of
3017 the pattern at the point where the word ends, if we
3018 succeed then we end the loop, otherwise the loop
3019 eventually terminates once all of the accepting states
3020 have been tried.
3021 */
a3621e74 3022
d8319b27 3023 if ( st->u.trie.accepted == 1 ) {
f2278c82 3024 DEBUG_EXECUTE_r({
be8e71aa 3025 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3026 ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
3027 : NULL;
3028 PerlIO_printf( Perl_debug_log,
3029 "%*s %sonly one match : #%d <%s>%s\n",
3030 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3031 st->u.trie.accept_buff[ 0 ].wordnum,
3032 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3033 PL_colors[5] );
3034 });
d8319b27 3035 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
3036 /* in this case we free tmps/leave before we call regmatch
3037 as we wont be using accept_buff again. */
3038 FREETMPS;
3039 LEAVE;
07be1b83 3040 /* do we need this? why dont we just do a break? */
95b24440
DM
3041 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
3042 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3043 } else {
3044 DEBUG_EXECUTE_r(
e4584336 3045 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 3046 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
3047 PL_colors[5] );
3048 );
d8319b27 3049 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
3050 U32 best = 0;
3051 U32 cur;
d8319b27 3052 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
3053 DEBUG_TRIE_EXECUTE_r(
3054 PerlIO_printf( Perl_debug_log,
3055 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3056 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
3057 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
3058 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 3059 );
a3621e74 3060
d8319b27
DM
3061 if (st->u.trie.accept_buff[cur].wordnum <
3062 st->u.trie.accept_buff[best].wordnum)
e822a8b4 3063 best = cur;
a3621e74 3064 }
f2278c82
YO
3065 DEBUG_EXECUTE_r({
3066 reg_trie_data * const trie = (reg_trie_data*)
3067 rex->data->data[ARG(scan)];
be8e71aa 3068 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3069 ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
3070 : NULL;
3071 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3072 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3073 st->u.trie.accept_buff[best].wordnum,
3074 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3075 PL_colors[5] );
3076 });
d8319b27
DM
3077 if ( best<st->u.trie.accepted ) {
3078 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
3079 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
3080 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
3081 best = st->u.trie.accepted;
a3621e74 3082 }
d8319b27 3083 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
3084
3085 /*
3086 as far as I can tell we only need the SAVETMPS/FREETMPS
3087 for re's with EVAL in them but I'm leaving them in for
3088 all until I can be sure.
3089 */
3090 SAVETMPS;
95b24440
DM
3091 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
3092 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3093 FREETMPS;
3094 }
3095 FREETMPS;
3096 LEAVE;
3097 }
3098
95b24440 3099 if (result) {
a3621e74
YO
3100 sayYES;
3101 } else {
3102 sayNO;
3103 }
3dab1dad 3104 }}
a3621e74 3105 /* unreached codepoint */
95b24440
DM
3106 case EXACT: {
3107 char *s = STRING(scan);
5d9a96ca 3108 st->ln = STR_LEN(scan);
eb160463 3109 if (do_utf8 != UTF) {
bc517b45 3110 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3111 char *l = locinput;
be8e71aa 3112 const char * const e = s + st->ln;
a72c7584 3113
5ff6fc6d
JH
3114 if (do_utf8) {
3115 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3116 while (s < e) {
a3b680e6 3117 STRLEN ulen;
1aa99e6b 3118 if (l >= PL_regeol)
5ff6fc6d
JH
3119 sayNO;
3120 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3121 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3122 uniflags))
5ff6fc6d 3123 sayNO;
bc517b45 3124 l += ulen;
5ff6fc6d 3125 s ++;
1aa99e6b 3126 }
5ff6fc6d
JH
3127 }
3128 else {
3129 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3130 while (s < e) {
a3b680e6 3131 STRLEN ulen;
1aa99e6b
IH
3132 if (l >= PL_regeol)
3133 sayNO;
5ff6fc6d 3134 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3135 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3136 uniflags))
1aa99e6b 3137 sayNO;
bc517b45 3138 s += ulen;
a72c7584 3139 l ++;
1aa99e6b 3140 }
5ff6fc6d 3141 }
1aa99e6b
IH
3142 locinput = l;
3143 nextchr = UCHARAT(locinput);
3144 break;
3145 }
bc517b45 3146 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3147 /* Inline the first character, for speed. */
3148 if (UCHARAT(s) != nextchr)
3149 sayNO;
5d9a96ca 3150 if (PL_regeol - locinput < st->ln)
d6a28714 3151 sayNO;
5d9a96ca 3152 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3153 sayNO;
5d9a96ca 3154 locinput += st->ln;
d6a28714
JH
3155 nextchr = UCHARAT(locinput);
3156 break;
95b24440 3157 }
d6a28714 3158 case EXACTFL:
b8c5462f
JH
3159 PL_reg_flags |= RF_tainted;
3160 /* FALL THROUGH */
95b24440 3161 case EXACTF: {
be8e71aa 3162 char * const s = STRING(scan);
5d9a96ca 3163 st->ln = STR_LEN(scan);
d6a28714 3164
d07ddd77
JH
3165 if (do_utf8 || UTF) {
3166 /* Either target or the pattern are utf8. */
be8e71aa 3167 const char * const l = locinput;
d07ddd77 3168 char *e = PL_regeol;
bc517b45 3169
5d9a96ca 3170 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3171 l, &e, 0, do_utf8)) {
5486206c
JH
3172 /* One more case for the sharp s:
3173 * pack("U0U*", 0xDF) =~ /ss/i,
3174 * the 0xC3 0x9F are the UTF-8
3175 * byte sequence for the U+00DF. */
3176 if (!(do_utf8 &&
3177 toLOWER(s[0]) == 's' &&
5d9a96ca 3178 st->ln >= 2 &&
5486206c
JH
3179 toLOWER(s[1]) == 's' &&
3180 (U8)l[0] == 0xC3 &&
3181 e - l >= 2 &&
3182 (U8)l[1] == 0x9F))
3183 sayNO;
3184 }
d07ddd77
JH
3185 locinput = e;
3186 nextchr = UCHARAT(locinput);
3187 break;
a0ed51b3 3188 }
d6a28714 3189
bc517b45
JH
3190 /* Neither the target and the pattern are utf8. */
3191
d6a28714
JH
3192 /* Inline the first character, for speed. */
3193 if (UCHARAT(s) != nextchr &&
3194 UCHARAT(s) != ((OP(scan) == EXACTF)
3195 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3196 sayNO;
5d9a96ca 3197 if (PL_regeol - locinput < st->ln)
b8c5462f 3198 sayNO;
5d9a96ca
DM
3199 if (st->ln > 1 && (OP(scan) == EXACTF
3200 ? ibcmp(s, locinput, st->ln)
3201 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3202 sayNO;
5d9a96ca 3203 locinput += st->ln;
d6a28714 3204 nextchr = UCHARAT(locinput);
a0d0e21e 3205 break;
95b24440 3206 }
d6a28714 3207 case ANYOF:
ffc61ed2 3208 if (do_utf8) {
9e55ce06
JH
3209 STRLEN inclasslen = PL_regeol - locinput;
3210
32fc9b6a 3211 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3212 sayNO_ANYOF;
ffc61ed2
JH
3213 if (locinput >= PL_regeol)
3214 sayNO;
0f0076b4 3215 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3216 nextchr = UCHARAT(locinput);
e0f9d4a8 3217 break;
ffc61ed2
JH
3218 }
3219 else {
3220 if (nextchr < 0)
3221 nextchr = UCHARAT(locinput);
32fc9b6a 3222 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3223 sayNO_ANYOF;
ffc61ed2
JH
3224 if (!nextchr && locinput >= PL_regeol)
3225 sayNO;
3226 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3227 break;
3228 }
3229 no_anyof:
3230 /* If we might have the case of the German sharp s
3231 * in a casefolding Unicode character class. */
3232
ebc501f0
JH
3233 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3234 locinput += SHARP_S_SKIP;
e0f9d4a8 3235 nextchr = UCHARAT(locinput);
ffc61ed2 3236 }
e0f9d4a8
JH
3237 else
3238 sayNO;
b8c5462f 3239 break;
d6a28714 3240 case ALNUML:
b8c5462f
JH
3241 PL_reg_flags |= RF_tainted;
3242 /* FALL THROUGH */
d6a28714 3243 case ALNUM:
b8c5462f 3244 if (!nextchr)
4633a7c4 3245 sayNO;
ffc61ed2 3246 if (do_utf8) {
1a4fad37 3247 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3248 if (!(OP(scan) == ALNUM
bb7a0f54 3249 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3250 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3251 {
3252 sayNO;
a0ed51b3 3253 }
b8c5462f 3254 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3255 nextchr = UCHARAT(locinput);
3256 break;
3257 }
ffc61ed2 3258 if (!(OP(scan) == ALNUM
d6a28714 3259 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3260 sayNO;
b8c5462f 3261 nextchr = UCHARAT(++locinput);
a0d0e21e 3262 break;
d6a28714 3263 case NALNUML:
b8c5462f
JH
3264 PL_reg_flags |= RF_tainted;
3265 /* FALL THROUGH */
d6a28714
JH
3266 case NALNUM:
3267 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3268 sayNO;
ffc61ed2 3269 if (do_utf8) {
1a4fad37 3270 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3271 if (OP(scan) == NALNUM
bb7a0f54 3272 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3273 : isALNUM_LC_utf8((U8*)locinput))
3274 {
b8c5462f 3275 sayNO;
d6a28714 3276 }
b8c5462f
JH
3277 locinput += PL_utf8skip[nextchr];
3278 nextchr = UCHARAT(locinput);
3279 break;
3280 }
ffc61ed2 3281 if (OP(scan) == NALNUM
d6a28714 3282 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3283 sayNO;
76e3520e 3284 nextchr = UCHARAT(++locinput);
a0d0e21e 3285 break;
d6a28714
JH
3286 case BOUNDL:
3287 case NBOUNDL:
3280af22 3288 PL_reg_flags |= RF_tainted;
bbce6d69 3289 /* FALL THROUGH */
d6a28714
JH
3290 case BOUND:
3291 case NBOUND:
3292 /* was last char in word? */
ffc61ed2 3293 if (do_utf8) {
12d33761 3294 if (locinput == PL_bostr)
5d9a96ca 3295 st->ln = '\n';
ffc61ed2 3296 else {
a3b680e6 3297 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3298
4ad0818d 3299 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3300 }
3301 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3302 st->ln = isALNUM_uni(st->ln);
1a4fad37 3303 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3304 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3305 }
3306 else {
5d9a96ca 3307 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3308 n = isALNUM_LC_utf8((U8*)locinput);
3309 }
a0ed51b3 3310 }
d6a28714 3311 else {
5d9a96ca 3312 st->ln = (locinput != PL_bostr) ?
12d33761 3313 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3314 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3315 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3316 n = isALNUM(nextchr);
3317 }
3318 else {
5d9a96ca 3319 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3320 n = isALNUM_LC(nextchr);
3321 }
d6a28714 3322 }
5d9a96ca 3323 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3324 OP(scan) == BOUNDL))
3325 sayNO;
a0ed51b3 3326 break;
d6a28714 3327 case SPACEL:
3280af22 3328 PL_reg_flags |= RF_tainted;
bbce6d69 3329 /* FALL THROUGH */
d6a28714 3330 case SPACE:
9442cb0e 3331 if (!nextchr)
4633a7c4 3332 sayNO;
1aa99e6b 3333 if (do_utf8) {
fd400ab9 3334 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3335 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3336 if (!(OP(scan) == SPACE
bb7a0f54 3337 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3338 : isSPACE_LC_utf8((U8*)locinput)))
3339 {
3340 sayNO;
3341 }
3342 locinput += PL_utf8skip[nextchr];
3343 nextchr = UCHARAT(locinput);
3344 break;
d6a28714 3345 }
ffc61ed2
JH
3346 if (!(OP(scan) == SPACE
3347 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3348 sayNO;
3349 nextchr = UCHARAT(++locinput);
3350 }
3351 else {
3352 if (!(OP(scan) == SPACE
3353 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3354 sayNO;
3355 nextchr = UCHARAT(++locinput);
a0ed51b3 3356 }
a0ed51b3 3357 break;
d6a28714 3358 case NSPACEL:
3280af22 3359 PL_reg_flags |= RF_tainted;
bbce6d69 3360 /* FALL THROUGH */
d6a28714 3361 case NSPACE:
9442cb0e 3362 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3363 sayNO;
1aa99e6b 3364 if (do_utf8) {
1a4fad37 3365 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3366 if (OP(scan) == NSPACE
bb7a0f54 3367 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3368 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3369 {
3370 sayNO;
3371 }
3372 locinput += PL_utf8skip[nextchr];
3373 nextchr = UCHARAT(locinput);
3374 break;
a0ed51b3 3375 }
ffc61ed2 3376 if (OP(scan) == NSPACE
d6a28714 3377 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3378 sayNO;
76e3520e 3379 nextchr = UCHARAT(++locinput);
a0d0e21e 3380 break;
d6a28714 3381 case DIGITL:
a0ed51b3
LW
3382 PL_reg_flags |= RF_tainted;
3383 /* FALL THROUGH */
d6a28714 3384 case DIGIT:
9442cb0e 3385 if (!nextchr)
a0ed51b3 3386 sayNO;
1aa99e6b 3387 if (do_utf8) {
1a4fad37 3388 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3389 if (!(OP(scan) == DIGIT
bb7a0f54 3390 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3391 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3392 {
a0ed51b3 3393 sayNO;
dfe13c55 3394 }
6f06b55f 3395 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3396 nextchr = UCHARAT(locinput);
3397 break;
3398 }
ffc61ed2 3399 if (!(OP(scan) == DIGIT
9442cb0e 3400 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3401 sayNO;
3402 nextchr = UCHARAT(++locinput);
3403 break;
d6a28714 3404 case NDIGITL:
b8c5462f
JH
3405 PL_reg_flags |= RF_tainted;
3406 /* FALL THROUGH */
d6a28714 3407 case NDIGIT:
9442cb0e 3408 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3409 sayNO;
1aa99e6b 3410 if (do_utf8) {
1a4fad37 3411 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3412 if (OP(scan) == NDIGIT
bb7a0f54 3413 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3414 : isDIGIT_LC_utf8((U8*)locinput))
3415 {
a0ed51b3<