This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
migrate CURLY/PLUS/STAR branch in regmatch() to new FSM-esque paradigm
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
a01268b5 169#define REGCP_OTHER_ELEMS 6
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
a3621e74 199# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
a3621e74 203# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
097eb12c 209S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 210{
97aff369 211 dVAR;
b1ce53c5 212 I32 i;
a0d0e21e 213 char *input;
b1ce53c5 214
a3621e74
YO
215 GET_RE_DEBUG_FLAGS_DECL;
216
b1ce53c5 217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 218 i = SSPOPINT;
b1ce53c5
JH
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 221 input = (char *) SSPOPPTR;
a01268b5 222 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
b1ce53c5
JH
225
226 /* Now restore the parentheses context. */
41123dfd
JH
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 229 I32 tmps;
097eb12c 230 U32 paren = (U32)SSPOPINT;
3280af22 231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
232 PL_regstartp[paren] = SSPOPINT;
233 tmps = SSPOPINT;
3280af22
NIS
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
a3621e74 236 DEBUG_EXECUTE_r(
c3464db5 237 PerlIO_printf(Perl_debug_log,
b900a521 238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 239 (UV)paren, (IV)PL_regstartp[paren],
b900a521 240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 241 (IV)PL_regendp[paren],
3280af22 242 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 243 );
a0d0e21e 244 }
a3621e74 245 DEBUG_EXECUTE_r(
bb7a0f54 246 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 247 PerlIO_printf(Perl_debug_log,
faccc32b 248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
250 }
251 );
daf18116 252#if 1
dafc8851
JH
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 * --jhi */
bb7a0f54 263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
264 if (i > PL_regsize)
265 PL_regstartp[i] = -1;
266 PL_regendp[i] = -1;
a0d0e21e 267 }
dafc8851 268#endif
a0d0e21e
LW
269 return input;
270}
271
02db2b7b 272#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 273
a687059c 274/*
e50aee73 275 * pregexec and friends
a687059c
LW
276 */
277
76234dfb 278#ifndef PERL_IN_XSUB_RE
a687059c 279/*
c277df42 280 - pregexec - match a regexp against a string
a687059c 281 */
c277df42 282I32
864dbfa3 283Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 284 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
285/* strend: pointer to null at end of string */
286/* strbeg: real beginning of string */
287/* minend: end of match must be >=minend after stringarg. */
288/* nosave: For optimizations. */
289{
290 return
9041c2e3 291 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
292 nosave ? 0 : REXEC_COPY_STR);
293}
76234dfb 294#endif
22e551b9 295
9041c2e3 296/*
cad2e5aa
JH
297 * Need to implement the following flags for reg_anch:
298 *
299 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
300 * USE_INTUIT_ML
301 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
302 * INTUIT_AUTORITATIVE_ML
303 * INTUIT_ONCE_NOML - Intuit can match in one location only.
304 * INTUIT_ONCE_ML
305 *
306 * Another flag for this function: SECOND_TIME (so that float substrs
307 * with giant delta may be not rechecked).
308 */
309
310/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
311
3f7c398e 312/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
313 Otherwise, only SvCUR(sv) is used to get strbeg. */
314
315/* XXXX We assume that strpos is strbeg unless sv. */
316
6eb5f6b9
JH
317/* XXXX Some places assume that there is a fixed substring.
318 An update may be needed if optimizer marks as "INTUITable"
319 RExen without fixed substrings. Similarly, it is assumed that
320 lengths of all the strings are no more than minlen, thus they
321 cannot come from lookahead.
322 (Or minlen should take into account lookahead.) */
323
2c2d71f5
JH
324/* A failure to find a constant substring means that there is no need to make
325 an expensive call to REx engine, thus we celebrate a failure. Similarly,
326 finding a substring too deep into the string means that less calls to
30944b6d
IZ
327 regtry() should be needed.
328
329 REx compiler's optimizer found 4 possible hints:
330 a) Anchored substring;
331 b) Fixed substring;
332 c) Whether we are anchored (beginning-of-line or \G);
333 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 334 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
335 string which does not contradict any of them.
336 */
2c2d71f5 337
6eb5f6b9
JH
338/* Most of decisions we do here should have been done at compile time.
339 The nodes of the REx which we used for the search should have been
340 deleted from the finite automaton. */
341
cad2e5aa
JH
342char *
343Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
344 char *strend, U32 flags, re_scream_pos_data *data)
345{
97aff369 346 dVAR;
b7953727 347 register I32 start_shift = 0;
cad2e5aa 348 /* Should be nonnegative! */
b7953727 349 register I32 end_shift = 0;
2c2d71f5
JH
350 register char *s;
351 register SV *check;
a1933d95 352 char *strbeg;
cad2e5aa 353 char *t;
a3b680e6 354 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 355 I32 ml_anch;
bd61b366
SS
356 register char *other_last = NULL; /* other substr checked before this */
357 char *check_at = NULL; /* check substr found at this pos */
1df70142 358 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 359#ifdef DEBUGGING
890ce7af
AL
360 const char * const i_strpos = strpos;
361 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 362#endif
a3621e74
YO
363
364 GET_RE_DEBUG_FLAGS_DECL;
365
a30b2f1f 366 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 367
b8d68ded 368 if (prog->reganch & ROPT_UTF8) {
a3621e74 369 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
370 "UTF-8 regex...\n"));
371 PL_reg_flags |= RF_utf8;
372 }
373
a3621e74 374 DEBUG_EXECUTE_r({
1df70142 375 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
376 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
377 strpos;
1df70142 378 const int len = PL_reg_match_utf8 ?
bb7a0f54 379 (int)strlen(s) : strend - strpos;
2a782b5b
JH
380 if (!PL_colorset)
381 reginitcolors();
b8d68ded 382 if (PL_reg_match_utf8)
a3621e74 383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 384 "UTF-8 target...\n"));
2a782b5b 385 PerlIO_printf(Perl_debug_log,
a0288114 386 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 387 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
388 prog->precomp,
389 PL_colors[1],
390 (strlen(prog->precomp) > 60 ? "..." : ""),
391 PL_colors[0],
392 (int)(len > 60 ? 60 : len),
393 s, PL_colors[1],
394 (len > 60 ? "..." : "")
395 );
396 });
cad2e5aa 397
c344f387
JH
398 /* CHR_DIST() would be more correct here but it makes things slow. */
399 if (prog->minlen > strend - strpos) {
a3621e74 400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 401 "String too short... [re_intuit_start]\n"));
cad2e5aa 402 goto fail;
2c2d71f5 403 }
a1933d95 404 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 405 PL_regeol = strend;
33b8afdf
JH
406 if (do_utf8) {
407 if (!prog->check_utf8 && prog->check_substr)
408 to_utf8_substr(prog);
409 check = prog->check_utf8;
410 } else {
411 if (!prog->check_substr && prog->check_utf8)
412 to_byte_substr(prog);
413 check = prog->check_substr;
414 }
415 if (check == &PL_sv_undef) {
a3621e74 416 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
417 "Non-utf string cannot match utf check string\n"));
418 goto fail;
419 }
2c2d71f5 420 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
421 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
422 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 423 && !multiline ) ); /* Check after \n? */
cad2e5aa 424
7e25d62c
JH
425 if (!ml_anch) {
426 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
427 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 428 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
429 && sv && !SvROK(sv)
430 && (strpos != strbeg)) {
a3621e74 431 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
432 goto fail;
433 }
434 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 435 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 436 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
437 I32 slen;
438
1aa99e6b 439 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
440 if (SvTAIL(check)) {
441 slen = SvCUR(check); /* >= 1 */
cad2e5aa 442
9041c2e3 443 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 444 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 446 goto fail_finish;
cad2e5aa
JH
447 }
448 /* Now should match s[0..slen-2] */
449 slen--;
3f7c398e 450 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 451 || (slen > 1
3f7c398e 452 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 453 report_neq:
a3621e74 454 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
455 goto fail_finish;
456 }
cad2e5aa 457 }
3f7c398e 458 else if (*SvPVX_const(check) != *s
653099ff 459 || ((slen = SvCUR(check)) > 1
3f7c398e 460 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 461 goto report_neq;
c315bfe8 462 check_at = s;
2c2d71f5 463 goto success_at_start;
7e25d62c 464 }
cad2e5aa 465 }
2c2d71f5 466 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 467 s = strpos;
2c2d71f5 468 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 469 end_shift = prog->minlen - start_shift -
653099ff 470 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 471 if (!ml_anch) {
a3b680e6 472 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 473 - (SvTAIL(check) != 0);
a3b680e6 474 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
475
476 if (end_shift < eshift)
477 end_shift = eshift;
478 }
cad2e5aa 479 }
2c2d71f5 480 else { /* Can match at random position */
cad2e5aa
JH
481 ml_anch = 0;
482 s = strpos;
2c2d71f5
JH
483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
484 /* Should be nonnegative! */
485 end_shift = prog->minlen - start_shift -
653099ff 486 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
487 }
488
2c2d71f5 489#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 490 if (end_shift < 0)
6bbae5e6 491 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
492#endif
493
2c2d71f5
JH
494 restart:
495 /* Find a possible match in the region s..strend by looking for
496 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 497 if (flags & REXEC_SCREAM) {
cad2e5aa 498 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 499 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 500
2c2d71f5
JH
501 if (PL_screamfirst[BmRARE(check)] >= 0
502 || ( BmRARE(check) == '\n'
503 && (BmPREVIOUS(check) == SvCUR(check) - 1)
504 && SvTAIL(check) ))
9041c2e3 505 s = screaminstr(sv, check,
2c2d71f5 506 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 507 else
2c2d71f5 508 goto fail_finish;
4addbd3b
HS
509 /* we may be pointing at the wrong string */
510 if (s && RX_MATCH_COPIED(prog))
3f7c398e 511 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
512 if (data)
513 *data->scream_olds = s;
514 }
f33976b4 515 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
516 s = fbm_instr((U8*)(s + start_shift),
517 (U8*)(strend - end_shift),
7fba1cd6 518 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 519 else
1aa99e6b
IH
520 s = fbm_instr(HOP3(s, start_shift, strend),
521 HOP3(strend, -end_shift, strbeg),
7fba1cd6 522 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
523
524 /* Update the count-of-usability, remove useless subpatterns,
525 unshift s. */
2c2d71f5 526
a0288114 527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 528 (s ? "Found" : "Did not find"),
33b8afdf 529 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 530 PL_colors[0],
7b0972df 531 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 532 SvPVX_const(check),
2c2d71f5
JH
533 PL_colors[1], (SvTAIL(check) ? "$" : ""),
534 (s ? " at offset " : "...\n") ) );
535
536 if (!s)
537 goto fail_finish;
538
6eb5f6b9
JH
539 check_at = s;
540
2c2d71f5 541 /* Finish the diagnostic message */
a3621e74 542 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
543
544 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
545 Start with the other substr.
546 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 547 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
548 *always* match. Probably should be marked during compile...
549 Probably it is right to do no SCREAM here...
550 */
551
33b8afdf 552 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 553 /* Take into account the "other" substring. */
2c2d71f5
JH
554 /* XXXX May be hopelessly wrong for UTF... */
555 if (!other_last)
6eb5f6b9 556 other_last = strpos;
33b8afdf 557 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
558 do_other_anchored:
559 {
890ce7af
AL
560 char * const last = HOP3c(s, -start_shift, strbeg);
561 char *last1, *last2;
be8e71aa 562 char * const saved_s = s;
33b8afdf 563 SV* must;
2c2d71f5 564
2c2d71f5
JH
565 t = s - prog->check_offset_max;
566 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 567 && (!do_utf8
0ce71af7 568 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 569 && t > strpos)))
6f207bd3 570 NOOP;
2c2d71f5
JH
571 else
572 t = strpos;
1aa99e6b 573 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
574 if (t < other_last) /* These positions already checked */
575 t = other_last;
1aa99e6b 576 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
577 if (last < last1)
578 last1 = last;
579 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
580 /* On end-of-str: see comment below. */
33b8afdf
JH
581 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
582 if (must == &PL_sv_undef) {
583 s = (char*)NULL;
a3621e74 584 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
585 }
586 else
587 s = fbm_instr(
588 (unsigned char*)t,
589 HOP3(HOP3(last1, prog->anchored_offset, strend)
590 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
591 must,
7fba1cd6 592 multiline ? FBMrf_MULTILINE : 0
33b8afdf 593 );
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 595 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
596 (s ? "Found" : "Contradicts"),
597 PL_colors[0],
33b8afdf
JH
598 (int)(SvCUR(must)
599 - (SvTAIL(must)!=0)),
3f7c398e 600 SvPVX_const(must),
33b8afdf 601 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
602 if (!s) {
603 if (last1 >= last2) {
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
605 ", giving up...\n"));
606 goto fail_finish;
607 }
a3621e74 608 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 609 ", trying floating at offset %ld...\n",
be8e71aa 610 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
611 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
612 s = HOP3c(last, 1, strend);
2c2d71f5
JH
613 goto restart;
614 }
615 else {
a3621e74 616 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 617 (long)(s - i_strpos)));
1aa99e6b
IH
618 t = HOP3c(s, -prog->anchored_offset, strbeg);
619 other_last = HOP3c(s, 1, strend);
be8e71aa 620 s = saved_s;
2c2d71f5
JH
621 if (t == strpos)
622 goto try_at_start;
2c2d71f5
JH
623 goto try_at_offset;
624 }
30944b6d 625 }
2c2d71f5
JH
626 }
627 else { /* Take into account the floating substring. */
33b8afdf 628 char *last, *last1;
be8e71aa 629 char * const saved_s = s;
33b8afdf
JH
630 SV* must;
631
632 t = HOP3c(s, -start_shift, strbeg);
633 last1 = last =
634 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
635 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
636 last = HOP3c(t, prog->float_max_offset, strend);
637 s = HOP3c(t, prog->float_min_offset, strend);
638 if (s < other_last)
639 s = other_last;
2c2d71f5 640 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
641 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
642 /* fbm_instr() takes into account exact value of end-of-str
643 if the check is SvTAIL(ed). Since false positives are OK,
644 and end-of-str is not later than strend we are OK. */
645 if (must == &PL_sv_undef) {
646 s = (char*)NULL;
a3621e74 647 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
648 }
649 else
2c2d71f5 650 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
651 (unsigned char*)last + SvCUR(must)
652 - (SvTAIL(must)!=0),
7fba1cd6 653 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
655 (s ? "Found" : "Contradicts"),
656 PL_colors[0],
657 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 658 SvPVX_const(must),
33b8afdf
JH
659 PL_colors[1], (SvTAIL(must) ? "$" : "")));
660 if (!s) {
661 if (last1 == last) {
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
663 ", giving up...\n"));
664 goto fail_finish;
2c2d71f5 665 }
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 667 ", trying anchored starting at offset %ld...\n",
be8e71aa 668 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
669 other_last = last;
670 s = HOP3c(t, 1, strend);
671 goto restart;
672 }
673 else {
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
675 (long)(s - i_strpos)));
676 other_last = s; /* Fix this later. --Hugo */
be8e71aa 677 s = saved_s;
33b8afdf
JH
678 if (t == strpos)
679 goto try_at_start;
680 goto try_at_offset;
681 }
2c2d71f5 682 }
cad2e5aa 683 }
2c2d71f5
JH
684
685 t = s - prog->check_offset_max;
2c2d71f5 686 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 687 && (!do_utf8
0ce71af7 688 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 689 && t > strpos))) {
2c2d71f5
JH
690 /* Fixed substring is found far enough so that the match
691 cannot start at strpos. */
692 try_at_offset:
cad2e5aa 693 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
694 /* Eventually fbm_*() should handle this, but often
695 anchored_offset is not 0, so this check will not be wasted. */
696 /* XXXX In the code below we prefer to look for "^" even in
697 presence of anchored substrings. And we search even
698 beyond the found float position. These pessimizations
699 are historical artefacts only. */
700 find_anchor:
2c2d71f5 701 while (t < strend - prog->minlen) {
cad2e5aa 702 if (*t == '\n') {
4ee3650e 703 if (t < check_at - prog->check_offset_min) {
33b8afdf 704 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
705 /* Since we moved from the found position,
706 we definitely contradict the found anchored
30944b6d
IZ
707 substr. Due to the above check we do not
708 contradict "check" substr.
709 Thus we can arrive here only if check substr
710 is float. Redo checking for "other"=="fixed".
711 */
9041c2e3 712 strpos = t + 1;
a3621e74 713 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 714 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
715 goto do_other_anchored;
716 }
4ee3650e
GS
717 /* We don't contradict the found floating substring. */
718 /* XXXX Why not check for STCLASS? */
cad2e5aa 719 s = t + 1;
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 721 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
722 goto set_useful;
723 }
4ee3650e
GS
724 /* Position contradicts check-string */
725 /* XXXX probably better to look for check-string
726 than for "\n", so one should lower the limit for t? */
a3621e74 727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 728 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 729 other_last = strpos = s = t + 1;
cad2e5aa
JH
730 goto restart;
731 }
732 t++;
733 }
a3621e74 734 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 735 PL_colors[0], PL_colors[1]));
2c2d71f5 736 goto fail_finish;
cad2e5aa 737 }
f5952150 738 else {
a3621e74 739 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 740 PL_colors[0], PL_colors[1]));
f5952150 741 }
cad2e5aa
JH
742 s = t;
743 set_useful:
33b8afdf 744 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
745 }
746 else {
f5952150 747 /* The found string does not prohibit matching at strpos,
2c2d71f5 748 - no optimization of calling REx engine can be performed,
f5952150
GS
749 unless it was an MBOL and we are not after MBOL,
750 or a future STCLASS check will fail this. */
2c2d71f5
JH
751 try_at_start:
752 /* Even in this situation we may use MBOL flag if strpos is offset
753 wrt the start of the string. */
05b4157f 754 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 755 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
756 /* May be due to an implicit anchor of m{.*foo} */
757 && !(prog->reganch & ROPT_IMPLICIT))
758 {
cad2e5aa
JH
759 t = strpos;
760 goto find_anchor;
761 }
a3621e74 762 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 763 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 764 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 765 );
2c2d71f5 766 success_at_start:
30944b6d 767 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
768 && (do_utf8 ? (
769 prog->check_utf8 /* Could be deleted already */
770 && --BmUSEFUL(prog->check_utf8) < 0
771 && (prog->check_utf8 == prog->float_utf8)
772 ) : (
773 prog->check_substr /* Could be deleted already */
774 && --BmUSEFUL(prog->check_substr) < 0
775 && (prog->check_substr == prog->float_substr)
776 )))
66e933ab 777 {
cad2e5aa 778 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
780 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
781 if (do_utf8 ? prog->check_substr : prog->check_utf8)
782 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
783 prog->check_substr = prog->check_utf8 = NULL; /* disable */
784 prog->float_substr = prog->float_utf8 = NULL; /* clear */
785 check = NULL; /* abort */
cad2e5aa 786 s = strpos;
3cf5c195
IZ
787 /* XXXX This is a remnant of the old implementation. It
788 looks wasteful, since now INTUIT can use many
6eb5f6b9 789 other heuristics. */
cad2e5aa
JH
790 prog->reganch &= ~RE_USE_INTUIT;
791 }
792 else
793 s = strpos;
794 }
795
6eb5f6b9
JH
796 /* Last resort... */
797 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 798 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
799 /* minlen == 0 is possible if regstclass is \b or \B,
800 and the fixed substr is ''$.
801 Since minlen is already taken into account, s+1 is before strend;
802 accidentally, minlen >= 1 guaranties no false positives at s + 1
803 even for \b or \B. But (minlen? 1 : 0) below assumes that
804 regstclass does not come from lookahead... */
805 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
806 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 807 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 808 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 809 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 810 : 1);
07be1b83 811 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 812 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 813 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
814 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
815 cl_l, strend)
816 : strend);
07be1b83
YO
817 /*if (OP(prog->regstclass) == TRIE)
818 endpos++;*/
6eb5f6b9 819 t = s;
3b0527fe 820 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
821 if (!s) {
822#ifdef DEBUGGING
cbbf8932 823 const char *what = NULL;
6eb5f6b9
JH
824#endif
825 if (endpos == strend) {
a3621e74 826 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
827 "Could not match STCLASS...\n") );
828 goto fail;
829 }
a3621e74 830 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 831 "This position contradicts STCLASS...\n") );
653099ff
GS
832 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
833 goto fail;
6eb5f6b9 834 /* Contradict one of substrings */
33b8afdf
JH
835 if (prog->anchored_substr || prog->anchored_utf8) {
836 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 837 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 838 hop_and_restart:
1aa99e6b 839 s = HOP3c(t, 1, strend);
66e933ab
GS
840 if (s + start_shift + end_shift > strend) {
841 /* XXXX Should be taken into account earlier? */
a3621e74 842 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
843 "Could not match STCLASS...\n") );
844 goto fail;
845 }
5e39e1e5
HS
846 if (!check)
847 goto giveup;
a3621e74 848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 849 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
850 what, (long)(s + start_shift - i_strpos)) );
851 goto restart;
852 }
66e933ab 853 /* Have both, check_string is floating */
6eb5f6b9
JH
854 if (t + start_shift >= check_at) /* Contradicts floating=check */
855 goto retry_floating_check;
856 /* Recheck anchored substring, but not floating... */
9041c2e3 857 s = check_at;
5e39e1e5
HS
858 if (!check)
859 goto giveup;
a3621e74 860 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 861 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
862 (long)(other_last - i_strpos)) );
863 goto do_other_anchored;
864 }
60e71179
GS
865 /* Another way we could have checked stclass at the
866 current position only: */
867 if (ml_anch) {
868 s = t = t + 1;
5e39e1e5
HS
869 if (!check)
870 goto giveup;
a3621e74 871 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 872 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 873 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 874 goto try_at_offset;
66e933ab 875 }
33b8afdf 876 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 877 goto fail;
6eb5f6b9
JH
878 /* Check is floating subtring. */
879 retry_floating_check:
880 t = check_at - start_shift;
a3621e74 881 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
882 goto hop_and_restart;
883 }
b7953727 884 if (t != s) {
a3621e74 885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 886 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
887 (long)(t - i_strpos), (long)(s - i_strpos))
888 );
889 }
890 else {
a3621e74 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
892 "Does not contradict STCLASS...\n");
893 );
894 }
6eb5f6b9 895 }
5e39e1e5 896 giveup:
a3621e74 897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
898 PL_colors[4], (check ? "Guessed" : "Giving up"),
899 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 900 return s;
2c2d71f5
JH
901
902 fail_finish: /* Substring not found */
33b8afdf
JH
903 if (prog->check_substr || prog->check_utf8) /* could be removed already */
904 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 905 fail:
a3621e74 906 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 907 PL_colors[4], PL_colors[5]));
bd61b366 908 return NULL;
cad2e5aa 909}
9661b544 910
6eb5f6b9 911/* We know what class REx starts with. Try to find this position... */
3b0527fe 912/* if reginfo is NULL, its a dryrun */
07be1b83
YO
913/* annoyingly all the vars in this routine have different names from their counterparts
914 in regmatch. /grrr */
3b0527fe 915
3c3eec57 916STATIC char *
07be1b83
YO
917S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
918 const char *strend, const regmatch_info *reginfo)
a687059c 919{
27da23d5 920 dVAR;
1df70142 921 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 922 char *m;
d8093b23 923 STRLEN ln;
5dab1207 924 STRLEN lnc;
078c425b 925 register STRLEN uskip;
d8093b23 926 unsigned int c1;
927 unsigned int c2;
6eb5f6b9
JH
928 char *e;
929 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 930 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 931
6eb5f6b9
JH
932 /* We know what class it must start with. */
933 switch (OP(c)) {
6eb5f6b9 934 case ANYOF:
388cc4de 935 if (do_utf8) {
078c425b 936 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
937 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
938 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
939 reginclass(prog, c, (U8*)s, 0, do_utf8) :
940 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 941 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
942 goto got_it;
943 else
944 tmp = doevery;
945 }
946 else
947 tmp = 1;
078c425b 948 s += uskip;
388cc4de
HS
949 }
950 }
951 else {
952 while (s < strend) {
953 STRLEN skip = 1;
954
32fc9b6a 955 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
956 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
957 /* The assignment of 2 is intentional:
958 * for the folded sharp s, the skip is 2. */
959 (skip = SHARP_S_SKIP))) {
3b0527fe 960 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
961 goto got_it;
962 else
963 tmp = doevery;
964 }
965 else
966 tmp = 1;
967 s += skip;
968 }
a0d0e21e 969 }
6eb5f6b9 970 break;
f33976b4
DB
971 case CANY:
972 while (s < strend) {
3b0527fe 973 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
974 goto got_it;
975 else
976 tmp = doevery;
977 s++;
978 }
979 break;
6eb5f6b9 980 case EXACTF:
5dab1207
NIS
981 m = STRING(c);
982 ln = STR_LEN(c); /* length to match in octets/bytes */
983 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 984 if (UTF) {
a2a2844f 985 STRLEN ulen1, ulen2;
5dab1207 986 U8 *sm = (U8 *) m;
89ebb4a3
JH
987 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
988 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 989 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
990
991 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
992 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
993
89ebb4a3 994 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 995 0, uniflags);
89ebb4a3 996 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 997 0, uniflags);
5dab1207
NIS
998 lnc = 0;
999 while (sm < ((U8 *) m + ln)) {
1000 lnc++;
1001 sm += UTF8SKIP(sm);
1002 }
1aa99e6b
IH
1003 }
1004 else {
1005 c1 = *(U8*)m;
1006 c2 = PL_fold[c1];
1007 }
6eb5f6b9
JH
1008 goto do_exactf;
1009 case EXACTFL:
5dab1207
NIS
1010 m = STRING(c);
1011 ln = STR_LEN(c);
1012 lnc = (I32) ln;
d8093b23 1013 c1 = *(U8*)m;
6eb5f6b9
JH
1014 c2 = PL_fold_locale[c1];
1015 do_exactf:
db12adc6 1016 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1017
3b0527fe 1018 if (!reginfo && e < s)
6eb5f6b9 1019 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1020
60a8b682
JH
1021 /* The idea in the EXACTF* cases is to first find the
1022 * first character of the EXACTF* node and then, if
1023 * necessary, case-insensitively compare the full
1024 * text of the node. The c1 and c2 are the first
1025 * characters (though in Unicode it gets a bit
1026 * more complicated because there are more cases
7f16dd3d
JH
1027 * than just upper and lower: one needs to use
1028 * the so-called folding case for case-insensitive
1029 * matching (called "loose matching" in Unicode).
1030 * ibcmp_utf8() will do just that. */
60a8b682 1031
1aa99e6b 1032 if (do_utf8) {
575cac57 1033 UV c, f;
89ebb4a3 1034 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1035 STRLEN len, foldlen;
4ad0818d 1036 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1037 if (c1 == c2) {
5dab1207
NIS
1038 /* Upper and lower of 1st char are equal -
1039 * probably not a "letter". */
1aa99e6b 1040 while (s <= e) {
89ebb4a3 1041 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1042 uniflags);
80aecb99
JH
1043 if ( c == c1
1044 && (ln == len ||
3dab1dad
YO
1045 ibcmp_utf8(s, NULL, 0, do_utf8,
1046 m, NULL, ln, (bool)UTF))
3b0527fe 1047 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1048 goto got_it;
80aecb99 1049 else {
1df70142 1050 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1051 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1052 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1053 if ( f != c
1054 && (f == c1 || f == c2)
1055 && (ln == foldlen ||
66423254 1056 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1057 NULL, foldlen, do_utf8,
d07ddd77 1058 m,
3dab1dad 1059 NULL, ln, (bool)UTF))
3b0527fe 1060 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1061 goto got_it;
1062 }
1aa99e6b
IH
1063 s += len;
1064 }
09091399
JH
1065 }
1066 else {
1aa99e6b 1067 while (s <= e) {
89ebb4a3 1068 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1069 uniflags);
80aecb99 1070
60a8b682 1071 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1072 * Note that not all the possible combinations
1073 * are handled here: some of them are handled
1074 * by the standard folding rules, and some of
1075 * them (the character class or ANYOF cases)
1076 * are handled during compiletime in
1077 * regexec.c:S_regclass(). */
880bd946
JH
1078 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1079 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1080 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1081
1082 if ( (c == c1 || c == c2)
1083 && (ln == len ||
3dab1dad
YO
1084 ibcmp_utf8(s, NULL, 0, do_utf8,
1085 m, NULL, ln, (bool)UTF))
3b0527fe 1086 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1087 goto got_it;
80aecb99 1088 else {
1df70142 1089 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1090 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1091 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1092 if ( f != c
1093 && (f == c1 || f == c2)
1094 && (ln == foldlen ||
a6872d42 1095 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1096 NULL, foldlen, do_utf8,
d07ddd77 1097 m,
3dab1dad 1098 NULL, ln, (bool)UTF))
3b0527fe 1099 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1100 goto got_it;
1101 }
1aa99e6b
IH
1102 s += len;
1103 }
09091399 1104 }
1aa99e6b
IH
1105 }
1106 else {
1107 if (c1 == c2)
1108 while (s <= e) {
1109 if ( *(U8*)s == c1
1110 && (ln == 1 || !(OP(c) == EXACTF
1111 ? ibcmp(s, m, ln)
1112 : ibcmp_locale(s, m, ln)))
3b0527fe 1113 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1114 goto got_it;
1115 s++;
1116 }
1117 else
1118 while (s <= e) {
1119 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1120 && (ln == 1 || !(OP(c) == EXACTF
1121 ? ibcmp(s, m, ln)
1122 : ibcmp_locale(s, m, ln)))
3b0527fe 1123 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1124 goto got_it;
1125 s++;
1126 }
b3c9acc1
IZ
1127 }
1128 break;
bbce6d69 1129 case BOUNDL:
3280af22 1130 PL_reg_flags |= RF_tainted;
bbce6d69 1131 /* FALL THROUGH */
a0d0e21e 1132 case BOUND:
ffc61ed2 1133 if (do_utf8) {
12d33761 1134 if (s == PL_bostr)
ffc61ed2
JH
1135 tmp = '\n';
1136 else {
6136c704 1137 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1138 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1139 }
1140 tmp = ((OP(c) == BOUND ?
9041c2e3 1141 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1142 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1143 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1144 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1145 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1146 isALNUM_LC_utf8((U8*)s)))
1147 {
1148 tmp = !tmp;
3b0527fe 1149 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1150 goto got_it;
1151 }
078c425b 1152 s += uskip;
a687059c 1153 }
a0d0e21e 1154 }
667bb95a 1155 else {
12d33761 1156 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1157 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1158 while (s < strend) {
1159 if (tmp ==
1160 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1161 tmp = !tmp;
3b0527fe 1162 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1163 goto got_it;
1164 }
1165 s++;
a0ed51b3 1166 }
a0ed51b3 1167 }
3b0527fe 1168 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1169 goto got_it;
1170 break;
bbce6d69 1171 case NBOUNDL:
3280af22 1172 PL_reg_flags |= RF_tainted;
bbce6d69 1173 /* FALL THROUGH */
a0d0e21e 1174 case NBOUND:
ffc61ed2 1175 if (do_utf8) {
12d33761 1176 if (s == PL_bostr)
ffc61ed2
JH
1177 tmp = '\n';
1178 else {
6136c704 1179 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1180 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1181 }
1182 tmp = ((OP(c) == NBOUND ?
9041c2e3 1183 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1184 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1185 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1186 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1187 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1188 isALNUM_LC_utf8((U8*)s)))
1189 tmp = !tmp;
3b0527fe 1190 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1191 goto got_it;
078c425b 1192 s += uskip;
ffc61ed2 1193 }
a0d0e21e 1194 }
667bb95a 1195 else {
12d33761 1196 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1197 tmp = ((OP(c) == NBOUND ?
1198 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1199 while (s < strend) {
1200 if (tmp ==
1201 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1202 tmp = !tmp;
3b0527fe 1203 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1204 goto got_it;
1205 s++;
1206 }
a0ed51b3 1207 }
3b0527fe 1208 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1209 goto got_it;
1210 break;
a0d0e21e 1211 case ALNUM:
ffc61ed2 1212 if (do_utf8) {
1a4fad37 1213 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1214 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1215 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1216 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1217 goto got_it;
1218 else
1219 tmp = doevery;
1220 }
bbce6d69 1221 else
ffc61ed2 1222 tmp = 1;
078c425b 1223 s += uskip;
bbce6d69 1224 }
bbce6d69 1225 }
ffc61ed2
JH
1226 else {
1227 while (s < strend) {
1228 if (isALNUM(*s)) {
3b0527fe 1229 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1230 goto got_it;
1231 else
1232 tmp = doevery;
1233 }
a0ed51b3 1234 else
ffc61ed2
JH
1235 tmp = 1;
1236 s++;
a0ed51b3 1237 }
a0ed51b3
LW
1238 }
1239 break;
bbce6d69 1240 case ALNUML:
3280af22 1241 PL_reg_flags |= RF_tainted;
ffc61ed2 1242 if (do_utf8) {
078c425b 1243 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1244 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1245 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1246 goto got_it;
1247 else
1248 tmp = doevery;
1249 }
a687059c 1250 else
ffc61ed2 1251 tmp = 1;
078c425b 1252 s += uskip;
a0d0e21e 1253 }
a0d0e21e 1254 }
ffc61ed2
JH
1255 else {
1256 while (s < strend) {
1257 if (isALNUM_LC(*s)) {
3b0527fe 1258 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1259 goto got_it;
1260 else
1261 tmp = doevery;
1262 }
a0ed51b3 1263 else
ffc61ed2
JH
1264 tmp = 1;
1265 s++;
a0ed51b3 1266 }
a0ed51b3
LW
1267 }
1268 break;
a0d0e21e 1269 case NALNUM:
ffc61ed2 1270 if (do_utf8) {
1a4fad37 1271 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1272 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1273 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1274 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1275 goto got_it;
1276 else
1277 tmp = doevery;
1278 }
bbce6d69 1279 else
ffc61ed2 1280 tmp = 1;
078c425b 1281 s += uskip;
bbce6d69 1282 }
bbce6d69 1283 }
ffc61ed2
JH
1284 else {
1285 while (s < strend) {
1286 if (!isALNUM(*s)) {
3b0527fe 1287 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1288 goto got_it;
1289 else
1290 tmp = doevery;
1291 }
a0ed51b3 1292 else
ffc61ed2
JH
1293 tmp = 1;
1294 s++;
a0ed51b3 1295 }
a0ed51b3
LW
1296 }
1297 break;
bbce6d69 1298 case NALNUML:
3280af22 1299 PL_reg_flags |= RF_tainted;
ffc61ed2 1300 if (do_utf8) {
078c425b 1301 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1302 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1303 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1304 goto got_it;
1305 else
1306 tmp = doevery;
1307 }
a687059c 1308 else
ffc61ed2 1309 tmp = 1;
078c425b 1310 s += uskip;
a687059c 1311 }
a0d0e21e 1312 }
ffc61ed2
JH
1313 else {
1314 while (s < strend) {
1315 if (!isALNUM_LC(*s)) {
3b0527fe 1316 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1317 goto got_it;
1318 else
1319 tmp = doevery;
1320 }
a0ed51b3 1321 else
ffc61ed2
JH
1322 tmp = 1;
1323 s++;
a0ed51b3 1324 }
a0ed51b3
LW
1325 }
1326 break;
a0d0e21e 1327 case SPACE:
ffc61ed2 1328 if (do_utf8) {
1a4fad37 1329 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1330 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1331 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1332 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1333 goto got_it;
1334 else
1335 tmp = doevery;
1336 }
a0d0e21e 1337 else
ffc61ed2 1338 tmp = 1;
078c425b 1339 s += uskip;
2304df62 1340 }
a0d0e21e 1341 }
ffc61ed2
JH
1342 else {
1343 while (s < strend) {
1344 if (isSPACE(*s)) {
3b0527fe 1345 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1346 goto got_it;
1347 else
1348 tmp = doevery;
1349 }
a0ed51b3 1350 else
ffc61ed2
JH
1351 tmp = 1;
1352 s++;
a0ed51b3 1353 }
a0ed51b3
LW
1354 }
1355 break;
bbce6d69 1356 case SPACEL:
3280af22 1357 PL_reg_flags |= RF_tainted;
ffc61ed2 1358 if (do_utf8) {
078c425b 1359 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1360 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1361 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1362 goto got_it;
1363 else
1364 tmp = doevery;
1365 }
bbce6d69 1366 else
ffc61ed2 1367 tmp = 1;
078c425b 1368 s += uskip;
bbce6d69 1369 }
bbce6d69 1370 }
ffc61ed2
JH
1371 else {
1372 while (s < strend) {
1373 if (isSPACE_LC(*s)) {
3b0527fe 1374 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1375 goto got_it;
1376 else
1377 tmp = doevery;
1378 }
a0ed51b3 1379 else
ffc61ed2
JH
1380 tmp = 1;
1381 s++;
a0ed51b3 1382 }
a0ed51b3
LW
1383 }
1384 break;
a0d0e21e 1385 case NSPACE:
ffc61ed2 1386 if (do_utf8) {
1a4fad37 1387 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1388 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1389 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1390 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1391 goto got_it;
1392 else
1393 tmp = doevery;
1394 }
a0d0e21e 1395 else
ffc61ed2 1396 tmp = 1;
078c425b 1397 s += uskip;
a687059c 1398 }
a0d0e21e 1399 }
ffc61ed2
JH
1400 else {
1401 while (s < strend) {
1402 if (!isSPACE(*s)) {
3b0527fe 1403 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1404 goto got_it;
1405 else
1406 tmp = doevery;
1407 }
a0ed51b3 1408 else
ffc61ed2
JH
1409 tmp = 1;
1410 s++;
a0ed51b3 1411 }
a0ed51b3
LW
1412 }
1413 break;
bbce6d69 1414 case NSPACEL:
3280af22 1415 PL_reg_flags |= RF_tainted;
ffc61ed2 1416 if (do_utf8) {
078c425b 1417 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1418 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1419 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1420 goto got_it;
1421 else
1422 tmp = doevery;
1423 }
bbce6d69 1424 else
ffc61ed2 1425 tmp = 1;
078c425b 1426 s += uskip;
bbce6d69 1427 }
bbce6d69 1428 }
ffc61ed2
JH
1429 else {
1430 while (s < strend) {
1431 if (!isSPACE_LC(*s)) {
3b0527fe 1432 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1433 goto got_it;
1434 else
1435 tmp = doevery;
1436 }
a0ed51b3 1437 else
ffc61ed2
JH
1438 tmp = 1;
1439 s++;
a0ed51b3 1440 }
a0ed51b3
LW
1441 }
1442 break;
a0d0e21e 1443 case DIGIT:
ffc61ed2 1444 if (do_utf8) {
1a4fad37 1445 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1446 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1447 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1448 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1449 goto got_it;
1450 else
1451 tmp = doevery;
1452 }
a0d0e21e 1453 else
ffc61ed2 1454 tmp = 1;
078c425b 1455 s += uskip;
2b69d0c2 1456 }
a0d0e21e 1457 }
ffc61ed2
JH
1458 else {
1459 while (s < strend) {
1460 if (isDIGIT(*s)) {
3b0527fe 1461 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1462 goto got_it;
1463 else
1464 tmp = doevery;
1465 }
a0ed51b3 1466 else
ffc61ed2
JH
1467 tmp = 1;
1468 s++;
a0ed51b3 1469 }
a0ed51b3
LW
1470 }
1471 break;
b8c5462f
JH
1472 case DIGITL:
1473 PL_reg_flags |= RF_tainted;
ffc61ed2 1474 if (do_utf8) {
078c425b 1475 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1476 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1477 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1478 goto got_it;
1479 else
1480 tmp = doevery;
1481 }
b8c5462f 1482 else
ffc61ed2 1483 tmp = 1;
078c425b 1484 s += uskip;
b8c5462f 1485 }
b8c5462f 1486 }
ffc61ed2
JH
1487 else {
1488 while (s < strend) {
1489 if (isDIGIT_LC(*s)) {
3b0527fe 1490 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1491 goto got_it;
1492 else
1493 tmp = doevery;
1494 }
b8c5462f 1495 else
ffc61ed2
JH
1496 tmp = 1;
1497 s++;
b8c5462f 1498 }
b8c5462f
JH
1499 }
1500 break;
a0d0e21e 1501 case NDIGIT:
ffc61ed2 1502 if (do_utf8) {
1a4fad37 1503 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1504 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1505 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1506 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1507 goto got_it;
1508 else
1509 tmp = doevery;
1510 }
a0d0e21e 1511 else
ffc61ed2 1512 tmp = 1;
078c425b 1513 s += uskip;
a687059c 1514 }
a0d0e21e 1515 }
ffc61ed2
JH
1516 else {
1517 while (s < strend) {
1518 if (!isDIGIT(*s)) {
3b0527fe 1519 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1520 goto got_it;
1521 else
1522 tmp = doevery;
1523 }
a0ed51b3 1524 else
ffc61ed2
JH
1525 tmp = 1;
1526 s++;
a0ed51b3 1527 }
a0ed51b3
LW
1528 }
1529 break;
b8c5462f
JH
1530 case NDIGITL:
1531 PL_reg_flags |= RF_tainted;
ffc61ed2 1532 if (do_utf8) {
078c425b 1533 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1534 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1535 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1536 goto got_it;
1537 else
1538 tmp = doevery;
1539 }
b8c5462f 1540 else
ffc61ed2 1541 tmp = 1;
078c425b 1542 s += uskip;
b8c5462f 1543 }
a0ed51b3 1544 }
ffc61ed2
JH
1545 else {
1546 while (s < strend) {
1547 if (!isDIGIT_LC(*s)) {
3b0527fe 1548 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1549 goto got_it;
1550 else
1551 tmp = doevery;
1552 }
cf93c79d 1553 else
ffc61ed2
JH
1554 tmp = 1;
1555 s++;
b8c5462f 1556 }
b8c5462f
JH
1557 }
1558 break;
07be1b83
YO
1559 case TRIE:
1560 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1561 {
1562 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1563 trie_type = do_utf8 ?
1564 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1565 : trie_plain;
1566 /* what trie are we using right now */
1567 reg_ac_data *aho
1568 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1569 reg_trie_data *trie=aho->trie;
1570
1571 const char *last_start = strend - trie->minlen;
1572 const char *real_start = s;
1573 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1574 SV *sv_points;
1575 U8 **points; /* map of where we were in the input string
1576 when reading a given string. For ASCII this
1577 is unnecessary overhead as the relationship
1578 is always 1:1, but for unicode, especially
1579 case folded unicode this is not true. */
f9e705e8 1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1581
1582 GET_RE_DEBUG_FLAGS_DECL;
1583
be8e71aa
YO
1584 /* We can't just allocate points here. We need to wrap it in
1585 * an SV so it gets freed properly if there is a croak while
1586 * running the match */
1587 ENTER;
1588 SAVETMPS;
1589 sv_points=newSV(maxlen * sizeof(U8 *));
1590 SvCUR_set(sv_points,
1591 maxlen * sizeof(U8 *));
1592 SvPOK_on(sv_points);
1593 sv_2mortal(sv_points);
1594 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1595
1596 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1597 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1598 s++;
1599 }
1600 }
1601
1602 while (s <= last_start) {
1603 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1604 U8 *uc = (U8*)s;
1605 U16 charid = 0;
1606 U32 base = 1;
1607 U32 state = 1;
1608 UV uvc = 0;
1609 STRLEN len = 0;
1610 STRLEN foldlen = 0;
1611 U8 *uscan = (U8*)NULL;
1612 U8 *leftmost = NULL;
1613
1614 U32 pointpos = 0;
1615
1616 while ( state && uc <= (U8*)strend ) {
1617 int failed=0;
1618 if (aho->states[ state ].wordnum) {
1619 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1620 if (!leftmost || lpos < leftmost)
1621 leftmost= lpos;
1622 if (base==0) break;
1623 }
1624 points[pointpos++ % maxlen]= uc;
1625 switch (trie_type) {
1626 case trie_utf8_fold:
1627 if ( foldlen>0 ) {
1628 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1629 foldlen -= len;
1630 uscan += len;
1631 len=0;
1632 } else {
07be1b83
YO
1633 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1634 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1635 foldlen -= UNISKIP( uvc );
1636 uscan = foldbuf + UNISKIP( uvc );
1637 }
1638 break;
1639 case trie_utf8:
1640 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1641 &len, uniflags );
1642 break;
1643 case trie_plain:
1644 uvc = (UV)*uc;
1645 len = 1;
1646 }
1647
1648 if (uvc < 256) {
1649 charid = trie->charmap[ uvc ];
1650 }
1651 else {
1652 charid = 0;
1653 if (trie->widecharmap) {
1654 SV** const svpp = hv_fetch(trie->widecharmap,
1655 (char*)&uvc, sizeof(UV), 0);
1656 if (svpp)
1657 charid = (U16)SvIV(*svpp);
1658 }
1659 }
1660 DEBUG_TRIE_EXECUTE_r(
1661 PerlIO_printf(Perl_debug_log,
1662 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1663 (int)((const char*)uc - real_start), charid, uvc)
1664 );
1665 uc += len;
1666
1667 do {
1668 U32 word = aho->states[ state ].wordnum;
1669 base = aho->states[ state ].trans.base;
1670
1671 DEBUG_TRIE_EXECUTE_r(
1672 PerlIO_printf( Perl_debug_log,
1673 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1674 failed ? "Fail transition to " : "",
1675 state, base, uvc, word)
1676 );
1677 if ( base ) {
1678 U32 tmp;
1679 if (charid &&
1680 (base + charid > trie->uniquecharcount )
1681 && (base + charid - 1 - trie->uniquecharcount
1682 < trie->lasttrans)
1683 && trie->trans[base + charid - 1 -
1684 trie->uniquecharcount].check == state
1685 && (tmp=trie->trans[base + charid - 1 -
1686 trie->uniquecharcount ].next))
1687 {
1688 state = tmp;
1689 break;
1690 }
1691 else {
1692 failed++;
1693 if ( state == 1 )
1694 break;
1695 else
1696 state = aho->fail[state];
1697 }
1698 }
1699 else {
1700 /* we must be accepting here */
1701 failed++;
1702 break;
1703 }
1704 } while(state);
1705 if (failed) {
1706 if (leftmost)
1707 break;
1708 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1709 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1710 uc++;
1711 }
1712 }
1713 }
1714 }
1715 if ( aho->states[ state ].wordnum ) {
1716 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1717 if (!leftmost || lpos < leftmost)
1718 leftmost = lpos;
1719 }
1720 DEBUG_TRIE_EXECUTE_r(
1721 PerlIO_printf( Perl_debug_log,
1722 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1723 "All done: ",
1724 state, base, uvc)
1725 );
1726 if (leftmost) {
1727 s = (char*)leftmost;
be8e71aa
YO
1728 if (!reginfo || regtry(reginfo, s)) {
1729 FREETMPS;
1730 LEAVE;
07be1b83 1731 goto got_it;
be8e71aa 1732 }
07be1b83
YO
1733 s = HOPc(s,1);
1734 } else {
1735 break;
1736 }
1737 }
be8e71aa
YO
1738 FREETMPS;
1739 LEAVE;
07be1b83
YO
1740 }
1741 break;
b3c9acc1 1742 default:
3c3eec57
GS
1743 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1744 break;
d6a28714 1745 }
6eb5f6b9
JH
1746 return 0;
1747 got_it:
1748 return s;
1749}
1750
1751/*
1752 - regexec_flags - match a regexp against a string
1753 */
1754I32
1755Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1756 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1757/* strend: pointer to null at end of string */
1758/* strbeg: real beginning of string */
1759/* minend: end of match must be >=minend after stringarg. */
1760/* data: May be used for some additional optimizations. */
1761/* nosave: For optimizations. */
1762{
97aff369 1763 dVAR;
6eb5f6b9
JH
1764 register char *s;
1765 register regnode *c;
1766 register char *startpos = stringarg;
6eb5f6b9
JH
1767 I32 minlen; /* must match at least this many chars */
1768 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1769 I32 end_shift = 0; /* Same for the end. */ /* CC */
1770 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1771 char *scream_olds = NULL;
3dab1dad 1772 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1773 const bool do_utf8 = DO_UTF8(sv);
2757e526 1774 I32 multiline;
2a782b5b 1775#ifdef DEBUGGING
2757e526
JH
1776 SV* dsv0;
1777 SV* dsv1;
2a782b5b 1778#endif
3b0527fe 1779 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1780
1781 GET_RE_DEBUG_FLAGS_DECL;
1782
9d4ba2ae 1783 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1784
1785 /* Be paranoid... */
1786 if (prog == NULL || startpos == NULL) {
1787 Perl_croak(aTHX_ "NULL regexp parameter");
1788 return 0;
1789 }
1790
2757e526 1791 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1792 reginfo.prog = prog;
2757e526
JH
1793
1794#ifdef DEBUGGING
1795 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1796 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1797#endif
1798
bac06658
JH
1799 RX_MATCH_UTF8_set(prog, do_utf8);
1800
6eb5f6b9 1801 minlen = prog->minlen;
61a36c01 1802 if (strend - startpos < minlen) {
a3621e74 1803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1804 "String too short [regexec_flags]...\n"));
1805 goto phooey;
1aa99e6b 1806 }
6eb5f6b9 1807
6eb5f6b9
JH
1808 /* Check validity of program. */
1809 if (UCHARAT(prog->program) != REG_MAGIC) {
1810 Perl_croak(aTHX_ "corrupted regexp program");
1811 }
1812
1813 PL_reg_flags = 0;
1814 PL_reg_eval_set = 0;
1815 PL_reg_maxiter = 0;
1816
1817 if (prog->reganch & ROPT_UTF8)
1818 PL_reg_flags |= RF_utf8;
1819
1820 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1821 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1822 PL_bostr = strbeg;
3b0527fe 1823 reginfo.sv = sv;
6eb5f6b9
JH
1824
1825 /* Mark end of line for $ (and such) */
1826 PL_regeol = strend;
1827
1828 /* see how far we have to get to not match where we matched before */
3b0527fe 1829 reginfo.till = startpos+minend;
6eb5f6b9 1830
6eb5f6b9
JH
1831 /* If there is a "must appear" string, look for it. */
1832 s = startpos;
1833
3b0527fe 1834 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1835 MAGIC *mg;
1836
1837 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1838 reginfo.ganch = startpos;
6eb5f6b9
JH
1839 else if (sv && SvTYPE(sv) >= SVt_PVMG
1840 && SvMAGIC(sv)
14befaf4
DM
1841 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1842 && mg->mg_len >= 0) {
3b0527fe 1843 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1844 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1845 if (s > reginfo.ganch)
6eb5f6b9 1846 goto phooey;
3b0527fe 1847 s = reginfo.ganch;
6eb5f6b9
JH
1848 }
1849 }
1850 else /* pos() not defined */
3b0527fe 1851 reginfo.ganch = strbeg;
6eb5f6b9
JH
1852 }
1853
a0714e2c 1854 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1855 re_scream_pos_data d;
1856
1857 d.scream_olds = &scream_olds;
1858 d.scream_pos = &scream_pos;
1859 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1860 if (!s) {
a3621e74 1861 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1862 goto phooey; /* not present */
3fa9c3d7 1863 }
6eb5f6b9
JH
1864 }
1865
a3621e74 1866 DEBUG_EXECUTE_r({
1df70142
AL
1867 const char * const s0 = UTF
1868 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1869 UNI_DISPLAY_REGEX)
1870 : prog->precomp;
bb7a0f54 1871 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1872 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1873 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1874 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1875 if (!PL_colorset)
1876 reginitcolors();
1877 PerlIO_printf(Perl_debug_log,
a0288114 1878 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1879 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1880 len0, len0, s0,
2a782b5b 1881 PL_colors[1],
9e55ce06 1882 len0 > 60 ? "..." : "",
2a782b5b 1883 PL_colors[0],
9e55ce06
JH
1884 (int)(len1 > 60 ? 60 : len1),
1885 s1, PL_colors[1],
1886 (len1 > 60 ? "..." : "")
2a782b5b
JH
1887 );
1888 });
6eb5f6b9
JH
1889
1890 /* Simplest case: anchored match need be tried only once. */
1891 /* [unless only anchor is BOL and multiline is set] */
1892 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1893 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1894 goto got_it;
7fba1cd6 1895 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1896 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1897 {
1898 char *end;
1899
1900 if (minlen)
1901 dontbother = minlen - 1;
1aa99e6b 1902 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1903 /* for multiline we only have to try after newlines */
33b8afdf 1904 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1905 if (s == startpos)
1906 goto after_try;
1907 while (1) {
3b0527fe 1908 if (regtry(&reginfo, s))
6eb5f6b9
JH
1909 goto got_it;
1910 after_try:
1911 if (s >= end)
1912 goto phooey;
1913 if (prog->reganch & RE_USE_INTUIT) {
1914 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1915 if (!s)
1916 goto phooey;
1917 }
1918 else
1919 s++;
1920 }
1921 } else {
1922 if (s > startpos)
1923 s--;
1924 while (s < end) {
1925 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1926 if (regtry(&reginfo, s))
6eb5f6b9
JH
1927 goto got_it;
1928 }
1929 }
1930 }
1931 }
1932 goto phooey;
1933 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1934 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1935 goto got_it;
1936 goto phooey;
1937 }
1938
1939 /* Messy cases: unanchored match. */
33b8afdf 1940 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1941 /* we have /x+whatever/ */
1942 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1943 char ch;
bf93d4cc
GS
1944#ifdef DEBUGGING
1945 int did_match = 0;
1946#endif
33b8afdf
JH
1947 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1948 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1949 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1950
1aa99e6b 1951 if (do_utf8) {
6eb5f6b9
JH
1952 while (s < strend) {
1953 if (*s == ch) {
a3621e74 1954 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1955 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1956 s += UTF8SKIP(s);
1957 while (s < strend && *s == ch)
1958 s += UTF8SKIP(s);
1959 }
1960 s += UTF8SKIP(s);
1961 }
1962 }
1963 else {
1964 while (s < strend) {
1965 if (*s == ch) {
a3621e74 1966 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1967 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1968 s++;
1969 while (s < strend && *s == ch)
1970 s++;
1971 }
1972 s++;
1973 }
1974 }
a3621e74 1975 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1976 PerlIO_printf(Perl_debug_log,
b7953727
JH
1977 "Did not find anchored character...\n")
1978 );
6eb5f6b9 1979 }
a0714e2c
SS
1980 else if (prog->anchored_substr != NULL
1981 || prog->anchored_utf8 != NULL
1982 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1983 && prog->float_max_offset < strend - s)) {
1984 SV *must;
1985 I32 back_max;
1986 I32 back_min;
1987 char *last;
6eb5f6b9 1988 char *last1; /* Last position checked before */
bf93d4cc
GS
1989#ifdef DEBUGGING
1990 int did_match = 0;
1991#endif
33b8afdf
JH
1992 if (prog->anchored_substr || prog->anchored_utf8) {
1993 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1994 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1995 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1996 back_max = back_min = prog->anchored_offset;
1997 } else {
1998 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1999 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2000 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2001 back_max = prog->float_max_offset;
2002 back_min = prog->float_min_offset;
2003 }
2004 if (must == &PL_sv_undef)
2005 /* could not downgrade utf8 check substring, so must fail */
2006 goto phooey;
2007
2008 last = HOP3c(strend, /* Cannot start after this */
2009 -(I32)(CHR_SVLEN(must)
2010 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2011
2012 if (s > PL_bostr)
2013 last1 = HOPc(s, -1);
2014 else
2015 last1 = s - 1; /* bogus */
2016
a0288114 2017 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2018 check_substr==must. */
2019 scream_pos = -1;
2020 dontbother = end_shift;
2021 strend = HOPc(strend, -dontbother);
2022 while ( (s <= last) &&
9041c2e3 2023 ((flags & REXEC_SCREAM)
1aa99e6b 2024 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2025 end_shift, &scream_pos, 0))
1aa99e6b 2026 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2027 (unsigned char*)strend, must,
7fba1cd6 2028 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2029 /* we may be pointing at the wrong string */
2030 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2031 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2032 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2033 if (HOPc(s, -back_max) > last1) {
2034 last1 = HOPc(s, -back_min);
2035 s = HOPc(s, -back_max);
2036 }
2037 else {
52657f30 2038 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2039
2040 last1 = HOPc(s, -back_min);
52657f30 2041 s = t;
6eb5f6b9 2042 }
1aa99e6b 2043 if (do_utf8) {
6eb5f6b9 2044 while (s <= last1) {
3b0527fe 2045 if (regtry(&reginfo, s))
6eb5f6b9
JH
2046 goto got_it;
2047 s += UTF8SKIP(s);
2048 }
2049 }
2050 else {
2051 while (s <= last1) {
3b0527fe 2052 if (regtry(&reginfo, s))
6eb5f6b9
JH
2053 goto got_it;
2054 s++;
2055 }
2056 }
2057 }
a3621e74 2058 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2059 PerlIO_printf(Perl_debug_log,
a0288114 2060 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2061 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2062 ? "anchored" : "floating"),
2063 PL_colors[0],
2064 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2065 SvPVX_const(must),
b7953727
JH
2066 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2067 );
6eb5f6b9
JH
2068 goto phooey;
2069 }
155aba94 2070 else if ((c = prog->regstclass)) {
f14c76ed 2071 if (minlen) {
be8e71aa 2072 const OPCODE op = OP(prog->regstclass);
66e933ab 2073 /* don't bother with what can't match */
07be1b83 2074 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2075 strend = HOPc(strend, -(minlen - 1));
2076 }
a3621e74 2077 DEBUG_EXECUTE_r({
be8e71aa 2078 SV * const prop = sv_newmortal();
cfd0369c
NC
2079 const char *s0;
2080 const char *s1;
9e55ce06
JH
2081 int len0;
2082 int len1;
2083
32fc9b6a 2084 regprop(prog, prop, c);
9e55ce06 2085 s0 = UTF ?
3f7c398e 2086 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2087 UNI_DISPLAY_REGEX) :
cfd0369c 2088 SvPVX_const(prop);
9e55ce06
JH
2089 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2090 s1 = UTF ?
c728cb41 2091 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2092 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2093 PerlIO_printf(Perl_debug_log,
07be1b83 2094 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2095 len0, len0, s0,
07be1b83 2096 len1, len1, s1, (int)(strend - s));
ffc61ed2 2097 });
3b0527fe 2098 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2099 goto got_it;
07be1b83 2100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2101 }
2102 else {
2103 dontbother = 0;
a0714e2c 2104 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2105 /* Trim the end. */
d6a28714 2106 char *last;
33b8afdf
JH
2107 SV* float_real;
2108
2109 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2110 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2111 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2112
2113 if (flags & REXEC_SCREAM) {
33b8afdf 2114 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2115 end_shift, &scream_pos, 1); /* last one */
2116 if (!last)
ffc61ed2 2117 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2118 /* we may be pointing at the wrong string */
2119 else if (RX_MATCH_COPIED(prog))
3f7c398e 2120 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2121 }
d6a28714
JH
2122 else {
2123 STRLEN len;
cfd0369c 2124 const char * const little = SvPV_const(float_real, len);
d6a28714 2125
33b8afdf 2126 if (SvTAIL(float_real)) {
d6a28714
JH
2127 if (memEQ(strend - len + 1, little, len - 1))
2128 last = strend - len + 1;
7fba1cd6 2129 else if (!multiline)
9041c2e3 2130 last = memEQ(strend - len, little, len)
bd61b366 2131 ? strend - len : NULL;
b8c5462f 2132 else
d6a28714
JH
2133 goto find_last;
2134 } else {
2135 find_last:
9041c2e3 2136 if (len)
d6a28714 2137 last = rninstr(s, strend, little, little + len);
b8c5462f 2138 else
a0288114 2139 last = strend; /* matching "$" */
b8c5462f 2140 }
b8c5462f 2141 }
bf93d4cc 2142 if (last == NULL) {
a3621e74 2143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2144 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2145 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2146 goto phooey; /* Should not happen! */
2147 }
d6a28714
JH
2148 dontbother = strend - last + prog->float_min_offset;
2149 }
2150 if (minlen && (dontbother < minlen))
2151 dontbother = minlen - 1;
2152 strend -= dontbother; /* this one's always in bytes! */
2153 /* We don't know much -- general case. */
1aa99e6b 2154 if (do_utf8) {
d6a28714 2155 for (;;) {
3b0527fe 2156 if (regtry(&reginfo, s))
d6a28714
JH
2157 goto got_it;
2158 if (s >= strend)
2159 break;
b8c5462f 2160 s += UTF8SKIP(s);
d6a28714
JH
2161 };
2162 }
2163 else {
2164 do {
3b0527fe 2165 if (regtry(&reginfo, s))
d6a28714
JH
2166 goto got_it;
2167 } while (s++ < strend);
2168 }
2169 }
2170
2171 /* Failure. */
2172 goto phooey;
2173
2174got_it:
2175 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2176
2177 if (PL_reg_eval_set) {
2178 /* Preserve the current value of $^R */
2179 if (oreplsv != GvSV(PL_replgv))
2180 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2181 restored, the value remains
2182 the same. */
4f639d21 2183 restore_pos(aTHX_ prog);
d6a28714
JH
2184 }
2185
2186 /* make sure $`, $&, $', and $digit will work later */
2187 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2188 RX_MATCH_COPY_FREE(prog);
d6a28714 2189 if (flags & REXEC_COPY_STR) {
be8e71aa 2190 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2191#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2192 if ((SvIsCOW(sv)
2193 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2194 if (DEBUG_C_TEST) {
2195 PerlIO_printf(Perl_debug_log,
2196 "Copy on write: regexp capture, type %d\n",
2197 (int) SvTYPE(sv));
2198 }
2199 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2200 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2201 assert (SvPOKp(prog->saved_copy));
2202 } else
2203#endif
2204 {
2205 RX_MATCH_COPIED_on(prog);
2206 s = savepvn(strbeg, i);
2207 prog->subbeg = s;
2208 }
d6a28714 2209 prog->sublen = i;
d6a28714
JH
2210 }
2211 else {
2212 prog->subbeg = strbeg;
2213 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2214 }
2215 }
9041c2e3 2216
d6a28714
JH
2217 return 1;
2218
2219phooey:
a3621e74 2220 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2221 PL_colors[4], PL_colors[5]));
d6a28714 2222 if (PL_reg_eval_set)
4f639d21 2223 restore_pos(aTHX_ prog);
d6a28714
JH
2224 return 0;
2225}
2226
2227/*
2228 - regtry - try match at specific point
2229 */
2230STATIC I32 /* 0 failure, 1 success */
3b0527fe 2231S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2232{
97aff369 2233 dVAR;
d6a28714
JH
2234 register I32 *sp;
2235 register I32 *ep;
2236 CHECKPOINT lastcp;
3b0527fe 2237 regexp *prog = reginfo->prog;
a3621e74 2238 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2239
02db2b7b
IZ
2240#ifdef DEBUGGING
2241 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2242#endif
d6a28714
JH
2243 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2244 MAGIC *mg;
2245
2246 PL_reg_eval_set = RS_init;
a3621e74 2247 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2248 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2249 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2250 ));
e8347627 2251 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2252 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2253 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2254 SAVETMPS;
2255 /* Apparently this is not needed, judging by wantarray. */
e8347627 2256 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2257 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2258
3b0527fe 2259 if (reginfo->sv) {
d6a28714 2260 /* Make $_ available to executed code. */
3b0527fe 2261 if (reginfo->sv != DEFSV) {
59f00321 2262 SAVE_DEFSV;
3b0527fe 2263 DEFSV = reginfo->sv;
b8c5462f 2264 }
d6a28714 2265
3b0527fe
DM
2266 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2267 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2268 /* prepare for quick setting of pos */
d300d9fa
NC
2269#ifdef PERL_OLD_COPY_ON_WRITE
2270 if (SvIsCOW(sv))
2271 sv_force_normal_flags(sv, 0);
2272#endif
3dab1dad 2273 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2274 &PL_vtbl_mglob, NULL, 0);
d6a28714 2275 mg->mg_len = -1;
b8c5462f 2276 }
d6a28714
JH
2277 PL_reg_magic = mg;
2278 PL_reg_oldpos = mg->mg_len;
4f639d21 2279 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2280 }
09687e5a 2281 if (!PL_reg_curpm) {
a02a5408 2282 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2283#ifdef USE_ITHREADS
2284 {
be8e71aa 2285 SV* const repointer = newSViv(0);
577e12cc 2286 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2287 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2288 av_push(PL_regex_padav,repointer);
2289 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2290 PL_regex_pad = AvARRAY(PL_regex_padav);
2291 }
2292#endif
2293 }
aaa362c4 2294 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2295 PL_reg_oldcurpm = PL_curpm;
2296 PL_curpm = PL_reg_curpm;
2297 if (RX_MATCH_COPIED(prog)) {
2298 /* Here is a serious problem: we cannot rewrite subbeg,
2299 since it may be needed if this match fails. Thus
2300 $` inside (?{}) could fail... */
2301 PL_reg_oldsaved = prog->subbeg;
2302 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2303#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2304 PL_nrs = prog->saved_copy;
2305#endif
d6a28714
JH
2306 RX_MATCH_COPIED_off(prog);
2307 }
2308 else
bd61b366 2309 PL_reg_oldsaved = NULL;
d6a28714
JH
2310 prog->subbeg = PL_bostr;
2311 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2312 }
973dddac 2313 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2314 PL_reginput = startpos;
2315 PL_regstartp = prog->startp;
2316 PL_regendp = prog->endp;
2317 PL_reglastparen = &prog->lastparen;
a01268b5 2318 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2319 prog->lastparen = 0;
03994de8 2320 prog->lastcloseparen = 0;
d6a28714 2321 PL_regsize = 0;
a3621e74 2322 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2323 if (PL_reg_start_tmpl <= prog->nparens) {
2324 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2325 if(PL_reg_start_tmp)
2326 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2327 else
a02a5408 2328 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2329 }
2330
2331 /* XXXX What this code is doing here?!!! There should be no need
2332 to do this again and again, PL_reglastparen should take care of
3dd2943c 2333 this! --ilya*/
dafc8851
JH
2334
2335 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2336 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2337 * PL_reglastparen), is not needed at all by the test suite
2338 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2339 * enough, for building DynaLoader, or otherwise this
2340 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2341 * will happen. Meanwhile, this code *is* needed for the
2342 * above-mentioned test suite tests to succeed. The common theme
2343 * on those tests seems to be returning null fields from matches.
2344 * --jhi */
dafc8851 2345#if 1
d6a28714
JH
2346 sp = prog->startp;
2347 ep = prog->endp;
2348 if (prog->nparens) {
097eb12c 2349 register I32 i;
eb160463 2350 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2351 *++sp = -1;
2352 *++ep = -1;
2353 }
2354 }
dafc8851 2355#endif
02db2b7b 2356 REGCP_SET(lastcp);
3b0527fe 2357 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2358 prog->endp[0] = PL_reginput - PL_bostr;
2359 return 1;
2360 }
02db2b7b 2361 REGCP_UNWIND(lastcp);
d6a28714
JH
2362 return 0;
2363}
2364
02db2b7b 2365
8ba1375e
MJD
2366#define sayYES goto yes
2367#define sayNO goto no
e0f9d4a8 2368#define sayNO_ANYOF goto no_anyof
8ba1375e 2369#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2370#define sayNO_FINAL goto no_final
2371#define sayNO_SILENT goto do_no
2372#define saySAME(x) if (x) goto yes; else goto no
2373
3ab3c9b4
HS
2374#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2375#define POSCACHE_SEEN 1 /* we know what we're caching */
2376#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2377
3ab3c9b4 2378#define CACHEsayYES STMT_START { \
d8319b27 2379 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2380 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2381 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2382 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2383 } \
2384 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2385 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2386 } \
2387 else { \
3ab3c9b4
HS
2388 /* cache records failure, but this is success */ \
2389 DEBUG_r( \
2390 PerlIO_printf(Perl_debug_log, \
2391 "%*s (remove success from failure cache)\n", \
2392 REPORT_CODE_OFF+PL_regindent*2, "") \
2393 ); \
d8319b27 2394 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2395 } \
2396 } \
2397 sayYES; \
2398} STMT_END
7409bbd3 2399
3ab3c9b4 2400#define CACHEsayNO STMT_START { \
d8319b27 2401 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2402 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2403 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2404 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2405 } \
2406 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2407 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2408 } \
2409 else { \
3ab3c9b4
HS
2410 /* cache records success, but this is failure */ \
2411 DEBUG_r( \
2412 PerlIO_printf(Perl_debug_log, \
2413 "%*s (remove failure from success cache)\n", \
2414 REPORT_CODE_OFF+PL_regindent*2, "") \
2415 ); \
d8319b27 2416 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2417 } \
2418 } \
2419 sayNO; \
2420} STMT_END
2421
a3621e74
YO
2422/* this is used to determine how far from the left messages like
2423 'failed...' are printed. Currently 29 makes these messages line
2424 up with the opcode they refer to. Earlier perls used 25 which
2425 left these messages outdented making reviewing a debug output
2426 quite difficult.
2427*/
2428#define REPORT_CODE_OFF 29
2429
2430
2431/* Make sure there is a test for this +1 options in re_tests */
2432#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2433
40a82448
DM
2434#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2435#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2436
86545054
DM
2437#define SLAB_FIRST(s) (&(s)->states[0])
2438#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2439
5d9a96ca
DM
2440/* grab a new slab and return the first slot in it */
2441
2442STATIC regmatch_state *
2443S_push_slab(pTHX)
2444{
54df2634
NC
2445#if PERL_VERSION < 9
2446 dMY_CXT;
2447#endif
5d9a96ca
DM
2448 regmatch_slab *s = PL_regmatch_slab->next;
2449 if (!s) {
2450 Newx(s, 1, regmatch_slab);
2451 s->prev = PL_regmatch_slab;
2452 s->next = NULL;
2453 PL_regmatch_slab->next = s;
2454 }
2455 PL_regmatch_slab = s;
86545054 2456 return SLAB_FIRST(s);
5d9a96ca 2457}
5b47454d 2458
95b24440
DM
2459/* simulate a recursive call to regmatch */
2460
2461#define REGMATCH(ns, where) \
5d9a96ca
DM
2462 st->scan = scan; \
2463 scan = (ns); \
2464 st->resume_state = resume_##where; \
95b24440
DM
2465 goto start_recurse; \
2466 resume_point_##where:
2467
40a82448
DM
2468/* push a new state then goto it */
2469
2470#define PUSH_STATE_GOTO(state, node) \
2471 scan = node; \
2472 st->resume_state = state; \
2473 goto push_state;
2474
2475/* push a new state with success backtracking, then goto it */
2476
2477#define PUSH_YES_STATE_GOTO(state, node) \
2478 scan = node; \
2479 st->resume_state = state; \
2480 goto push_yes_state;
2481
aa283a38 2482
aa283a38 2483
d6a28714
JH
2484/*
2485 - regmatch - main matching routine
2486 *
2487 * Conceptually the strategy is simple: check to see whether the current
2488 * node matches, call self recursively to see whether the rest matches,
2489 * and then act accordingly. In practice we make some effort to avoid
2490 * recursion, in particular by going through "ordinary" nodes (that don't
2491 * need to know whether the rest of the match failed) by a loop instead of
2492 * by recursion.
2493 */
2494/* [lwall] I've hoisted the register declarations to the outer block in order to
2495 * maybe save a little bit of pushing and popping on the stack. It also takes
2496 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2497 *
2498 * This function used to be heavily recursive, but since this had the
2499 * effect of blowing the CPU stack on complex regexes, it has been
2500 * restructured to be iterative, and to save state onto the heap rather
2501 * than the stack. Essentially whereever regmatch() used to be called, it
2502 * pushes the current state, notes where to return, then jumps back into
2503 * the main loop.
2504 *
2505 * Originally the structure of this function used to look something like
2506
2507 S_regmatch() {
2508 int a = 1, b = 2;
2509 ...
2510 while (scan != NULL) {
5d9a96ca 2511 a++; // do stuff with a and b
95b24440
DM
2512 ...
2513 switch (OP(scan)) {
2514 case FOO: {
2515 int local = 3;
2516 ...
2517 if (regmatch(...)) // recurse
2518 goto yes;
2519 }
2520 ...
2521 }
2522 }
2523 yes:
2524 return 1;
2525 }
2526
2527 * Now it looks something like this:
2528
5d9a96ca 2529 typedef struct {
95b24440
DM
2530 int a, b, local;
2531 int resume_state;
5d9a96ca 2532 } regmatch_state;
95b24440
DM
2533
2534 S_regmatch() {
5d9a96ca
DM
2535 regmatch_state *st = new();
2536 int depth=0;
2537 st->a++; // do stuff with a and b
95b24440
DM
2538 ...
2539 while (scan != NULL) {
2540 ...
2541 switch (OP(scan)) {
2542 case FOO: {
5d9a96ca 2543 st->local = 3;
95b24440 2544 ...
5d9a96ca
DM
2545 st->scan = scan;
2546 scan = ...;
2547 st->resume_state = resume_FOO;
2548 goto start_recurse; // recurse
95b24440 2549
5d9a96ca
DM
2550 resume_point_FOO:
2551 if (result)
95b24440
DM
2552 goto yes;
2553 }
2554 ...
2555 }
5d9a96ca
DM
2556 start_recurse:
2557 st = new(); push a new state
2558 st->a = 1; st->b = 2;
2559 depth++;
95b24440 2560 }
5d9a96ca 2561 yes:
95b24440 2562 result = 1;
5d9a96ca
DM
2563 if (depth--) {
2564 st = pop();
95b24440
DM
2565 switch (resume_state) {
2566 case resume_FOO:
2567 goto resume_point_FOO;
2568 ...
2569 }
2570 }
2571 return result
2572 }
2573
2574 * WARNING: this means that any line in this function that contains a
2575 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2576 * regmatch() using gotos instead. Thus the values of any local variables
2577 * not saved in the regmatch_state structure will have been lost when
2578 * execution resumes on the next line .
5d9a96ca
DM
2579 *
2580 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2581 * PL_regmatch_state always points to the currently active state, and
2582 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2583 * The first time regmatch is called, the first slab is allocated, and is
2584 * never freed until interpreter desctruction. When the slab is full,
2585 * a new one is allocated chained to the end. At exit from regmatch, slabs
2586 * allocated since entry are freed.
d6a28714 2587 */
95b24440 2588
40a82448 2589/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2590#define TRIE_next (REGNODE_MAX+1)
2591#define TRIE_next_fail (REGNODE_MAX+2)
2592#define EVAL_A (REGNODE_MAX+3)
2593#define EVAL_A_fail (REGNODE_MAX+4)
2594#define resume_CURLYX (REGNODE_MAX+5)
2595#define resume_WHILEM1 (REGNODE_MAX+6)
2596#define resume_WHILEM2 (REGNODE_MAX+7)
2597#define resume_WHILEM3 (REGNODE_MAX+8)
2598#define resume_WHILEM4 (REGNODE_MAX+9)
2599#define resume_WHILEM5 (REGNODE_MAX+10)
2600#define resume_WHILEM6 (REGNODE_MAX+11)
2601#define BRANCH_next (REGNODE_MAX+12)
2602#define BRANCH_next_fail (REGNODE_MAX+13)
2603#define CURLYM_A (REGNODE_MAX+14)
2604#define CURLYM_A_fail (REGNODE_MAX+15)
2605#define CURLYM_B (REGNODE_MAX+16)
2606#define CURLYM_B_fail (REGNODE_MAX+17)
2607#define IFMATCH_A (REGNODE_MAX+18)
2608#define IFMATCH_A_fail (REGNODE_MAX+19)
2609#define CURLY_B_min_known (REGNODE_MAX+20)
2610#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2611#define CURLY_B_min (REGNODE_MAX+22)
2612#define CURLY_B_min_fail (REGNODE_MAX+23)
2613#define CURLY_B_max (REGNODE_MAX+24)
2614#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2615
2616
3dab1dad 2617#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2618
07be1b83
YO
2619#ifdef DEBUGGING
2620STATIC void
2621S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2622{
2623 const int docolor = *PL_colors[0];
2624 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2625 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2626 /* The part of the string before starttry has one color
2627 (pref0_len chars), between starttry and current
2628 position another one (pref_len - pref0_len chars),
2629 after the current position the third one.
2630 We assume that pref0_len <= pref_len, otherwise we
2631 decrease pref0_len. */
2632 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2633 ? (5 + taill) - l : locinput - PL_bostr;
2634 int pref0_len;
2635
2636 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2637 pref_len++;
2638 pref0_len = pref_len - (locinput - PL_reg_starttry);
2639 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2640 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2641 ? (5 + taill) - pref_len : PL_regeol - locinput);
2642 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2643 l--;
2644 if (pref0_len < 0)
2645 pref0_len = 0;
2646 if (pref0_len > pref_len)
2647 pref0_len = pref_len;
2648 {
2649 const char * const s0 =
2650 do_utf8 && OP(scan) != CANY ?
2651 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2652 pref0_len, 60, UNI_DISPLAY_REGEX) :
2653 locinput - pref_len;
2654 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2655 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2656 pv_uni_display(PERL_DEBUG_PAD(1),
2657 (U8*)(locinput - pref_len + pref0_len),
2658 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2659 locinput - pref_len + pref0_len;
2660 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2661 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2662 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2663 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2664 locinput;
2665 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2666 PerlIO_printf(Perl_debug_log,
2667 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2668 (IV)(locinput - PL_bostr),
2669 PL_colors[4],
2670 len0, s0,
2671 PL_colors[5],
2672 PL_colors[2],
2673 len1, s1,
2674 PL_colors[3],
2675 (docolor ? "" : "> <"),
2676 PL_colors[0],
2677 len2, s2,
2678 PL_colors[1],
2679 15 - l - pref_len + 1,
2680 "");
2681 }
2682}
2683#endif
2684
d6a28714 2685STATIC I32 /* 0 failure, 1 success */
3b0527fe 2686S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2687{
54df2634
NC
2688#if PERL_VERSION < 9
2689 dMY_CXT;
2690#endif
27da23d5 2691 dVAR;
95b24440 2692 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2693 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2694
3b0527fe
DM
2695 regexp *rex = reginfo->prog;
2696
5d9a96ca
DM
2697 regmatch_slab *orig_slab;
2698 regmatch_state *orig_state;
a3621e74 2699
5d9a96ca
DM
2700 /* the current state. This is a cached copy of PL_regmatch_state */
2701 register regmatch_state *st;
95b24440 2702
5d9a96ca
DM
2703 /* cache heavy used fields of st in registers */
2704 register regnode *scan;
2705 register regnode *next;
2706 register I32 n = 0; /* initialize to shut up compiler warning */
2707 register char *locinput = PL_reginput;
95b24440 2708
5d9a96ca
DM
2709 /* these variables are NOT saved during a recusive RFEGMATCH: */
2710 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2711 bool result; /* return value of S_regmatch */
5d9a96ca 2712 int depth = 0; /* depth of recursion */
77cb431f
DM
2713 regmatch_state *yes_state = NULL; /* state to pop to on success of
2714 subpattern */
40a82448 2715 U32 state_num;
95b24440
DM
2716
2717#ifdef DEBUGGING
e68ec53f 2718 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2719 PL_regindent++;
2720#endif
2721
5d9a96ca
DM
2722 /* on first ever call to regmatch, allocate first slab */
2723 if (!PL_regmatch_slab) {
2724 Newx(PL_regmatch_slab, 1, regmatch_slab);
2725 PL_regmatch_slab->prev = NULL;
2726 PL_regmatch_slab->next = NULL;
86545054 2727 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2728 }
2729
2730 /* remember current high-water mark for exit */
2731 /* XXX this should be done with SAVE* instead */
2732 orig_slab = PL_regmatch_slab;
2733 orig_state = PL_regmatch_state;
2734
2735 /* grab next free state slot */
2736 st = ++PL_regmatch_state;
86545054 2737 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2738 st = PL_regmatch_state = S_push_slab(aTHX);
2739
2740 st->minmod = 0;
2741 st->sw = 0;
2742 st->logical = 0;
5d9a96ca 2743 st->cc = NULL;
d6a28714
JH
2744 /* Note that nextchr is a byte even in UTF */
2745 nextchr = UCHARAT(locinput);
2746 scan = prog;
2747 while (scan != NULL) {
8ba1375e 2748
a3621e74 2749 DEBUG_EXECUTE_r( {
6136c704 2750 SV * const prop = sv_newmortal();
07be1b83 2751 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2752 regprop(rex, prop, scan);
07be1b83
YO
2753
2754 PerlIO_printf(Perl_debug_log,
2755 "%3"IVdf":%*s%s(%"IVdf")\n",
2756 (IV)(scan - rex->program), PL_regindent*2, "",
2757 SvPVX_const(prop),
2758 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2759 });
d6a28714
JH
2760
2761 next = scan + NEXT_OFF(scan);
2762 if (next == scan)
2763 next = NULL;
40a82448 2764 state_num = OP(scan);
d6a28714 2765
40a82448
DM
2766 reenter_switch:
2767 switch (state_num) {
d6a28714 2768 case BOL:
7fba1cd6 2769 if (locinput == PL_bostr)
d6a28714 2770 {
3b0527fe 2771 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2772 break;
2773 }
d6a28714
JH
2774 sayNO;
2775 case MBOL:
12d33761
HS
2776 if (locinput == PL_bostr ||
2777 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2778 {
b8c5462f
JH
2779 break;
2780 }
d6a28714
JH
2781 sayNO;
2782 case SBOL:
c2a73568 2783 if (locinput == PL_bostr)
b8c5462f 2784 break;
d6a28714
JH
2785 sayNO;
2786 case GPOS:
3b0527fe 2787 if (locinput == reginfo->ganch)
d6a28714
JH
2788 break;
2789 sayNO;
2790 case EOL:
d6a28714
JH
2791 goto seol;
2792 case MEOL:
d6a28714 2793 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2794 sayNO;
b8c5462f 2795 break;
d6a28714
JH
2796 case SEOL:
2797 seol:
2798 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2799 sayNO;
d6a28714 2800 if (PL_regeol - locinput > 1)
b8c5462f 2801 sayNO;
b8c5462f 2802 break;
d6a28714
JH
2803 case EOS:
2804 if (PL_regeol != locinput)
b8c5462f 2805 sayNO;
d6a28714 2806 break;
ffc61ed2 2807 case SANY:
d6a28714 2808 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2809 sayNO;
f33976b4
DB
2810 if (do_utf8) {
2811 locinput += PL_utf8skip[nextchr];
2812 if (locinput > PL_regeol)
2813 sayNO;
2814 nextchr = UCHARAT(locinput);
2815 }
2816 else
2817 nextchr = UCHARAT(++locinput);
2818 break;
2819 case CANY:
2820 if (!nextchr && locinput >= PL_regeol)
2821 sayNO;
b8c5462f 2822 nextchr = UCHARAT(++locinput);
a0d0e21e 2823 break;
ffc61ed2 2824 case REG_ANY:
1aa99e6b
IH
2825 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2826 sayNO;
2827 if (do_utf8) {
b8c5462f 2828 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2829 if (locinput > PL_regeol)
2830 sayNO;
a0ed51b3 2831 nextchr = UCHARAT(locinput);
a0ed51b3 2832 }
1aa99e6b
IH
2833 else
2834 nextchr = UCHARAT(++locinput);
a0ed51b3 2835 break;
166ba7cd
DM
2836
2837#undef ST
2838#define ST st->u.trie
2839
5b47454d 2840 case TRIE:
3dab1dad 2841 {
07be1b83 2842 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2843 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2844 trie_type = do_utf8 ?
2845 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2846 : trie_plain;
2847
2848 /* what trie are we using right now */
be8e71aa 2849 reg_trie_data * const trie
3dab1dad
YO
2850 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2851 U32 state = trie->startstate;
166ba7cd
DM
2852
2853 U8 *uc = ( U8* )locinput;
2854 U16 charid = 0;
2855 U32 base = 0;
2856 UV uvc = 0;
2857 STRLEN len = 0;
2858 STRLEN foldlen = 0;
2859 U8 *uscan = (U8*)NULL;
2860 STRLEN bufflen=0;
2861 SV *sv_accept_buff = NULL;
2862 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2863
2864 ST.accepted = 0; /* how many accepting states we have seen */
2865 ST.B = next;
2866#ifdef DEBUGGING
2867 ST.me = scan;
2868#endif
3dab1dad
YO
2869
2870 if (trie->bitmap && trie_type != trie_utf8_fold &&
2871 !TRIE_BITMAP_TEST(trie,*locinput)
2872 ) {
2873 if (trie->states[ state ].wordnum) {
2874 DEBUG_EXECUTE_r(
2875 PerlIO_printf(Perl_debug_log,
2876 "%*s %smatched empty string...%s\n",
2877 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2878 );
2879 break;
2880 } else {
2881 DEBUG_EXECUTE_r(
2882 PerlIO_printf(Perl_debug_log,
2883 "%*s %sfailed to match start class...%s\n",
2884 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2885 );
2886 sayNO_SILENT;
2887 }
2888 }
166ba7cd 2889
07be1b83
YO
2890 /*
2891 traverse the TRIE keeping track of all accepting states
2892 we transition through until we get to a failing node.
2893 */
2894
a3621e74
YO
2895 while ( state && uc <= (U8*)PL_regeol ) {
2896
5b47454d 2897 if (trie->states[ state ].wordnum) {
166ba7cd 2898 if (!ST.accepted ) {
5b47454d
DM
2899 ENTER;
2900 SAVETMPS;
2901 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2902 sv_accept_buff=newSV(bufflen *
2903 sizeof(reg_trie_accepted) - 1);
2904 SvCUR_set(sv_accept_buff,
2905 sizeof(reg_trie_accepted));
2906 SvPOK_on(sv_accept_buff);
2907 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2908 SAVETMPS;
2909 ST.accept_buff =
5b47454d
DM
2910 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2911 }
2912 else {
166ba7cd 2913 if (ST.accepted >= bufflen) {
5b47454d 2914 bufflen *= 2;
166ba7cd 2915 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2916 SvGROW(sv_accept_buff,
2917 bufflen * sizeof(reg_trie_accepted));
2918 }
2919 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2920 + sizeof(reg_trie_accepted));
2921 }
166ba7cd
DM
2922 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2923 ST.accept_buff[ST.accepted].endpos = uc;
2924 ++ST.accepted;
5b47454d 2925 }
a3621e74
YO
2926
2927 base = trie->states[ state ].trans.base;
2928
07be1b83
YO
2929 DEBUG_TRIE_EXECUTE_r({
2930 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2931 PerlIO_printf( Perl_debug_log,
e4584336 2932 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2933 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2934 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2935 });
a3621e74
YO
2936
2937 if ( base ) {
5b47454d 2938 switch (trie_type) {
3dab1dad 2939 case trie_utf8_fold:
a3621e74
YO
2940 if ( foldlen>0 ) {
2941 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2942 foldlen -= len;
2943 uscan += len;
2944 len=0;
2945 } else {
2946 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2947 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2948 foldlen -= UNISKIP( uvc );
2949 uscan = foldbuf + UNISKIP( uvc );
2950 }
5b47454d
DM
2951 break;
2952 case trie_utf8:
2953 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2954 &len, uniflags );
2955 break;
2956 case trie_plain:
e4584336 2957 uvc = (UV)*uc;
a3621e74
YO
2958 len = 1;
2959 }
2960
5b47454d
DM
2961 if (uvc < 256) {
2962 charid = trie->charmap[ uvc ];
2963 }
2964 else {
2965 charid = 0;
2966 if (trie->widecharmap) {
3dab1dad 2967 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2968 (char*)&uvc, sizeof(UV), 0);
2969 if (svpp)
2970 charid = (U16)SvIV(*svpp);
2971 }
2972 }
a3621e74 2973
5b47454d
DM
2974 if (charid &&
2975 (base + charid > trie->uniquecharcount )
2976 && (base + charid - 1 - trie->uniquecharcount
2977 < trie->lasttrans)
2978 && trie->trans[base + charid - 1 -
2979 trie->uniquecharcount].check == state)
2980 {
2981 state = trie->trans[base + charid - 1 -
2982 trie->uniquecharcount ].next;
2983 }
2984 else {
2985 state = 0;
2986 }
2987 uc += len;
2988
2989 }
2990 else {
a3621e74
YO
2991 state = 0;
2992 }
2993 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2994 PerlIO_printf( Perl_debug_log,
2995 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2996 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2997 );
2998 }
166ba7cd 2999 if (!ST.accepted )
a3621e74 3000 sayNO;
a3621e74 3001
166ba7cd
DM
3002 DEBUG_EXECUTE_r(
3003 PerlIO_printf( Perl_debug_log,
3004 "%*s %sgot %"IVdf" possible matches%s\n",
3005 REPORT_CODE_OFF + PL_regindent * 2, "",
3006 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3007 );
3008 }
3009
3010 /* FALL THROUGH */
3011
3012 case TRIE_next_fail: /* we failed - try next alterative */
3013
3014 if ( ST.accepted == 1 ) {
3015 /* only one choice left - just continue */
3016 DEBUG_EXECUTE_r({
3017 reg_trie_data * const trie
3018 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3019 SV ** const tmp = RX_DEBUG(reginfo->prog)
3020 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
3021 : NULL;
3022 PerlIO_printf( Perl_debug_log,
3023 "%*s %sonly one match left: #%d <%s>%s\n",
3024 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3025 ST.accept_buff[ 0 ].wordnum,
3026 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3027 PL_colors[5] );
3028 });
3029 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3030 /* in this case we free tmps/leave before we call regmatch
3031 as we wont be using accept_buff again. */
3032 FREETMPS;
3033 LEAVE;
3034 locinput = PL_reginput;
3035 nextchr = UCHARAT(locinput);
3036 scan = ST.B;
3037 continue; /* execute rest of RE */
3038 }
3039
3040 if (!ST.accepted-- ) {
3041 FREETMPS;
3042 LEAVE;
3043 sayNO;
3044 }
3045
a3621e74 3046 /*
166ba7cd
DM
3047 There are at least two accepting states left. Presumably
3048 the number of accepting states is going to be low,
3049 typically two. So we simply scan through to find the one
3050 with lowest wordnum. Once we find it, we swap the last
3051 state into its place and decrement the size. We then try to
3052 match the rest of the pattern at the point where the word
3053 ends. If we succeed, control just continues along the
3054 regex; if we fail we return here to try the next accepting
3055 state
3056 */
a3621e74 3057
166ba7cd
DM
3058 {
3059 U32 best = 0;
3060 U32 cur;
3061 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3062 DEBUG_TRIE_EXECUTE_r(
f2278c82 3063 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
3064 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3065 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
3066 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3067 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3068 );
3069
3070 if (ST.accept_buff[cur].wordnum <
3071 ST.accept_buff[best].wordnum)
3072 best = cur;
a3621e74 3073 }
166ba7cd
DM
3074
3075 DEBUG_EXECUTE_r({
3076 reg_trie_data * const trie
3077 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3078 SV ** const tmp = RX_DEBUG(reginfo->prog)
3079 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
3080 : NULL;
3081 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3082 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3083 ST.accept_buff[best].wordnum,
3084 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3085 PL_colors[5] );
3086 });
3087
3088 if ( best<ST.accepted ) {
3089 reg_trie_accepted tmp = ST.accept_buff[ best ];
3090 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3091 ST.accept_buff[ ST.accepted ] = tmp;
3092 best = ST.accepted;
a3621e74 3093 }
166ba7cd
DM
3094 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3095 }
3096 PUSH_STATE_GOTO(TRIE_next, ST.B);
3097 /* NOTREACHED */
3098
3099#undef ST
3100
95b24440
DM
3101 case EXACT: {
3102 char *s = STRING(scan);
5d9a96ca 3103 st->ln = STR_LEN(scan);
eb160463 3104 if (do_utf8 != UTF) {
bc517b45 3105 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3106 char *l = locinput;
be8e71aa 3107 const char * const e = s + st->ln;
a72c7584 3108
5ff6fc6d
JH
3109 if (do_utf8) {
3110 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3111 while (s < e) {
a3b680e6 3112 STRLEN ulen;
1aa99e6b 3113 if (l >= PL_regeol)
5ff6fc6d
JH
3114 sayNO;
3115 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3116 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3117 uniflags))
5ff6fc6d 3118 sayNO;
bc517b45 3119 l += ulen;
5ff6fc6d 3120 s ++;
1aa99e6b 3121 }
5ff6fc6d
JH
3122 }
3123 else {
3124 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3125 while (s < e) {
a3b680e6 3126 STRLEN ulen;
1aa99e6b
IH
3127 if (l >= PL_regeol)
3128 sayNO;
5ff6fc6d 3129 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3130 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3131 uniflags))
1aa99e6b 3132 sayNO;
bc517b45 3133 s += ulen;
a72c7584 3134 l ++;
1aa99e6b 3135 }
5ff6fc6d 3136 }
1aa99e6b
IH
3137 locinput = l;
3138 nextchr = UCHARAT(locinput);
3139 break;
3140 }
bc517b45 3141 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3142 /* Inline the first character, for speed. */
3143 if (UCHARAT(s) != nextchr)
3144 sayNO;
5d9a96ca 3145 if (PL_regeol - locinput < st->ln)
d6a28714 3146 sayNO;
5d9a96ca 3147 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3148 sayNO;
5d9a96ca 3149 locinput += st->ln;
d6a28714
JH
3150 nextchr = UCHARAT(locinput);
3151 break;
95b24440 3152 }
d6a28714 3153 case EXACTFL:
b8c5462f
JH
3154 PL_reg_flags |= RF_tainted;
3155 /* FALL THROUGH */
95b24440 3156 case EXACTF: {
be8e71aa 3157 char * const s = STRING(scan);
5d9a96ca 3158 st->ln = STR_LEN(scan);
d6a28714 3159
d07ddd77
JH
3160 if (do_utf8 || UTF) {
3161 /* Either target or the pattern are utf8. */
be8e71aa 3162 const char * const l = locinput;
d07ddd77 3163 char *e = PL_regeol;
bc517b45 3164
5d9a96ca 3165 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3166 l, &e, 0, do_utf8)) {
5486206c
JH
3167 /* One more case for the sharp s:
3168 * pack("U0U*", 0xDF) =~ /ss/i,
3169 * the 0xC3 0x9F are the UTF-8
3170 * byte sequence for the U+00DF. */
3171 if (!(do_utf8 &&
3172 toLOWER(s[0]) == 's' &&
5d9a96ca 3173 st->ln >= 2 &&
5486206c
JH
3174 toLOWER(s[1]) == 's' &&
3175 (U8)l[0] == 0xC3 &&
3176 e - l >= 2 &&
3177 (U8)l[1] == 0x9F))
3178 sayNO;
3179 }
d07ddd77
JH
3180 locinput = e;
3181 nextchr = UCHARAT(locinput);
3182 break;
a0ed51b3 3183 }
d6a28714 3184
bc517b45
JH
3185 /* Neither the target and the pattern are utf8. */
3186
d6a28714
JH
3187 /* Inline the first character, for speed. */
3188 if (UCHARAT(s) != nextchr &&
3189 UCHARAT(s) != ((OP(scan) == EXACTF)
3190 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3191 sayNO;
5d9a96ca 3192 if (PL_regeol - locinput < st->ln)
b8c5462f 3193 sayNO;
5d9a96ca
DM
3194 if (st->ln > 1 && (OP(scan) == EXACTF
3195 ? ibcmp(s, locinput, st->ln)
3196 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3197 sayNO;
5d9a96ca 3198 locinput += st->ln;
d6a28714 3199 nextchr = UCHARAT(locinput);
a0d0e21e 3200 break;
95b24440 3201 }
d6a28714 3202 case ANYOF:
ffc61ed2 3203 if (do_utf8) {
9e55ce06
JH
3204 STRLEN inclasslen = PL_regeol - locinput;
3205
32fc9b6a 3206 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3207 sayNO_ANYOF;
ffc61ed2
JH
3208 if (locinput >= PL_regeol)
3209 sayNO;
0f0076b4 3210 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3211 nextchr = UCHARAT(locinput);
e0f9d4a8 3212 break;
ffc61ed2
JH
3213 }
3214 else {
3215 if (nextchr < 0)
3216 nextchr = UCHARAT(locinput);
32fc9b6a 3217 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3218 sayNO_ANYOF;
ffc61ed2
JH
3219 if (!nextchr && locinput >= PL_regeol)
3220 sayNO;
3221 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3222 break;
3223 }
3224 no_anyof:
3225 /* If we might have the case of the German sharp s
3226 * in a casefolding Unicode character class. */
3227
ebc501f0
JH
3228 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3229 locinput += SHARP_S_SKIP;
e0f9d4a8 3230 nextchr = UCHARAT(locinput);
ffc61ed2 3231 }
e0f9d4a8
JH
3232 else
3233 sayNO;
b8c5462f 3234 break;
d6a28714 3235 case ALNUML:
b8c5462f
JH
3236 PL_reg_flags |= RF_tainted;
3237 /* FALL THROUGH */
d6a28714 3238 case ALNUM:
b8c5462f 3239 if (!nextchr)
4633a7c4 3240 sayNO;
ffc61ed2 3241 if (do_utf8) {
1a4fad37 3242 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3243 if (!(OP(scan) == ALNUM
bb7a0f54 3244 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3245 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3246 {
3247 sayNO;
a0ed51b3 3248 }
b8c5462f 3249 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3250 nextchr = UCHARAT(locinput);
3251 break;
3252 }
ffc61ed2 3253 if (!(OP(scan) == ALNUM
d6a28714 3254 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3255 sayNO;
b8c5462f 3256 nextchr = UCHARAT(++locinput);
a0d0e21e 3257 break;
d6a28714 3258 case NALNUML:
b8c5462f
JH
3259 PL_reg_flags |= RF_tainted;
3260 /* FALL THROUGH */
d6a28714
JH
3261 case NALNUM:
3262 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3263 sayNO;
ffc61ed2 3264 if (do_utf8) {
1a4fad37 3265 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3266 if (OP(scan) == NALNUM
bb7a0f54 3267 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3268 : isALNUM_LC_utf8((U8*)locinput))
3269 {
b8c5462f 3270 sayNO;
d6a28714 3271 }
b8c5462f
JH
3272 locinput += PL_utf8skip[nextchr];
3273 nextchr = UCHARAT(locinput);
3274 break;
3275 }
ffc61ed2 3276 if (OP(scan) == NALNUM
d6a28714 3277 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3278 sayNO;
76e3520e 3279 nextchr = UCHARAT(++locinput);
a0d0e21e 3280 break;
d6a28714
JH
3281 case BOUNDL:
3282 case NBOUNDL:
3280af22 3283 PL_reg_flags |= RF_tainted;
bbce6d69 3284 /* FALL THROUGH */
d6a28714
JH
3285 case BOUND:
3286 case NBOUND:
3287 /* was last char in word? */
ffc61ed2 3288 if (do_utf8) {
12d33761 3289 if (locinput == PL_bostr)
5d9a96ca 3290 st->ln = '\n';
ffc61ed2 3291 else {
a3b680e6 3292 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3293
4ad0818d 3294 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3295 }
3296 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3297 st->ln = isALNUM_uni(st->ln);
1a4fad37 3298 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3299 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3300 }
3301 else {
5d9a96ca 3302 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3303 n = isALNUM_LC_utf8((U8*)locinput);
3304 }
a0ed51b3 3305 }
d6a28714 3306 else {
5d9a96ca 3307 st->ln = (locinput != PL_bostr) ?
12d33761 3308 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3309 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3310 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3311 n = isALNUM(nextchr);
3312 }
3313 else {
5d9a96ca 3314 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3315 n = isALNUM_LC(nextchr);
3316 }
d6a28714 3317 }
5d9a96ca 3318 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3319 OP(scan) == BOUNDL))
3320 sayNO;
a0ed51b3 3321 break;
d6a28714 3322 case SPACEL:
3280af22 3323 PL_reg_flags |= RF_tainted;
bbce6d69 3324 /* FALL THROUGH */
d6a28714 3325 case SPACE:
9442cb0e 3326 if (!nextchr)
4633a7c4 3327 sayNO;
1aa99e6b 3328 if (do_utf8) {
fd400ab9 3329 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3330 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3331 if (!(OP(scan) == SPACE
bb7a0f54 3332 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3333 : isSPACE_LC_utf8((U8*)locinput)))
3334 {
3335 sayNO;
3336 }
3337 locinput += PL_utf8skip[nextchr];
3338 nextchr = UCHARAT(locinput);
3339 break;
d6a28714 3340 }
ffc61ed2
JH
3341 if (!(OP(scan) == SPACE
3342 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3343 sayNO;
3344 nextchr = UCHARAT(++locinput);
3345 }
3346 else {
3347 if (!(OP(scan) == SPACE
3348 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3349 sayNO;
3350 nextchr = UCHARAT(++locinput);
a0ed51b3 3351 }
a0ed51b3 3352 break;
d6a28714 3353 case NSPACEL:
3280af22 3354 PL_reg_flags |= RF_tainted;
bbce6d69 3355 /* FALL THROUGH */
d6a28714 3356 case NSPACE:
9442cb0e 3357 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3358 sayNO;
1aa99e6b 3359 if (do_utf8) {
1a4fad37 3360 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3361 if (OP(scan) == NSPACE
bb7a0f54 3362 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3363 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3364 {
3365 sayNO;
3366 }
3367 locinput += PL_utf8skip[nextchr];
3368 nextchr = UCHARAT(locinput);
3369 break;
a0ed51b3 3370 }
ffc61ed2 3371 if (OP(scan) == NSPACE
d6a28714 3372 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3373 sayNO;
76e3520e 3374 nextchr = UCHARAT(++locinput);
a0d0e21e 3375 break;
d6a28714 3376 case DIGITL:
a0ed51b3
LW
3377 PL_reg_flags |= RF_tainted;
3378 /* FALL THROUGH */
d6a28714 3379 case DIGIT:
9442cb0e 3380 if (!nextchr)
a0ed51b3 3381 sayNO;
1aa99e6b 3382 if (do_utf8) {
1a4fad37 3383 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3384 if (!(OP(scan) == DIGIT
bb7a0f54 3385 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3386 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3387 {
a0ed51b3 3388 sayNO;
dfe13c55 3389 }
6f06b55f 3390 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3391 nextchr = UCHARAT(locinput);
3392 break;
3393 }
ffc61ed2 3394 if (!(OP(scan) == DIGIT
9442cb0e 3395 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3396 sayNO;
3397 nextchr = UCHARAT(++locinput);
3398 break;
d6a28714 3399 case NDIGITL:
b8c5462f
JH
3400 PL_reg_flags |= RF_tainted;
3401 /* FALL THROUGH */
d6a28714 3402 case NDIGIT:
9442cb0e 3403 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3404 sayNO;
1aa99e6b 3405 if (do_utf8) {
1a4fad37 3406 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3407 if (OP(scan) == NDIGIT
bb7a0f54 3408 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3409 : isDIGIT_LC_utf8((U8*)locinput))
3410 {
a0ed51b3 3411 sayNO;
9442cb0e 3412 }
6f06b55f 3413 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3414 nextchr = UCHARAT(locinput);
3415 break;
3416 }
ffc61ed2 3417 if (OP(scan) == NDIGIT
9442cb0e 3418 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3419 sayNO;
3420 nextchr = UCHARAT(++locinput);
3421 break;
3422 case CLUMP:
b7c83a7e 3423 if (locinput >= PL_regeol)
a0ed51b3 3424 sayNO;
b7c83a7e 3425 if (do_utf8) {
1a4fad37 3426 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3427 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3428 sayNO;
3429 locinput += PL_utf8skip[nextchr];
3430 while (locinput < PL_regeol &&
3431 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3432 locinput += UTF8SKIP(locinput);
3433 if (locinput > PL_regeol)
3434 sayNO;
eb08e2da
JH