This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
cflags.SH: rework the gcc warnings selection
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
163 int p;
164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
175 SSPUSHINT(p);
176 }
b1ce53c5 177/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
178 SSPUSHINT(PL_regsize);
179 SSPUSHINT(*PL_reglastparen);
a01268b5 180 SSPUSHINT(*PL_reglastcloseparen);
3280af22 181 SSPUSHPTR(PL_reginput);
41123dfd
JH
182#define REGCP_FRAME_ELEMS 2
183/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
184 * are needed for the regexp context stack bookkeeping. */
185 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 186 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 187
a0d0e21e
LW
188 return retval;
189}
190
c277df42 191/* These are needed since we do not localize EVAL nodes: */
a3621e74 192# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 193 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 194 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 195
a3621e74 196# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 197 PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 199 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 200
76e3520e 201STATIC char *
097eb12c 202S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 203{
97aff369 204 dVAR;
b1ce53c5 205 I32 i;
a0d0e21e 206 char *input;
b1ce53c5 207
a3621e74
YO
208 GET_RE_DEBUG_FLAGS_DECL;
209
b1ce53c5 210 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 211 i = SSPOPINT;
b1ce53c5
JH
212 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
213 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 214 input = (char *) SSPOPPTR;
a01268b5 215 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
216 *PL_reglastparen = SSPOPINT;
217 PL_regsize = SSPOPINT;
b1ce53c5
JH
218
219 /* Now restore the parentheses context. */
41123dfd
JH
220 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
221 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 222 I32 tmps;
097eb12c 223 U32 paren = (U32)SSPOPINT;
3280af22 224 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
225 PL_regstartp[paren] = SSPOPINT;
226 tmps = SSPOPINT;
3280af22
NIS
227 if (paren <= *PL_reglastparen)
228 PL_regendp[paren] = tmps;
a3621e74 229 DEBUG_EXECUTE_r(
c3464db5 230 PerlIO_printf(Perl_debug_log,
b900a521 231 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 232 (UV)paren, (IV)PL_regstartp[paren],
b900a521 233 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 234 (IV)PL_regendp[paren],
3280af22 235 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 236 );
a0d0e21e 237 }
a3621e74 238 DEBUG_EXECUTE_r(
bb7a0f54 239 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 240 PerlIO_printf(Perl_debug_log,
faccc32b 241 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 242 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
243 }
244 );
daf18116 245#if 1
dafc8851
JH
246 /* It would seem that the similar code in regtry()
247 * already takes care of this, and in fact it is in
248 * a better location to since this code can #if 0-ed out
249 * but the code in regtry() is needed or otherwise tests
250 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
251 * (as of patchlevel 7877) will fail. Then again,
252 * this code seems to be necessary or otherwise
253 * building DynaLoader will fail:
254 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
255 * --jhi */
bb7a0f54 256 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
257 if (i > PL_regsize)
258 PL_regstartp[i] = -1;
259 PL_regendp[i] = -1;
a0d0e21e 260 }
dafc8851 261#endif
a0d0e21e
LW
262 return input;
263}
264
02db2b7b 265#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 266
95b24440 267#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
268 if (paren) { \
269 if (n) { \
270 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
271 PL_regendp[paren] = input - PL_bostr; \
272 } \
273 else \
274 PL_regendp[paren] = -1; \
275 } \
95b24440
DM
276 REGMATCH(next, where); \
277 if (result) \
29d1e993
HS
278 sayYES; \
279 if (paren && n) \
280 PL_regendp[paren] = -1; \
281}
282
283
a687059c 284/*
e50aee73 285 * pregexec and friends
a687059c
LW
286 */
287
76234dfb 288#ifndef PERL_IN_XSUB_RE
a687059c 289/*
c277df42 290 - pregexec - match a regexp against a string
a687059c 291 */
c277df42 292I32
864dbfa3 293Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 294 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
295/* strend: pointer to null at end of string */
296/* strbeg: real beginning of string */
297/* minend: end of match must be >=minend after stringarg. */
298/* nosave: For optimizations. */
299{
300 return
9041c2e3 301 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
302 nosave ? 0 : REXEC_COPY_STR);
303}
76234dfb 304#endif
22e551b9 305
9041c2e3 306/*
cad2e5aa
JH
307 * Need to implement the following flags for reg_anch:
308 *
309 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
310 * USE_INTUIT_ML
311 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
312 * INTUIT_AUTORITATIVE_ML
313 * INTUIT_ONCE_NOML - Intuit can match in one location only.
314 * INTUIT_ONCE_ML
315 *
316 * Another flag for this function: SECOND_TIME (so that float substrs
317 * with giant delta may be not rechecked).
318 */
319
320/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
321
3f7c398e 322/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
323 Otherwise, only SvCUR(sv) is used to get strbeg. */
324
325/* XXXX We assume that strpos is strbeg unless sv. */
326
6eb5f6b9
JH
327/* XXXX Some places assume that there is a fixed substring.
328 An update may be needed if optimizer marks as "INTUITable"
329 RExen without fixed substrings. Similarly, it is assumed that
330 lengths of all the strings are no more than minlen, thus they
331 cannot come from lookahead.
332 (Or minlen should take into account lookahead.) */
333
2c2d71f5
JH
334/* A failure to find a constant substring means that there is no need to make
335 an expensive call to REx engine, thus we celebrate a failure. Similarly,
336 finding a substring too deep into the string means that less calls to
30944b6d
IZ
337 regtry() should be needed.
338
339 REx compiler's optimizer found 4 possible hints:
340 a) Anchored substring;
341 b) Fixed substring;
342 c) Whether we are anchored (beginning-of-line or \G);
343 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 344 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
345 string which does not contradict any of them.
346 */
2c2d71f5 347
6eb5f6b9
JH
348/* Most of decisions we do here should have been done at compile time.
349 The nodes of the REx which we used for the search should have been
350 deleted from the finite automaton. */
351
cad2e5aa
JH
352char *
353Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
354 char *strend, U32 flags, re_scream_pos_data *data)
355{
97aff369 356 dVAR;
b7953727 357 register I32 start_shift = 0;
cad2e5aa 358 /* Should be nonnegative! */
b7953727 359 register I32 end_shift = 0;
2c2d71f5
JH
360 register char *s;
361 register SV *check;
a1933d95 362 char *strbeg;
cad2e5aa 363 char *t;
a3b680e6 364 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 365 I32 ml_anch;
bd61b366
SS
366 register char *other_last = NULL; /* other substr checked before this */
367 char *check_at = NULL; /* check substr found at this pos */
1df70142 368 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 369#ifdef DEBUGGING
890ce7af
AL
370 const char * const i_strpos = strpos;
371 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 372#endif
a3621e74
YO
373
374 GET_RE_DEBUG_FLAGS_DECL;
375
a30b2f1f 376 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 377
b8d68ded 378 if (prog->reganch & ROPT_UTF8) {
a3621e74 379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
380 "UTF-8 regex...\n"));
381 PL_reg_flags |= RF_utf8;
382 }
383
a3621e74 384 DEBUG_EXECUTE_r({
1df70142 385 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
386 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
387 strpos;
1df70142 388 const int len = PL_reg_match_utf8 ?
bb7a0f54 389 (int)strlen(s) : strend - strpos;
2a782b5b
JH
390 if (!PL_colorset)
391 reginitcolors();
b8d68ded 392 if (PL_reg_match_utf8)
a3621e74 393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 394 "UTF-8 target...\n"));
2a782b5b 395 PerlIO_printf(Perl_debug_log,
a0288114 396 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 397 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
398 prog->precomp,
399 PL_colors[1],
400 (strlen(prog->precomp) > 60 ? "..." : ""),
401 PL_colors[0],
402 (int)(len > 60 ? 60 : len),
403 s, PL_colors[1],
404 (len > 60 ? "..." : "")
405 );
406 });
cad2e5aa 407
c344f387
JH
408 /* CHR_DIST() would be more correct here but it makes things slow. */
409 if (prog->minlen > strend - strpos) {
a3621e74 410 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 411 "String too short... [re_intuit_start]\n"));
cad2e5aa 412 goto fail;
2c2d71f5 413 }
a1933d95 414 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 415 PL_regeol = strend;
33b8afdf
JH
416 if (do_utf8) {
417 if (!prog->check_utf8 && prog->check_substr)
418 to_utf8_substr(prog);
419 check = prog->check_utf8;
420 } else {
421 if (!prog->check_substr && prog->check_utf8)
422 to_byte_substr(prog);
423 check = prog->check_substr;
424 }
425 if (check == &PL_sv_undef) {
a3621e74 426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
427 "Non-utf string cannot match utf check string\n"));
428 goto fail;
429 }
2c2d71f5 430 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
431 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
432 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 433 && !multiline ) ); /* Check after \n? */
cad2e5aa 434
7e25d62c
JH
435 if (!ml_anch) {
436 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
437 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 438 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
439 && sv && !SvROK(sv)
440 && (strpos != strbeg)) {
a3621e74 441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
442 goto fail;
443 }
444 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 445 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 446 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
447 I32 slen;
448
1aa99e6b 449 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
450 if (SvTAIL(check)) {
451 slen = SvCUR(check); /* >= 1 */
cad2e5aa 452
9041c2e3 453 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 454 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 456 goto fail_finish;
cad2e5aa
JH
457 }
458 /* Now should match s[0..slen-2] */
459 slen--;
3f7c398e 460 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 461 || (slen > 1
3f7c398e 462 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 463 report_neq:
a3621e74 464 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
465 goto fail_finish;
466 }
cad2e5aa 467 }
3f7c398e 468 else if (*SvPVX_const(check) != *s
653099ff 469 || ((slen = SvCUR(check)) > 1
3f7c398e 470 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 471 goto report_neq;
c315bfe8 472 check_at = s;
2c2d71f5 473 goto success_at_start;
7e25d62c 474 }
cad2e5aa 475 }
2c2d71f5 476 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 477 s = strpos;
2c2d71f5 478 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 479 end_shift = prog->minlen - start_shift -
653099ff 480 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 481 if (!ml_anch) {
a3b680e6 482 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 483 - (SvTAIL(check) != 0);
a3b680e6 484 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
485
486 if (end_shift < eshift)
487 end_shift = eshift;
488 }
cad2e5aa 489 }
2c2d71f5 490 else { /* Can match at random position */
cad2e5aa
JH
491 ml_anch = 0;
492 s = strpos;
2c2d71f5
JH
493 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
494 /* Should be nonnegative! */
495 end_shift = prog->minlen - start_shift -
653099ff 496 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
497 }
498
2c2d71f5 499#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 500 if (end_shift < 0)
6bbae5e6 501 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
502#endif
503
2c2d71f5
JH
504 restart:
505 /* Find a possible match in the region s..strend by looking for
506 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 507 if (flags & REXEC_SCREAM) {
cad2e5aa 508 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 509 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 510
2c2d71f5
JH
511 if (PL_screamfirst[BmRARE(check)] >= 0
512 || ( BmRARE(check) == '\n'
513 && (BmPREVIOUS(check) == SvCUR(check) - 1)
514 && SvTAIL(check) ))
9041c2e3 515 s = screaminstr(sv, check,
2c2d71f5 516 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 517 else
2c2d71f5 518 goto fail_finish;
4addbd3b
HS
519 /* we may be pointing at the wrong string */
520 if (s && RX_MATCH_COPIED(prog))
3f7c398e 521 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
522 if (data)
523 *data->scream_olds = s;
524 }
f33976b4 525 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
526 s = fbm_instr((U8*)(s + start_shift),
527 (U8*)(strend - end_shift),
7fba1cd6 528 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 529 else
1aa99e6b
IH
530 s = fbm_instr(HOP3(s, start_shift, strend),
531 HOP3(strend, -end_shift, strbeg),
7fba1cd6 532 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
533
534 /* Update the count-of-usability, remove useless subpatterns,
535 unshift s. */
2c2d71f5 536
a0288114 537 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 538 (s ? "Found" : "Did not find"),
33b8afdf 539 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 540 PL_colors[0],
7b0972df 541 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 542 SvPVX_const(check),
2c2d71f5
JH
543 PL_colors[1], (SvTAIL(check) ? "$" : ""),
544 (s ? " at offset " : "...\n") ) );
545
546 if (!s)
547 goto fail_finish;
548
6eb5f6b9
JH
549 check_at = s;
550
2c2d71f5 551 /* Finish the diagnostic message */
a3621e74 552 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
553
554 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
555 Start with the other substr.
556 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 557 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
558 *always* match. Probably should be marked during compile...
559 Probably it is right to do no SCREAM here...
560 */
561
33b8afdf 562 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 563 /* Take into account the "other" substring. */
2c2d71f5
JH
564 /* XXXX May be hopelessly wrong for UTF... */
565 if (!other_last)
6eb5f6b9 566 other_last = strpos;
33b8afdf 567 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
568 do_other_anchored:
569 {
890ce7af
AL
570 char * const last = HOP3c(s, -start_shift, strbeg);
571 char *last1, *last2;
be8e71aa 572 char * const saved_s = s;
33b8afdf 573 SV* must;
2c2d71f5 574
2c2d71f5
JH
575 t = s - prog->check_offset_max;
576 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 577 && (!do_utf8
0ce71af7 578 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 579 && t > strpos)))
6f207bd3 580 NOOP;
2c2d71f5
JH
581 else
582 t = strpos;
1aa99e6b 583 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
584 if (t < other_last) /* These positions already checked */
585 t = other_last;
1aa99e6b 586 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
587 if (last < last1)
588 last1 = last;
589 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
590 /* On end-of-str: see comment below. */
33b8afdf
JH
591 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
592 if (must == &PL_sv_undef) {
593 s = (char*)NULL;
a3621e74 594 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
595 }
596 else
597 s = fbm_instr(
598 (unsigned char*)t,
599 HOP3(HOP3(last1, prog->anchored_offset, strend)
600 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
601 must,
7fba1cd6 602 multiline ? FBMrf_MULTILINE : 0
33b8afdf 603 );
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 605 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
606 (s ? "Found" : "Contradicts"),
607 PL_colors[0],
33b8afdf
JH
608 (int)(SvCUR(must)
609 - (SvTAIL(must)!=0)),
3f7c398e 610 SvPVX_const(must),
33b8afdf 611 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
612 if (!s) {
613 if (last1 >= last2) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
615 ", giving up...\n"));
616 goto fail_finish;
617 }
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 619 ", trying floating at offset %ld...\n",
be8e71aa 620 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
621 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
622 s = HOP3c(last, 1, strend);
2c2d71f5
JH
623 goto restart;
624 }
625 else {
a3621e74 626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 627 (long)(s - i_strpos)));
1aa99e6b
IH
628 t = HOP3c(s, -prog->anchored_offset, strbeg);
629 other_last = HOP3c(s, 1, strend);
be8e71aa 630 s = saved_s;
2c2d71f5
JH
631 if (t == strpos)
632 goto try_at_start;
2c2d71f5
JH
633 goto try_at_offset;
634 }
30944b6d 635 }
2c2d71f5
JH
636 }
637 else { /* Take into account the floating substring. */
33b8afdf 638 char *last, *last1;
be8e71aa 639 char * const saved_s = s;
33b8afdf
JH
640 SV* must;
641
642 t = HOP3c(s, -start_shift, strbeg);
643 last1 = last =
644 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
645 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
646 last = HOP3c(t, prog->float_max_offset, strend);
647 s = HOP3c(t, prog->float_min_offset, strend);
648 if (s < other_last)
649 s = other_last;
2c2d71f5 650 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
651 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
652 /* fbm_instr() takes into account exact value of end-of-str
653 if the check is SvTAIL(ed). Since false positives are OK,
654 and end-of-str is not later than strend we are OK. */
655 if (must == &PL_sv_undef) {
656 s = (char*)NULL;
a3621e74 657 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
658 }
659 else
2c2d71f5 660 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
661 (unsigned char*)last + SvCUR(must)
662 - (SvTAIL(must)!=0),
7fba1cd6 663 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
665 (s ? "Found" : "Contradicts"),
666 PL_colors[0],
667 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 668 SvPVX_const(must),
33b8afdf
JH
669 PL_colors[1], (SvTAIL(must) ? "$" : "")));
670 if (!s) {
671 if (last1 == last) {
a3621e74 672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
673 ", giving up...\n"));
674 goto fail_finish;
2c2d71f5 675 }
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 677 ", trying anchored starting at offset %ld...\n",
be8e71aa 678 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
679 other_last = last;
680 s = HOP3c(t, 1, strend);
681 goto restart;
682 }
683 else {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
685 (long)(s - i_strpos)));
686 other_last = s; /* Fix this later. --Hugo */
be8e71aa 687 s = saved_s;
33b8afdf
JH
688 if (t == strpos)
689 goto try_at_start;
690 goto try_at_offset;
691 }
2c2d71f5 692 }
cad2e5aa 693 }
2c2d71f5
JH
694
695 t = s - prog->check_offset_max;
2c2d71f5 696 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 697 && (!do_utf8
0ce71af7 698 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 699 && t > strpos))) {
2c2d71f5
JH
700 /* Fixed substring is found far enough so that the match
701 cannot start at strpos. */
702 try_at_offset:
cad2e5aa 703 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
704 /* Eventually fbm_*() should handle this, but often
705 anchored_offset is not 0, so this check will not be wasted. */
706 /* XXXX In the code below we prefer to look for "^" even in
707 presence of anchored substrings. And we search even
708 beyond the found float position. These pessimizations
709 are historical artefacts only. */
710 find_anchor:
2c2d71f5 711 while (t < strend - prog->minlen) {
cad2e5aa 712 if (*t == '\n') {
4ee3650e 713 if (t < check_at - prog->check_offset_min) {
33b8afdf 714 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
715 /* Since we moved from the found position,
716 we definitely contradict the found anchored
30944b6d
IZ
717 substr. Due to the above check we do not
718 contradict "check" substr.
719 Thus we can arrive here only if check substr
720 is float. Redo checking for "other"=="fixed".
721 */
9041c2e3 722 strpos = t + 1;
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 724 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
725 goto do_other_anchored;
726 }
4ee3650e
GS
727 /* We don't contradict the found floating substring. */
728 /* XXXX Why not check for STCLASS? */
cad2e5aa 729 s = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
732 goto set_useful;
733 }
4ee3650e
GS
734 /* Position contradicts check-string */
735 /* XXXX probably better to look for check-string
736 than for "\n", so one should lower the limit for t? */
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 739 other_last = strpos = s = t + 1;
cad2e5aa
JH
740 goto restart;
741 }
742 t++;
743 }
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 745 PL_colors[0], PL_colors[1]));
2c2d71f5 746 goto fail_finish;
cad2e5aa 747 }
f5952150 748 else {
a3621e74 749 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 750 PL_colors[0], PL_colors[1]));
f5952150 751 }
cad2e5aa
JH
752 s = t;
753 set_useful:
33b8afdf 754 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
755 }
756 else {
f5952150 757 /* The found string does not prohibit matching at strpos,
2c2d71f5 758 - no optimization of calling REx engine can be performed,
f5952150
GS
759 unless it was an MBOL and we are not after MBOL,
760 or a future STCLASS check will fail this. */
2c2d71f5
JH
761 try_at_start:
762 /* Even in this situation we may use MBOL flag if strpos is offset
763 wrt the start of the string. */
05b4157f 764 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 765 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
766 /* May be due to an implicit anchor of m{.*foo} */
767 && !(prog->reganch & ROPT_IMPLICIT))
768 {
cad2e5aa
JH
769 t = strpos;
770 goto find_anchor;
771 }
a3621e74 772 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 773 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 774 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 775 );
2c2d71f5 776 success_at_start:
30944b6d 777 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
778 && (do_utf8 ? (
779 prog->check_utf8 /* Could be deleted already */
780 && --BmUSEFUL(prog->check_utf8) < 0
781 && (prog->check_utf8 == prog->float_utf8)
782 ) : (
783 prog->check_substr /* Could be deleted already */
784 && --BmUSEFUL(prog->check_substr) < 0
785 && (prog->check_substr == prog->float_substr)
786 )))
66e933ab 787 {
cad2e5aa 788 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
790 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
791 if (do_utf8 ? prog->check_substr : prog->check_utf8)
792 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
793 prog->check_substr = prog->check_utf8 = NULL; /* disable */
794 prog->float_substr = prog->float_utf8 = NULL; /* clear */
795 check = NULL; /* abort */
cad2e5aa 796 s = strpos;
3cf5c195
IZ
797 /* XXXX This is a remnant of the old implementation. It
798 looks wasteful, since now INTUIT can use many
6eb5f6b9 799 other heuristics. */
cad2e5aa
JH
800 prog->reganch &= ~RE_USE_INTUIT;
801 }
802 else
803 s = strpos;
804 }
805
6eb5f6b9
JH
806 /* Last resort... */
807 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 808 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
809 /* minlen == 0 is possible if regstclass is \b or \B,
810 and the fixed substr is ''$.
811 Since minlen is already taken into account, s+1 is before strend;
812 accidentally, minlen >= 1 guaranties no false positives at s + 1
813 even for \b or \B. But (minlen? 1 : 0) below assumes that
814 regstclass does not come from lookahead... */
815 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
816 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 817 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 818 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 819 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 820 : 1);
07be1b83 821 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 822 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 823 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
824 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
825 cl_l, strend)
826 : strend);
07be1b83
YO
827 /*if (OP(prog->regstclass) == TRIE)
828 endpos++;*/
6eb5f6b9 829 t = s;
3b0527fe 830 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
831 if (!s) {
832#ifdef DEBUGGING
cbbf8932 833 const char *what = NULL;
6eb5f6b9
JH
834#endif
835 if (endpos == strend) {
a3621e74 836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
837 "Could not match STCLASS...\n") );
838 goto fail;
839 }
a3621e74 840 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 841 "This position contradicts STCLASS...\n") );
653099ff
GS
842 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
843 goto fail;
6eb5f6b9 844 /* Contradict one of substrings */
33b8afdf
JH
845 if (prog->anchored_substr || prog->anchored_utf8) {
846 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 847 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 848 hop_and_restart:
1aa99e6b 849 s = HOP3c(t, 1, strend);
66e933ab
GS
850 if (s + start_shift + end_shift > strend) {
851 /* XXXX Should be taken into account earlier? */
a3621e74 852 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
853 "Could not match STCLASS...\n") );
854 goto fail;
855 }
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
860 what, (long)(s + start_shift - i_strpos)) );
861 goto restart;
862 }
66e933ab 863 /* Have both, check_string is floating */
6eb5f6b9
JH
864 if (t + start_shift >= check_at) /* Contradicts floating=check */
865 goto retry_floating_check;
866 /* Recheck anchored substring, but not floating... */
9041c2e3 867 s = check_at;
5e39e1e5
HS
868 if (!check)
869 goto giveup;
a3621e74 870 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 871 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
872 (long)(other_last - i_strpos)) );
873 goto do_other_anchored;
874 }
60e71179
GS
875 /* Another way we could have checked stclass at the
876 current position only: */
877 if (ml_anch) {
878 s = t = t + 1;
5e39e1e5
HS
879 if (!check)
880 goto giveup;
a3621e74 881 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 882 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 883 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 884 goto try_at_offset;
66e933ab 885 }
33b8afdf 886 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 887 goto fail;
6eb5f6b9
JH
888 /* Check is floating subtring. */
889 retry_floating_check:
890 t = check_at - start_shift;
a3621e74 891 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
892 goto hop_and_restart;
893 }
b7953727 894 if (t != s) {
a3621e74 895 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 896 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
897 (long)(t - i_strpos), (long)(s - i_strpos))
898 );
899 }
900 else {
a3621e74 901 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
902 "Does not contradict STCLASS...\n");
903 );
904 }
6eb5f6b9 905 }
5e39e1e5 906 giveup:
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
908 PL_colors[4], (check ? "Guessed" : "Giving up"),
909 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 910 return s;
2c2d71f5
JH
911
912 fail_finish: /* Substring not found */
33b8afdf
JH
913 if (prog->check_substr || prog->check_utf8) /* could be removed already */
914 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 915 fail:
a3621e74 916 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 917 PL_colors[4], PL_colors[5]));
bd61b366 918 return NULL;
cad2e5aa 919}
9661b544 920
6eb5f6b9 921/* We know what class REx starts with. Try to find this position... */
3b0527fe 922/* if reginfo is NULL, its a dryrun */
07be1b83
YO
923/* annoyingly all the vars in this routine have different names from their counterparts
924 in regmatch. /grrr */
3b0527fe 925
3c3eec57 926STATIC char *
07be1b83
YO
927S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
928 const char *strend, const regmatch_info *reginfo)
a687059c 929{
27da23d5 930 dVAR;
1df70142 931 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 932 char *m;
d8093b23 933 STRLEN ln;
5dab1207 934 STRLEN lnc;
078c425b 935 register STRLEN uskip;
d8093b23
G
936 unsigned int c1;
937 unsigned int c2;
6eb5f6b9
JH
938 char *e;
939 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 940 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 941
6eb5f6b9
JH
942 /* We know what class it must start with. */
943 switch (OP(c)) {
6eb5f6b9 944 case ANYOF:
388cc4de 945 if (do_utf8) {
078c425b 946 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
947 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
948 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
949 reginclass(prog, c, (U8*)s, 0, do_utf8) :
950 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 951 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
952 goto got_it;
953 else
954 tmp = doevery;
955 }
956 else
957 tmp = 1;
078c425b 958 s += uskip;
388cc4de
HS
959 }
960 }
961 else {
962 while (s < strend) {
963 STRLEN skip = 1;
964
32fc9b6a 965 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
966 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
967 /* The assignment of 2 is intentional:
968 * for the folded sharp s, the skip is 2. */
969 (skip = SHARP_S_SKIP))) {
3b0527fe 970 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
971 goto got_it;
972 else
973 tmp = doevery;
974 }
975 else
976 tmp = 1;
977 s += skip;
978 }
a0d0e21e 979 }
6eb5f6b9 980 break;
f33976b4
DB
981 case CANY:
982 while (s < strend) {
3b0527fe 983 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
984 goto got_it;
985 else
986 tmp = doevery;
987 s++;
988 }
989 break;
6eb5f6b9 990 case EXACTF:
5dab1207
NIS
991 m = STRING(c);
992 ln = STR_LEN(c); /* length to match in octets/bytes */
993 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 994 if (UTF) {
a2a2844f 995 STRLEN ulen1, ulen2;
5dab1207 996 U8 *sm = (U8 *) m;
89ebb4a3
JH
997 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
998 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 999 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1000
1001 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1002 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1003
89ebb4a3 1004 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1005 0, uniflags);
89ebb4a3 1006 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1007 0, uniflags);
5dab1207
NIS
1008 lnc = 0;
1009 while (sm < ((U8 *) m + ln)) {
1010 lnc++;
1011 sm += UTF8SKIP(sm);
1012 }
1aa99e6b
IH
1013 }
1014 else {
1015 c1 = *(U8*)m;
1016 c2 = PL_fold[c1];
1017 }
6eb5f6b9
JH
1018 goto do_exactf;
1019 case EXACTFL:
5dab1207
NIS
1020 m = STRING(c);
1021 ln = STR_LEN(c);
1022 lnc = (I32) ln;
d8093b23 1023 c1 = *(U8*)m;
6eb5f6b9
JH
1024 c2 = PL_fold_locale[c1];
1025 do_exactf:
db12adc6 1026 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1027
3b0527fe 1028 if (!reginfo && e < s)
6eb5f6b9 1029 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1030
60a8b682
JH
1031 /* The idea in the EXACTF* cases is to first find the
1032 * first character of the EXACTF* node and then, if
1033 * necessary, case-insensitively compare the full
1034 * text of the node. The c1 and c2 are the first
1035 * characters (though in Unicode it gets a bit
1036 * more complicated because there are more cases
7f16dd3d
JH
1037 * than just upper and lower: one needs to use
1038 * the so-called folding case for case-insensitive
1039 * matching (called "loose matching" in Unicode).
1040 * ibcmp_utf8() will do just that. */
60a8b682 1041
1aa99e6b 1042 if (do_utf8) {
575cac57 1043 UV c, f;
89ebb4a3 1044 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1045 STRLEN len, foldlen;
4ad0818d 1046 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1047 if (c1 == c2) {
5dab1207
NIS
1048 /* Upper and lower of 1st char are equal -
1049 * probably not a "letter". */
1aa99e6b 1050 while (s <= e) {
89ebb4a3 1051 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1052 uniflags);
80aecb99
JH
1053 if ( c == c1
1054 && (ln == len ||
3dab1dad
YO
1055 ibcmp_utf8(s, NULL, 0, do_utf8,
1056 m, NULL, ln, (bool)UTF))
3b0527fe 1057 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1058 goto got_it;
80aecb99 1059 else {
1df70142 1060 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1061 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1062 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1063 if ( f != c
1064 && (f == c1 || f == c2)
1065 && (ln == foldlen ||
66423254 1066 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1067 NULL, foldlen, do_utf8,
d07ddd77 1068 m,
3dab1dad 1069 NULL, ln, (bool)UTF))
3b0527fe 1070 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1071 goto got_it;
1072 }
1aa99e6b
IH
1073 s += len;
1074 }
09091399
JH
1075 }
1076 else {
1aa99e6b 1077 while (s <= e) {
89ebb4a3 1078 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1079 uniflags);
80aecb99 1080
60a8b682 1081 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1082 * Note that not all the possible combinations
1083 * are handled here: some of them are handled
1084 * by the standard folding rules, and some of
1085 * them (the character class or ANYOF cases)
1086 * are handled during compiletime in
1087 * regexec.c:S_regclass(). */
880bd946
JH
1088 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1089 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1090 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1091
1092 if ( (c == c1 || c == c2)
1093 && (ln == len ||
3dab1dad
YO
1094 ibcmp_utf8(s, NULL, 0, do_utf8,
1095 m, NULL, ln, (bool)UTF))
3b0527fe 1096 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1097 goto got_it;
80aecb99 1098 else {
1df70142 1099 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1100 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1101 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1102 if ( f != c
1103 && (f == c1 || f == c2)
1104 && (ln == foldlen ||
a6872d42 1105 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1106 NULL, foldlen, do_utf8,
d07ddd77 1107 m,
3dab1dad 1108 NULL, ln, (bool)UTF))
3b0527fe 1109 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1110 goto got_it;
1111 }
1aa99e6b
IH
1112 s += len;
1113 }
09091399 1114 }
1aa99e6b
IH
1115 }
1116 else {
1117 if (c1 == c2)
1118 while (s <= e) {
1119 if ( *(U8*)s == c1
1120 && (ln == 1 || !(OP(c) == EXACTF
1121 ? ibcmp(s, m, ln)
1122 : ibcmp_locale(s, m, ln)))
3b0527fe 1123 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1124 goto got_it;
1125 s++;
1126 }
1127 else
1128 while (s <= e) {
1129 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1130 && (ln == 1 || !(OP(c) == EXACTF
1131 ? ibcmp(s, m, ln)
1132 : ibcmp_locale(s, m, ln)))
3b0527fe 1133 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1134 goto got_it;
1135 s++;
1136 }
b3c9acc1
IZ
1137 }
1138 break;
bbce6d69 1139 case BOUNDL:
3280af22 1140 PL_reg_flags |= RF_tainted;
bbce6d69 1141 /* FALL THROUGH */
a0d0e21e 1142 case BOUND:
ffc61ed2 1143 if (do_utf8) {
12d33761 1144 if (s == PL_bostr)
ffc61ed2
JH
1145 tmp = '\n';
1146 else {
6136c704 1147 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1148 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1149 }
1150 tmp = ((OP(c) == BOUND ?
9041c2e3 1151 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1152 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1153 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1154 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1155 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1156 isALNUM_LC_utf8((U8*)s)))
1157 {
1158 tmp = !tmp;
3b0527fe 1159 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1160 goto got_it;
1161 }
078c425b 1162 s += uskip;
a687059c 1163 }
a0d0e21e 1164 }
667bb95a 1165 else {
12d33761 1166 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1167 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1168 while (s < strend) {
1169 if (tmp ==
1170 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1171 tmp = !tmp;
3b0527fe 1172 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1173 goto got_it;
1174 }
1175 s++;
a0ed51b3 1176 }
a0ed51b3 1177 }
3b0527fe 1178 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1179 goto got_it;
1180 break;
bbce6d69 1181 case NBOUNDL:
3280af22 1182 PL_reg_flags |= RF_tainted;
bbce6d69 1183 /* FALL THROUGH */
a0d0e21e 1184 case NBOUND:
ffc61ed2 1185 if (do_utf8) {
12d33761 1186 if (s == PL_bostr)
ffc61ed2
JH
1187 tmp = '\n';
1188 else {
6136c704 1189 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1190 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1191 }
1192 tmp = ((OP(c) == NBOUND ?
9041c2e3 1193 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1194 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1195 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1196 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1197 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1198 isALNUM_LC_utf8((U8*)s)))
1199 tmp = !tmp;
3b0527fe 1200 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1201 goto got_it;
078c425b 1202 s += uskip;
ffc61ed2 1203 }
a0d0e21e 1204 }
667bb95a 1205 else {
12d33761 1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1207 tmp = ((OP(c) == NBOUND ?
1208 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1209 while (s < strend) {
1210 if (tmp ==
1211 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1212 tmp = !tmp;
3b0527fe 1213 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1214 goto got_it;
1215 s++;
1216 }
a0ed51b3 1217 }
3b0527fe 1218 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1219 goto got_it;
1220 break;
a0d0e21e 1221 case ALNUM:
ffc61ed2 1222 if (do_utf8) {
1a4fad37 1223 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1224 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1225 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1226 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1227 goto got_it;
1228 else
1229 tmp = doevery;
1230 }
bbce6d69 1231 else
ffc61ed2 1232 tmp = 1;
078c425b 1233 s += uskip;
bbce6d69 1234 }
bbce6d69 1235 }
ffc61ed2
JH
1236 else {
1237 while (s < strend) {
1238 if (isALNUM(*s)) {
3b0527fe 1239 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1240 goto got_it;
1241 else
1242 tmp = doevery;
1243 }
a0ed51b3 1244 else
ffc61ed2
JH
1245 tmp = 1;
1246 s++;
a0ed51b3 1247 }
a0ed51b3
LW
1248 }
1249 break;
bbce6d69 1250 case ALNUML:
3280af22 1251 PL_reg_flags |= RF_tainted;
ffc61ed2 1252 if (do_utf8) {
078c425b 1253 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1254 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1255 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1256 goto got_it;
1257 else
1258 tmp = doevery;
1259 }
a687059c 1260 else
ffc61ed2 1261 tmp = 1;
078c425b 1262 s += uskip;
a0d0e21e 1263 }
a0d0e21e 1264 }
ffc61ed2
JH
1265 else {
1266 while (s < strend) {
1267 if (isALNUM_LC(*s)) {
3b0527fe 1268 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1269 goto got_it;
1270 else
1271 tmp = doevery;
1272 }
a0ed51b3 1273 else
ffc61ed2
JH
1274 tmp = 1;
1275 s++;
a0ed51b3 1276 }
a0ed51b3
LW
1277 }
1278 break;
a0d0e21e 1279 case NALNUM:
ffc61ed2 1280 if (do_utf8) {
1a4fad37 1281 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1282 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1283 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1284 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
bbce6d69 1289 else
ffc61ed2 1290 tmp = 1;
078c425b 1291 s += uskip;
bbce6d69 1292 }
bbce6d69 1293 }
ffc61ed2
JH
1294 else {
1295 while (s < strend) {
1296 if (!isALNUM(*s)) {
3b0527fe 1297 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1298 goto got_it;
1299 else
1300 tmp = doevery;
1301 }
a0ed51b3 1302 else
ffc61ed2
JH
1303 tmp = 1;
1304 s++;
a0ed51b3 1305 }
a0ed51b3
LW
1306 }
1307 break;
bbce6d69 1308 case NALNUML:
3280af22 1309 PL_reg_flags |= RF_tainted;
ffc61ed2 1310 if (do_utf8) {
078c425b 1311 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1312 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1313 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
a687059c 1318 else
ffc61ed2 1319 tmp = 1;
078c425b 1320 s += uskip;
a687059c 1321 }
a0d0e21e 1322 }
ffc61ed2
JH
1323 else {
1324 while (s < strend) {
1325 if (!isALNUM_LC(*s)) {
3b0527fe 1326 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1327 goto got_it;
1328 else
1329 tmp = doevery;
1330 }
a0ed51b3 1331 else
ffc61ed2
JH
1332 tmp = 1;
1333 s++;
a0ed51b3 1334 }
a0ed51b3
LW
1335 }
1336 break;
a0d0e21e 1337 case SPACE:
ffc61ed2 1338 if (do_utf8) {
1a4fad37 1339 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1340 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1341 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1342 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a0d0e21e 1347 else
ffc61ed2 1348 tmp = 1;
078c425b 1349 s += uskip;
2304df62 1350 }
a0d0e21e 1351 }
ffc61ed2
JH
1352 else {
1353 while (s < strend) {
1354 if (isSPACE(*s)) {
3b0527fe 1355 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1356 goto got_it;
1357 else
1358 tmp = doevery;
1359 }
a0ed51b3 1360 else
ffc61ed2
JH
1361 tmp = 1;
1362 s++;
a0ed51b3 1363 }
a0ed51b3
LW
1364 }
1365 break;
bbce6d69 1366 case SPACEL:
3280af22 1367 PL_reg_flags |= RF_tainted;
ffc61ed2 1368 if (do_utf8) {
078c425b 1369 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1370 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1371 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
bbce6d69 1376 else
ffc61ed2 1377 tmp = 1;
078c425b 1378 s += uskip;
bbce6d69 1379 }
bbce6d69 1380 }
ffc61ed2
JH
1381 else {
1382 while (s < strend) {
1383 if (isSPACE_LC(*s)) {
3b0527fe 1384 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1385 goto got_it;
1386 else
1387 tmp = doevery;
1388 }
a0ed51b3 1389 else
ffc61ed2
JH
1390 tmp = 1;
1391 s++;
a0ed51b3 1392 }
a0ed51b3
LW
1393 }
1394 break;
a0d0e21e 1395 case NSPACE:
ffc61ed2 1396 if (do_utf8) {
1a4fad37 1397 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1398 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1399 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1400 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
a0d0e21e 1405 else
ffc61ed2 1406 tmp = 1;
078c425b 1407 s += uskip;
a687059c 1408 }
a0d0e21e 1409 }
ffc61ed2
JH
1410 else {
1411 while (s < strend) {
1412 if (!isSPACE(*s)) {
3b0527fe 1413 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1414 goto got_it;
1415 else
1416 tmp = doevery;
1417 }
a0ed51b3 1418 else
ffc61ed2
JH
1419 tmp = 1;
1420 s++;
a0ed51b3 1421 }
a0ed51b3
LW
1422 }
1423 break;
bbce6d69 1424 case NSPACEL:
3280af22 1425 PL_reg_flags |= RF_tainted;
ffc61ed2 1426 if (do_utf8) {
078c425b 1427 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1428 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1429 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
bbce6d69 1434 else
ffc61ed2 1435 tmp = 1;
078c425b 1436 s += uskip;
bbce6d69 1437 }
bbce6d69 1438 }
ffc61ed2
JH
1439 else {
1440 while (s < strend) {
1441 if (!isSPACE_LC(*s)) {
3b0527fe 1442 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1443 goto got_it;
1444 else
1445 tmp = doevery;
1446 }
a0ed51b3 1447 else
ffc61ed2
JH
1448 tmp = 1;
1449 s++;
a0ed51b3 1450 }
a0ed51b3
LW
1451 }
1452 break;
a0d0e21e 1453 case DIGIT:
ffc61ed2 1454 if (do_utf8) {
1a4fad37 1455 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1456 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1457 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1458 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
a0d0e21e 1463 else
ffc61ed2 1464 tmp = 1;
078c425b 1465 s += uskip;
2b69d0c2 1466 }
a0d0e21e 1467 }
ffc61ed2
JH
1468 else {
1469 while (s < strend) {
1470 if (isDIGIT(*s)) {
3b0527fe 1471 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1472 goto got_it;
1473 else
1474 tmp = doevery;
1475 }
a0ed51b3 1476 else
ffc61ed2
JH
1477 tmp = 1;
1478 s++;
a0ed51b3 1479 }
a0ed51b3
LW
1480 }
1481 break;
b8c5462f
JH
1482 case DIGITL:
1483 PL_reg_flags |= RF_tainted;
ffc61ed2 1484 if (do_utf8) {
078c425b 1485 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1486 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1487 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
b8c5462f 1492 else
ffc61ed2 1493 tmp = 1;
078c425b 1494 s += uskip;
b8c5462f 1495 }
b8c5462f 1496 }
ffc61ed2
JH
1497 else {
1498 while (s < strend) {
1499 if (isDIGIT_LC(*s)) {
3b0527fe 1500 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1501 goto got_it;
1502 else
1503 tmp = doevery;
1504 }
b8c5462f 1505 else
ffc61ed2
JH
1506 tmp = 1;
1507 s++;
b8c5462f 1508 }
b8c5462f
JH
1509 }
1510 break;
a0d0e21e 1511 case NDIGIT:
ffc61ed2 1512 if (do_utf8) {
1a4fad37 1513 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1514 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1515 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1516 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
a0d0e21e 1521 else
ffc61ed2 1522 tmp = 1;
078c425b 1523 s += uskip;
a687059c 1524 }
a0d0e21e 1525 }
ffc61ed2
JH
1526 else {
1527 while (s < strend) {
1528 if (!isDIGIT(*s)) {
3b0527fe 1529 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1530 goto got_it;
1531 else
1532 tmp = doevery;
1533 }
a0ed51b3 1534 else
ffc61ed2
JH
1535 tmp = 1;
1536 s++;
a0ed51b3 1537 }
a0ed51b3
LW
1538 }
1539 break;
b8c5462f
JH
1540 case NDIGITL:
1541 PL_reg_flags |= RF_tainted;
ffc61ed2 1542 if (do_utf8) {
078c425b 1543 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1544 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1545 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
b8c5462f 1550 else
ffc61ed2 1551 tmp = 1;
078c425b 1552 s += uskip;
b8c5462f 1553 }
a0ed51b3 1554 }
ffc61ed2
JH
1555 else {
1556 while (s < strend) {
1557 if (!isDIGIT_LC(*s)) {
3b0527fe 1558 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1559 goto got_it;
1560 else
1561 tmp = doevery;
1562 }
cf93c79d 1563 else
ffc61ed2
JH
1564 tmp = 1;
1565 s++;
b8c5462f 1566 }
b8c5462f
JH
1567 }
1568 break;
07be1b83
YO
1569 case TRIE:
1570 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1571 {
1572 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1573 trie_type = do_utf8 ?
1574 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1575 : trie_plain;
1576 /* what trie are we using right now */
1577 reg_ac_data *aho
1578 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1579 reg_trie_data *trie=aho->trie;
1580
1581 const char *last_start = strend - trie->minlen;
1582 const char *real_start = s;
1583 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1584 SV *sv_points;
1585 U8 **points; /* map of where we were in the input string
1586 when reading a given string. For ASCII this
1587 is unnecessary overhead as the relationship
1588 is always 1:1, but for unicode, especially
1589 case folded unicode this is not true. */
07be1b83
YO
1590
1591 GET_RE_DEBUG_FLAGS_DECL;
1592
be8e71aa
YO
1593 /* We can't just allocate points here. We need to wrap it in
1594 * an SV so it gets freed properly if there is a croak while
1595 * running the match */
1596 ENTER;
1597 SAVETMPS;
1598 sv_points=newSV(maxlen * sizeof(U8 *));
1599 SvCUR_set(sv_points,
1600 maxlen * sizeof(U8 *));
1601 SvPOK_on(sv_points);
1602 sv_2mortal(sv_points);
1603 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1604
1605 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1606 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1607 s++;
1608 }
1609 }
1610
1611 while (s <= last_start) {
1612 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1613 U8 *uc = (U8*)s;
1614 U16 charid = 0;
1615 U32 base = 1;
1616 U32 state = 1;
1617 UV uvc = 0;
1618 STRLEN len = 0;
1619 STRLEN foldlen = 0;
1620 U8 *uscan = (U8*)NULL;
1621 U8 *leftmost = NULL;
1622
1623 U32 pointpos = 0;
1624
1625 while ( state && uc <= (U8*)strend ) {
1626 int failed=0;
1627 if (aho->states[ state ].wordnum) {
1628 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1629 if (!leftmost || lpos < leftmost)
1630 leftmost= lpos;
1631 if (base==0) break;
1632 }
1633 points[pointpos++ % maxlen]= uc;
1634 switch (trie_type) {
1635 case trie_utf8_fold:
1636 if ( foldlen>0 ) {
1637 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1638 foldlen -= len;
1639 uscan += len;
1640 len=0;
1641 } else {
1642 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1643 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1644 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1645 foldlen -= UNISKIP( uvc );
1646 uscan = foldbuf + UNISKIP( uvc );
1647 }
1648 break;
1649 case trie_utf8:
1650 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1651 &len, uniflags );
1652 break;
1653 case trie_plain:
1654 uvc = (UV)*uc;
1655 len = 1;
1656 }
1657
1658 if (uvc < 256) {
1659 charid = trie->charmap[ uvc ];
1660 }
1661 else {
1662 charid = 0;
1663 if (trie->widecharmap) {
1664 SV** const svpp = hv_fetch(trie->widecharmap,
1665 (char*)&uvc, sizeof(UV), 0);
1666 if (svpp)
1667 charid = (U16)SvIV(*svpp);
1668 }
1669 }
1670 DEBUG_TRIE_EXECUTE_r(
1671 PerlIO_printf(Perl_debug_log,
1672 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1673 (int)((const char*)uc - real_start), charid, uvc)
1674 );
1675 uc += len;
1676
1677 do {
1678 U32 word = aho->states[ state ].wordnum;
1679 base = aho->states[ state ].trans.base;
1680
1681 DEBUG_TRIE_EXECUTE_r(
1682 PerlIO_printf( Perl_debug_log,
1683 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1684 failed ? "Fail transition to " : "",
1685 state, base, uvc, word)
1686 );
1687 if ( base ) {
1688 U32 tmp;
1689 if (charid &&
1690 (base + charid > trie->uniquecharcount )
1691 && (base + charid - 1 - trie->uniquecharcount
1692 < trie->lasttrans)
1693 && trie->trans[base + charid - 1 -
1694 trie->uniquecharcount].check == state
1695 && (tmp=trie->trans[base + charid - 1 -
1696 trie->uniquecharcount ].next))
1697 {
1698 state = tmp;
1699 break;
1700 }
1701 else {
1702 failed++;
1703 if ( state == 1 )
1704 break;
1705 else
1706 state = aho->fail[state];
1707 }
1708 }
1709 else {
1710 /* we must be accepting here */
1711 failed++;
1712 break;
1713 }
1714 } while(state);
1715 if (failed) {
1716 if (leftmost)
1717 break;
1718 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1719 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1720 uc++;
1721 }
1722 }
1723 }
1724 }
1725 if ( aho->states[ state ].wordnum ) {
1726 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1727 if (!leftmost || lpos < leftmost)
1728 leftmost = lpos;
1729 }
1730 DEBUG_TRIE_EXECUTE_r(
1731 PerlIO_printf( Perl_debug_log,
1732 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1733 "All done: ",
1734 state, base, uvc)
1735 );
1736 if (leftmost) {
1737 s = (char*)leftmost;
be8e71aa
YO
1738 if (!reginfo || regtry(reginfo, s)) {
1739 FREETMPS;
1740 LEAVE;
07be1b83 1741 goto got_it;
be8e71aa 1742 }
07be1b83
YO
1743 s = HOPc(s,1);
1744 } else {
1745 break;
1746 }
1747 }
be8e71aa
YO
1748 FREETMPS;
1749 LEAVE;
07be1b83
YO
1750 }
1751 break;
b3c9acc1 1752 default:
3c3eec57
GS
1753 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1754 break;
d6a28714 1755 }
6eb5f6b9
JH
1756 return 0;
1757 got_it:
1758 return s;
1759}
1760
1761/*
1762 - regexec_flags - match a regexp against a string
1763 */
1764I32
1765Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1766 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1767/* strend: pointer to null at end of string */
1768/* strbeg: real beginning of string */
1769/* minend: end of match must be >=minend after stringarg. */
1770/* data: May be used for some additional optimizations. */
1771/* nosave: For optimizations. */
1772{
97aff369 1773 dVAR;
6eb5f6b9
JH
1774 register char *s;
1775 register regnode *c;
1776 register char *startpos = stringarg;
6eb5f6b9
JH
1777 I32 minlen; /* must match at least this many chars */
1778 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1779 I32 end_shift = 0; /* Same for the end. */ /* CC */
1780 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1781 char *scream_olds = NULL;
3dab1dad 1782 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1783 const bool do_utf8 = DO_UTF8(sv);
2757e526 1784 I32 multiline;
2a782b5b 1785#ifdef DEBUGGING
2757e526
JH
1786 SV* dsv0;
1787 SV* dsv1;
2a782b5b 1788#endif
3b0527fe 1789 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1790
1791 GET_RE_DEBUG_FLAGS_DECL;
1792
9d4ba2ae 1793 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1794
1795 /* Be paranoid... */
1796 if (prog == NULL || startpos == NULL) {
1797 Perl_croak(aTHX_ "NULL regexp parameter");
1798 return 0;
1799 }
1800
2757e526 1801 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1802 reginfo.prog = prog;
2757e526
JH
1803
1804#ifdef DEBUGGING
1805 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1806 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1807#endif
1808
bac06658
JH
1809 RX_MATCH_UTF8_set(prog, do_utf8);
1810
6eb5f6b9 1811 minlen = prog->minlen;
61a36c01 1812 if (strend - startpos < minlen) {
a3621e74 1813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1814 "String too short [regexec_flags]...\n"));
1815 goto phooey;
1aa99e6b 1816 }
6eb5f6b9 1817
6eb5f6b9
JH
1818 /* Check validity of program. */
1819 if (UCHARAT(prog->program) != REG_MAGIC) {
1820 Perl_croak(aTHX_ "corrupted regexp program");
1821 }
1822
1823 PL_reg_flags = 0;
1824 PL_reg_eval_set = 0;
1825 PL_reg_maxiter = 0;
1826
1827 if (prog->reganch & ROPT_UTF8)
1828 PL_reg_flags |= RF_utf8;
1829
1830 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1831 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1832 PL_bostr = strbeg;
3b0527fe 1833 reginfo.sv = sv;
6eb5f6b9
JH
1834
1835 /* Mark end of line for $ (and such) */
1836 PL_regeol = strend;
1837
1838 /* see how far we have to get to not match where we matched before */
3b0527fe 1839 reginfo.till = startpos+minend;
6eb5f6b9 1840
6eb5f6b9
JH
1841 /* If there is a "must appear" string, look for it. */
1842 s = startpos;
1843
3b0527fe 1844 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1845 MAGIC *mg;
1846
1847 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1848 reginfo.ganch = startpos;
6eb5f6b9
JH
1849 else if (sv && SvTYPE(sv) >= SVt_PVMG
1850 && SvMAGIC(sv)
14befaf4
DM
1851 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1852 && mg->mg_len >= 0) {
3b0527fe 1853 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1854 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1855 if (s > reginfo.ganch)
6eb5f6b9 1856 goto phooey;
3b0527fe 1857 s = reginfo.ganch;
6eb5f6b9
JH
1858 }
1859 }
1860 else /* pos() not defined */
3b0527fe 1861 reginfo.ganch = strbeg;
6eb5f6b9
JH
1862 }
1863
a0714e2c 1864 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1865 re_scream_pos_data d;
1866
1867 d.scream_olds = &scream_olds;
1868 d.scream_pos = &scream_pos;
1869 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1870 if (!s) {
a3621e74 1871 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1872 goto phooey; /* not present */
3fa9c3d7 1873 }
6eb5f6b9
JH
1874 }
1875
a3621e74 1876 DEBUG_EXECUTE_r({
1df70142
AL
1877 const char * const s0 = UTF
1878 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1879 UNI_DISPLAY_REGEX)
1880 : prog->precomp;
bb7a0f54 1881 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1882 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1883 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1884 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1885 if (!PL_colorset)
1886 reginitcolors();
1887 PerlIO_printf(Perl_debug_log,
a0288114 1888 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1889 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1890 len0, len0, s0,
2a782b5b 1891 PL_colors[1],
9e55ce06 1892 len0 > 60 ? "..." : "",
2a782b5b 1893 PL_colors[0],
9e55ce06
JH
1894 (int)(len1 > 60 ? 60 : len1),
1895 s1, PL_colors[1],
1896 (len1 > 60 ? "..." : "")
2a782b5b
JH
1897 );
1898 });
6eb5f6b9
JH
1899
1900 /* Simplest case: anchored match need be tried only once. */
1901 /* [unless only anchor is BOL and multiline is set] */
1902 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1903 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1904 goto got_it;
7fba1cd6 1905 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1906 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1907 {
1908 char *end;
1909
1910 if (minlen)
1911 dontbother = minlen - 1;
1aa99e6b 1912 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1913 /* for multiline we only have to try after newlines */
33b8afdf 1914 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1915 if (s == startpos)
1916 goto after_try;
1917 while (1) {
3b0527fe 1918 if (regtry(&reginfo, s))
6eb5f6b9
JH
1919 goto got_it;
1920 after_try:
1921 if (s >= end)
1922 goto phooey;
1923 if (prog->reganch & RE_USE_INTUIT) {
1924 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1925 if (!s)
1926 goto phooey;
1927 }
1928 else
1929 s++;
1930 }
1931 } else {
1932 if (s > startpos)
1933 s--;
1934 while (s < end) {
1935 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1936 if (regtry(&reginfo, s))
6eb5f6b9
JH
1937 goto got_it;
1938 }
1939 }
1940 }
1941 }
1942 goto phooey;
1943 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1944 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1945 goto got_it;
1946 goto phooey;
1947 }
1948
1949 /* Messy cases: unanchored match. */
33b8afdf 1950 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1951 /* we have /x+whatever/ */
1952 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1953 char ch;
bf93d4cc
GS
1954#ifdef DEBUGGING
1955 int did_match = 0;
1956#endif
33b8afdf
JH
1957 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1958 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1959 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1960
1aa99e6b 1961 if (do_utf8) {
6eb5f6b9
JH
1962 while (s < strend) {
1963 if (*s == ch) {
a3621e74 1964 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1965 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1966 s += UTF8SKIP(s);
1967 while (s < strend && *s == ch)
1968 s += UTF8SKIP(s);
1969 }
1970 s += UTF8SKIP(s);
1971 }
1972 }
1973 else {
1974 while (s < strend) {
1975 if (*s == ch) {
a3621e74 1976 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1977 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1978 s++;
1979 while (s < strend && *s == ch)
1980 s++;
1981 }
1982 s++;
1983 }
1984 }
a3621e74 1985 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1986 PerlIO_printf(Perl_debug_log,
b7953727
JH
1987 "Did not find anchored character...\n")
1988 );
6eb5f6b9 1989 }
a0714e2c
SS
1990 else if (prog->anchored_substr != NULL
1991 || prog->anchored_utf8 != NULL
1992 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1993 && prog->float_max_offset < strend - s)) {
1994 SV *must;
1995 I32 back_max;
1996 I32 back_min;
1997 char *last;
6eb5f6b9 1998 char *last1; /* Last position checked before */
bf93d4cc
GS
1999#ifdef DEBUGGING
2000 int did_match = 0;
2001#endif
33b8afdf
JH
2002 if (prog->anchored_substr || prog->anchored_utf8) {
2003 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2004 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2005 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2006 back_max = back_min = prog->anchored_offset;
2007 } else {
2008 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2009 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2010 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2011 back_max = prog->float_max_offset;
2012 back_min = prog->float_min_offset;
2013 }
2014 if (must == &PL_sv_undef)
2015 /* could not downgrade utf8 check substring, so must fail */
2016 goto phooey;
2017
2018 last = HOP3c(strend, /* Cannot start after this */
2019 -(I32)(CHR_SVLEN(must)
2020 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2021
2022 if (s > PL_bostr)
2023 last1 = HOPc(s, -1);
2024 else
2025 last1 = s - 1; /* bogus */
2026
a0288114 2027 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2028 check_substr==must. */
2029 scream_pos = -1;
2030 dontbother = end_shift;
2031 strend = HOPc(strend, -dontbother);
2032 while ( (s <= last) &&
9041c2e3 2033 ((flags & REXEC_SCREAM)
1aa99e6b 2034 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2035 end_shift, &scream_pos, 0))
1aa99e6b 2036 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2037 (unsigned char*)strend, must,
7fba1cd6 2038 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2039 /* we may be pointing at the wrong string */
2040 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2041 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2042 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2043 if (HOPc(s, -back_max) > last1) {
2044 last1 = HOPc(s, -back_min);
2045 s = HOPc(s, -back_max);
2046 }
2047 else {
52657f30 2048 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2049
2050 last1 = HOPc(s, -back_min);
52657f30 2051 s = t;
6eb5f6b9 2052 }
1aa99e6b 2053 if (do_utf8) {
6eb5f6b9 2054 while (s <= last1) {
3b0527fe 2055 if (regtry(&reginfo, s))
6eb5f6b9
JH
2056 goto got_it;
2057 s += UTF8SKIP(s);
2058 }
2059 }
2060 else {
2061 while (s <= last1) {
3b0527fe 2062 if (regtry(&reginfo, s))
6eb5f6b9
JH
2063 goto got_it;
2064 s++;
2065 }
2066 }
2067 }
a3621e74 2068 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2069 PerlIO_printf(Perl_debug_log,
a0288114 2070 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2071 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2072 ? "anchored" : "floating"),
2073 PL_colors[0],
2074 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2075 SvPVX_const(must),
b7953727
JH
2076 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2077 );
6eb5f6b9
JH
2078 goto phooey;
2079 }
155aba94 2080 else if ((c = prog->regstclass)) {
f14c76ed 2081 if (minlen) {
be8e71aa 2082 const OPCODE op = OP(prog->regstclass);
66e933ab 2083 /* don't bother with what can't match */
07be1b83 2084 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2085 strend = HOPc(strend, -(minlen - 1));
2086 }
a3621e74 2087 DEBUG_EXECUTE_r({
be8e71aa 2088 SV * const prop = sv_newmortal();
cfd0369c
NC
2089 const char *s0;
2090 const char *s1;
9e55ce06
JH
2091 int len0;
2092 int len1;
2093
32fc9b6a 2094 regprop(prog, prop, c);
9e55ce06 2095 s0 = UTF ?
3f7c398e 2096 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2097 UNI_DISPLAY_REGEX) :
cfd0369c 2098 SvPVX_const(prop);
9e55ce06
JH
2099 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2100 s1 = UTF ?
c728cb41 2101 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2102 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2103 PerlIO_printf(Perl_debug_log,
07be1b83 2104 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2105 len0, len0, s0,
07be1b83 2106 len1, len1, s1, (int)(strend - s));
ffc61ed2 2107 });
3b0527fe 2108 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2109 goto got_it;
07be1b83 2110 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2111 }
2112 else {
2113 dontbother = 0;
a0714e2c 2114 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2115 /* Trim the end. */
d6a28714 2116 char *last;
33b8afdf
JH
2117 SV* float_real;
2118
2119 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2120 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2121 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2122
2123 if (flags & REXEC_SCREAM) {
33b8afdf 2124 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2125 end_shift, &scream_pos, 1); /* last one */
2126 if (!last)
ffc61ed2 2127 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2128 /* we may be pointing at the wrong string */
2129 else if (RX_MATCH_COPIED(prog))
3f7c398e 2130 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2131 }
d6a28714
JH
2132 else {
2133 STRLEN len;
cfd0369c 2134 const char * const little = SvPV_const(float_real, len);
d6a28714 2135
33b8afdf 2136 if (SvTAIL(float_real)) {
d6a28714
JH
2137 if (memEQ(strend - len + 1, little, len - 1))
2138 last = strend - len + 1;
7fba1cd6 2139 else if (!multiline)
9041c2e3 2140 last = memEQ(strend - len, little, len)
bd61b366 2141 ? strend - len : NULL;
b8c5462f 2142 else
d6a28714
JH
2143 goto find_last;
2144 } else {
2145 find_last:
9041c2e3 2146 if (len)
d6a28714 2147 last = rninstr(s, strend, little, little + len);
b8c5462f 2148 else
a0288114 2149 last = strend; /* matching "$" */
b8c5462f 2150 }
b8c5462f 2151 }
bf93d4cc 2152 if (last == NULL) {
a3621e74 2153 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2154 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2155 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2156 goto phooey; /* Should not happen! */
2157 }
d6a28714
JH
2158 dontbother = strend - last + prog->float_min_offset;
2159 }
2160 if (minlen && (dontbother < minlen))
2161 dontbother = minlen - 1;
2162 strend -= dontbother; /* this one's always in bytes! */
2163 /* We don't know much -- general case. */
1aa99e6b 2164 if (do_utf8) {
d6a28714 2165 for (;;) {
3b0527fe 2166 if (regtry(&reginfo, s))
d6a28714
JH
2167 goto got_it;
2168 if (s >= strend)
2169 break;
b8c5462f 2170 s += UTF8SKIP(s);
d6a28714
JH
2171 };
2172 }
2173 else {
2174 do {
3b0527fe 2175 if (regtry(&reginfo, s))
d6a28714
JH
2176 goto got_it;
2177 } while (s++ < strend);
2178 }
2179 }
2180
2181 /* Failure. */
2182 goto phooey;
2183
2184got_it:
2185 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2186
2187 if (PL_reg_eval_set) {
2188 /* Preserve the current value of $^R */
2189 if (oreplsv != GvSV(PL_replgv))
2190 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2191 restored, the value remains
2192 the same. */
4f639d21 2193 restore_pos(aTHX_ prog);
d6a28714
JH
2194 }
2195
2196 /* make sure $`, $&, $', and $digit will work later */
2197 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2198 RX_MATCH_COPY_FREE(prog);
d6a28714 2199 if (flags & REXEC_COPY_STR) {
be8e71aa 2200 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2201#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2202 if ((SvIsCOW(sv)
2203 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2204 if (DEBUG_C_TEST) {
2205 PerlIO_printf(Perl_debug_log,
2206 "Copy on write: regexp capture, type %d\n",
2207 (int) SvTYPE(sv));
2208 }
2209 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2210 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2211 assert (SvPOKp(prog->saved_copy));
2212 } else
2213#endif
2214 {
2215 RX_MATCH_COPIED_on(prog);
2216 s = savepvn(strbeg, i);
2217 prog->subbeg = s;
2218 }
d6a28714 2219 prog->sublen = i;
d6a28714
JH
2220 }
2221 else {
2222 prog->subbeg = strbeg;
2223 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2224 }
2225 }
9041c2e3 2226
d6a28714
JH
2227 return 1;
2228
2229phooey:
a3621e74 2230 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2231 PL_colors[4], PL_colors[5]));
d6a28714 2232 if (PL_reg_eval_set)
4f639d21 2233 restore_pos(aTHX_ prog);
d6a28714
JH
2234 return 0;
2235}
2236
2237/*
2238 - regtry - try match at specific point
2239 */
2240STATIC I32 /* 0 failure, 1 success */
3b0527fe 2241S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2242{
97aff369 2243 dVAR;
d6a28714
JH
2244 register I32 *sp;
2245 register I32 *ep;
2246 CHECKPOINT lastcp;
3b0527fe 2247 regexp *prog = reginfo->prog;
a3621e74 2248 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2249
02db2b7b
IZ
2250#ifdef DEBUGGING
2251 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2252#endif
d6a28714
JH
2253 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2254 MAGIC *mg;
2255
2256 PL_reg_eval_set = RS_init;
a3621e74 2257 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2258 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2259 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2260 ));
e8347627 2261 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2262 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2263 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2264 SAVETMPS;
2265 /* Apparently this is not needed, judging by wantarray. */
e8347627 2266 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2267 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2268
3b0527fe 2269 if (reginfo->sv) {
d6a28714 2270 /* Make $_ available to executed code. */
3b0527fe 2271 if (reginfo->sv != DEFSV) {
59f00321 2272 SAVE_DEFSV;
3b0527fe 2273 DEFSV = reginfo->sv;
b8c5462f 2274 }
d6a28714 2275
3b0527fe
DM
2276 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2277 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2278 /* prepare for quick setting of pos */
d300d9fa
NC
2279#ifdef PERL_OLD_COPY_ON_WRITE
2280 if (SvIsCOW(sv))
2281 sv_force_normal_flags(sv, 0);
2282#endif
3dab1dad 2283 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2284 &PL_vtbl_mglob, NULL, 0);
d6a28714 2285 mg->mg_len = -1;
b8c5462f 2286 }
d6a28714
JH
2287 PL_reg_magic = mg;
2288 PL_reg_oldpos = mg->mg_len;
4f639d21 2289 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2290 }
09687e5a 2291 if (!PL_reg_curpm) {
a02a5408 2292 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2293#ifdef USE_ITHREADS
2294 {
be8e71aa 2295 SV* const repointer = newSViv(0);
577e12cc 2296 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2297 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2298 av_push(PL_regex_padav,repointer);
2299 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2300 PL_regex_pad = AvARRAY(PL_regex_padav);
2301 }
2302#endif
2303 }
aaa362c4 2304 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2305 PL_reg_oldcurpm = PL_curpm;
2306 PL_curpm = PL_reg_curpm;
2307 if (RX_MATCH_COPIED(prog)) {
2308 /* Here is a serious problem: we cannot rewrite subbeg,
2309 since it may be needed if this match fails. Thus
2310 $` inside (?{}) could fail... */
2311 PL_reg_oldsaved = prog->subbeg;
2312 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2313#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2314 PL_nrs = prog->saved_copy;
2315#endif
d6a28714
JH
2316 RX_MATCH_COPIED_off(prog);
2317 }
2318 else
bd61b366 2319 PL_reg_oldsaved = NULL;
d6a28714
JH
2320 prog->subbeg = PL_bostr;
2321 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2322 }
973dddac 2323 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2324 PL_reginput = startpos;
2325 PL_regstartp = prog->startp;
2326 PL_regendp = prog->endp;
2327 PL_reglastparen = &prog->lastparen;
a01268b5 2328 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2329 prog->lastparen = 0;
03994de8 2330 prog->lastcloseparen = 0;
d6a28714 2331 PL_regsize = 0;
a3621e74 2332 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2333 if (PL_reg_start_tmpl <= prog->nparens) {
2334 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2335 if(PL_reg_start_tmp)
2336 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2337 else
a02a5408 2338 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2339 }
2340
2341 /* XXXX What this code is doing here?!!! There should be no need
2342 to do this again and again, PL_reglastparen should take care of
3dd2943c 2343 this! --ilya*/
dafc8851
JH
2344
2345 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2346 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2347 * PL_reglastparen), is not needed at all by the test suite
2348 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2349 * enough, for building DynaLoader, or otherwise this
2350 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2351 * will happen. Meanwhile, this code *is* needed for the
2352 * above-mentioned test suite tests to succeed. The common theme
2353 * on those tests seems to be returning null fields from matches.
2354 * --jhi */
dafc8851 2355#if 1
d6a28714
JH
2356 sp = prog->startp;
2357 ep = prog->endp;
2358 if (prog->nparens) {
097eb12c 2359 register I32 i;
eb160463 2360 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2361 *++sp = -1;
2362 *++ep = -1;
2363 }
2364 }
dafc8851 2365#endif
02db2b7b 2366 REGCP_SET(lastcp);
3b0527fe 2367 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2368 prog->endp[0] = PL_reginput - PL_bostr;
2369 return 1;
2370 }
02db2b7b 2371 REGCP_UNWIND(lastcp);
d6a28714
JH
2372 return 0;
2373}
2374
02db2b7b
IZ
2375#define RE_UNWIND_BRANCH 1
2376#define RE_UNWIND_BRANCHJ 2
2377
2378union re_unwind_t;
2379
2380typedef struct { /* XX: makes sense to enlarge it... */
2381 I32 type;
2382 I32 prev;
2383 CHECKPOINT lastcp;
2384} re_unwind_generic_t;
2385
2386typedef struct {
2387 I32 type;
2388 I32 prev;
2389 CHECKPOINT lastcp;
2390 I32 lastparen;
2391 regnode *next;
2392 char *locinput;
2393 I32 nextchr;
3a2830be 2394 int minmod;
02db2b7b
IZ
2395#ifdef DEBUGGING
2396 int regindent;
2397#endif
2398} re_unwind_branch_t;
2399
2400typedef union re_unwind_t {
2401 I32 type;
2402 re_unwind_generic_t generic;
2403 re_unwind_branch_t branch;
2404} re_unwind_t;
2405
8ba1375e
MJD
2406#define sayYES goto yes
2407#define sayNO goto no
e0f9d4a8 2408#define sayNO_ANYOF goto no_anyof
8ba1375e 2409#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2410#define sayNO_FINAL goto no_final
2411#define sayNO_SILENT goto do_no
2412#define saySAME(x) if (x) goto yes; else goto no
2413
3ab3c9b4
HS
2414#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2415#define POSCACHE_SEEN 1 /* we know what we're caching */
2416#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2417
3ab3c9b4 2418#define CACHEsayYES STMT_START { \
d8319b27 2419 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2420 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2421 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2422 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2423 } \
2424 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2425 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2426 } \
2427 else { \
3ab3c9b4
HS
2428 /* cache records failure, but this is success */ \
2429 DEBUG_r( \
2430 PerlIO_printf(Perl_debug_log, \
2431 "%*s (remove success from failure cache)\n", \
2432 REPORT_CODE_OFF+PL_regindent*2, "") \
2433 ); \
d8319b27 2434 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2435 } \
2436 } \
2437 sayYES; \
2438} STMT_END
7409bbd3 2439
3ab3c9b4 2440#define CACHEsayNO STMT_START { \
d8319b27 2441 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2442 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2443 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2444 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2445 } \
2446 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2447 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2448 } \
2449 else { \
3ab3c9b4
HS
2450 /* cache records success, but this is failure */ \
2451 DEBUG_r( \
2452 PerlIO_printf(Perl_debug_log, \
2453 "%*s (remove failure from success cache)\n", \
2454 REPORT_CODE_OFF+PL_regindent*2, "") \
2455 ); \
d8319b27 2456 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2457 } \
2458 } \
2459 sayNO; \
2460} STMT_END
2461
a3621e74
YO
2462/* this is used to determine how far from the left messages like
2463 'failed...' are printed. Currently 29 makes these messages line
2464 up with the opcode they refer to. Earlier perls used 25 which
2465 left these messages outdented making reviewing a debug output
2466 quite difficult.
2467*/
2468#define REPORT_CODE_OFF 29
2469
2470
2471/* Make sure there is a test for this +1 options in re_tests */
2472#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2473
9e137952
DM
2474/* this value indiciates that the c1/c2 "next char" test should be skipped */
2475#define CHRTEST_VOID -1000
2476
86545054
DM
2477#define SLAB_FIRST(s) (&(s)->states[0])
2478#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2479
5d9a96ca
DM
2480/* grab a new slab and return the first slot in it */
2481
2482STATIC regmatch_state *
2483S_push_slab(pTHX)
2484{
54df2634
NC
2485#if PERL_VERSION < 9
2486 dMY_CXT;
2487#endif
5d9a96ca
DM
2488 regmatch_slab *s = PL_regmatch_slab->next;
2489 if (!s) {
2490 Newx(s, 1, regmatch_slab);
2491 s->prev = PL_regmatch_slab;
2492 s->next = NULL;
2493 PL_regmatch_slab->next = s;
2494 }
2495 PL_regmatch_slab = s;
86545054 2496 return SLAB_FIRST(s);
5d9a96ca 2497}
5b47454d 2498
95b24440
DM
2499/* simulate a recursive call to regmatch */
2500
2501#define REGMATCH(ns, where) \
5d9a96ca
DM
2502 st->scan = scan; \
2503 scan = (ns); \
2504 st->resume_state = resume_##where; \
95b24440
DM
2505 goto start_recurse; \
2506 resume_point_##where:
2507
aa283a38
DM
2508
2509/* push a new regex state. Set newst to point to it */
2510
2511#define PUSH_STATE(newst, resume) \
2512 depth++; \
2513 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2514 st->scan = scan; \
2515 st->next = next; \
2516 st->n = n; \
2517 st->locinput = locinput; \
2518 st->resume_state = resume; \
2519 newst = st+1; \
86545054 2520 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2521 newst = S_push_slab(aTHX); \
2522 PL_regmatch_state = newst; \
2523 newst->cc = 0; \
2524 newst->minmod = 0; \
2525 newst->sw = 0; \
2526 newst->logical = 0; \
2527 newst->unwind = 0; \
2528 locinput = PL_reginput; \
2529 nextchr = UCHARAT(locinput);
2530
2531#define POP_STATE \
2532 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2533 depth--; \
2534 st--; \
86545054 2535 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2536 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2537 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2538 } \
2539 PL_regmatch_state = st; \
2540 scan = st->scan; \
2541 next = st->next; \
2542 n = st->n; \
2543 locinput = st->locinput; \
2544 nextchr = UCHARAT(locinput);
2545
d6a28714
JH
2546/*
2547 - regmatch - main matching routine
2548 *
2549 * Conceptually the strategy is simple: check to see whether the current
2550 * node matches, call self recursively to see whether the rest matches,
2551 * and then act accordingly. In practice we make some effort to avoid
2552 * recursion, in particular by going through "ordinary" nodes (that don't
2553 * need to know whether the rest of the match failed) by a loop instead of
2554 * by recursion.
2555 */
2556/* [lwall] I've hoisted the register declarations to the outer block in order to
2557 * maybe save a little bit of pushing and popping on the stack. It also takes
2558 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2559 *
2560 * This function used to be heavily recursive, but since this had the
2561 * effect of blowing the CPU stack on complex regexes, it has been
2562 * restructured to be iterative, and to save state onto the heap rather
2563 * than the stack. Essentially whereever regmatch() used to be called, it
2564 * pushes the current state, notes where to return, then jumps back into
2565 * the main loop.
2566 *
2567 * Originally the structure of this function used to look something like
2568
2569 S_regmatch() {
2570 int a = 1, b = 2;
2571 ...
2572 while (scan != NULL) {
5d9a96ca 2573 a++; // do stuff with a and b
95b24440
DM
2574 ...
2575 switch (OP(scan)) {
2576 case FOO: {
2577 int local = 3;
2578 ...
2579 if (regmatch(...)) // recurse
2580 goto yes;
2581 }
2582 ...
2583 }
2584 }
2585 yes:
2586 return 1;
2587 }
2588
2589 * Now it looks something like this:
2590
5d9a96ca 2591 typedef struct {
95b24440
DM
2592 int a, b, local;
2593 int resume_state;
5d9a96ca 2594 } regmatch_state;
95b24440
DM
2595
2596 S_regmatch() {
5d9a96ca
DM
2597 regmatch_state *st = new();
2598 int depth=0;
2599 st->a++; // do stuff with a and b
95b24440
DM
2600 ...
2601 while (scan != NULL) {
2602 ...
2603 switch (OP(scan)) {
2604 case FOO: {
5d9a96ca 2605 st->local = 3;
95b24440 2606 ...
5d9a96ca
DM
2607 st->scan = scan;
2608 scan = ...;
2609 st->resume_state = resume_FOO;
2610 goto start_recurse; // recurse
95b24440 2611
5d9a96ca
DM
2612 resume_point_FOO:
2613 if (result)
95b24440
DM
2614 goto yes;
2615 }
2616 ...
2617 }
5d9a96ca
DM
2618 start_recurse:
2619 st = new(); push a new state
2620 st->a = 1; st->b = 2;
2621 depth++;
95b24440 2622 }
5d9a96ca 2623 yes:
95b24440 2624 result = 1;
5d9a96ca
DM
2625 if (depth--) {
2626 st = pop();
95b24440
DM
2627 switch (resume_state) {
2628 case resume_FOO:
2629 goto resume_point_FOO;
2630 ...
2631 }
2632 }
2633 return result
2634 }
2635
2636 * WARNING: this means that any line in this function that contains a
2637 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2638 * regmatch() using gotos instead. Thus the values of any local variables
2639 * not saved in the regmatch_state structure will have been lost when
2640 * execution resumes on the next line .
5d9a96ca
DM
2641 *
2642 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2643 * PL_regmatch_state always points to the currently active state, and
2644 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2645 * The first time regmatch is called, the first slab is allocated, and is
2646 * never freed until interpreter desctruction. When the slab is full,
2647 * a new one is allocated chained to the end. At exit from regmatch, slabs
2648 * allocated since entry are freed.
d6a28714 2649 */
95b24440 2650
3dab1dad 2651#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2652
07be1b83
YO
2653#ifdef DEBUGGING
2654STATIC void
2655S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2656{
2657 const int docolor = *PL_colors[0];
2658 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2659 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2660 /* The part of the string before starttry has one color
2661 (pref0_len chars), between starttry and current
2662 position another one (pref_len - pref0_len chars),
2663 after the current position the third one.
2664 We assume that pref0_len <= pref_len, otherwise we
2665 decrease pref0_len. */
2666 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2667 ? (5 + taill) - l : locinput - PL_bostr;
2668 int pref0_len;
2669
2670 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2671 pref_len++;
2672 pref0_len = pref_len - (locinput - PL_reg_starttry);
2673 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2674 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2675 ? (5 + taill) - pref_len : PL_regeol - locinput);
2676 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2677 l--;
2678 if (pref0_len < 0)
2679 pref0_len = 0;
2680 if (pref0_len > pref_len)
2681 pref0_len = pref_len;
2682 {
2683 const char * const s0 =
2684 do_utf8 && OP(scan) != CANY ?
2685 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2686 pref0_len, 60, UNI_DISPLAY_REGEX) :
2687 locinput - pref_len;
2688 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2689 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2690 pv_uni_display(PERL_DEBUG_PAD(1),
2691 (U8*)(locinput - pref_len + pref0_len),
2692 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2693 locinput - pref_len + pref0_len;
2694 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2695 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2696 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2697 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2698 locinput;
2699 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2700 PerlIO_printf(Perl_debug_log,
2701 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2702 (IV)(locinput - PL_bostr),
2703 PL_colors[4],
2704 len0, s0,
2705 PL_colors[5],
2706 PL_colors[2],
2707 len1, s1,
2708 PL_colors[3],
2709 (docolor ? "" : "> <"),
2710 PL_colors[0],
2711 len2, s2,
2712 PL_colors[1],
2713 15 - l - pref_len + 1,
2714 "");
2715 }
2716}
2717#endif
2718
d6a28714 2719STATIC I32 /* 0 failure, 1 success */
3b0527fe 2720S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2721{
54df2634
NC
2722#if PERL_VERSION < 9
2723 dMY_CXT;
2724#endif
27da23d5 2725 dVAR;
95b24440 2726 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2727 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2728
3b0527fe
DM
2729 regexp *rex = reginfo->prog;
2730
5d9a96ca
DM
2731 regmatch_slab *orig_slab;
2732 regmatch_state *orig_state;
a3621e74 2733
5d9a96ca
DM
2734 /* the current state. This is a cached copy of PL_regmatch_state */
2735 register regmatch_state *st;
95b24440 2736
5d9a96ca
DM
2737 /* cache heavy used fields of st in registers */
2738 register regnode *scan;
2739 register regnode *next;
2740 register I32 n = 0; /* initialize to shut up compiler warning */
2741 register char *locinput = PL_reginput;
95b24440 2742
5d9a96ca
DM
2743 /* these variables are NOT saved during a recusive RFEGMATCH: */
2744 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2745 bool result; /* return value of S_regmatch */
2746 regnode *inner; /* Next node in internal branch. */
2747 int depth = 0; /* depth of recursion */
aa283a38 2748 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2749 regmatch_state *yes_state = NULL; /* state to pop to on success of
2750 subpattern */
95b24440
DM
2751
2752#ifdef DEBUGGING
e68ec53f 2753 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2754 PL_regindent++;
2755#endif
2756
5d9a96ca
DM
2757 /* on first ever call to regmatch, allocate first slab */
2758 if (!PL_regmatch_slab) {
2759 Newx(PL_regmatch_slab, 1, regmatch_slab);
2760 PL_regmatch_slab->prev = NULL;
2761 PL_regmatch_slab->next = NULL;
86545054 2762 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2763 }
2764
2765 /* remember current high-water mark for exit */
2766 /* XXX this should be done with SAVE* instead */
2767 orig_slab = PL_regmatch_slab;
2768 orig_state = PL_regmatch_state;
2769
2770 /* grab next free state slot */
2771 st = ++PL_regmatch_state;
86545054 2772 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2773 st = PL_regmatch_state = S_push_slab(aTHX);
2774
2775 st->minmod = 0;
2776 st->sw = 0;
2777 st->logical = 0;
2778 st->unwind = 0;
2779 st->cc = NULL;
d6a28714
JH
2780 /* Note that nextchr is a byte even in UTF */
2781 nextchr = UCHARAT(locinput);
2782 scan = prog;
2783 while (scan != NULL) {
8ba1375e 2784
a3621e74 2785 DEBUG_EXECUTE_r( {
6136c704 2786 SV * const prop = sv_newmortal();
07be1b83 2787 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2788 regprop(rex, prop, scan);
07be1b83
YO
2789
2790 PerlIO_printf(Perl_debug_log,
2791 "%3"IVdf":%*s%s(%"IVdf")\n",
2792 (IV)(scan - rex->program), PL_regindent*2, "",
2793 SvPVX_const(prop),
2794 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2795 });
d6a28714
JH
2796
2797 next = scan + NEXT_OFF(scan);
2798 if (next == scan)
2799 next = NULL;
2800
2801 switch (OP(scan)) {
2802 case BOL:
7fba1cd6 2803 if (locinput == PL_bostr)
d6a28714 2804 {
3b0527fe 2805 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2806 break;
2807 }
d6a28714
JH
2808 sayNO;
2809 case MBOL:
12d33761
HS
2810 if (locinput == PL_bostr ||
2811 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2812 {
b8c5462f
JH
2813 break;
2814 }
d6a28714
JH
2815 sayNO;
2816 case SBOL:
c2a73568 2817 if (locinput == PL_bostr)
b8c5462f 2818 break;
d6a28714
JH
2819 sayNO;
2820 case GPOS:
3b0527fe 2821 if (locinput == reginfo->ganch)
d6a28714
JH
2822 break;
2823 sayNO;
2824 case EOL:
d6a28714
JH
2825 goto seol;
2826 case MEOL:
d6a28714 2827 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2828 sayNO;
b8c5462f 2829 break;
d6a28714
JH
2830 case SEOL:
2831 seol:
2832 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2833 sayNO;
d6a28714 2834 if (PL_regeol - locinput > 1)
b8c5462f 2835 sayNO;
b8c5462f 2836 break;
d6a28714
JH
2837 case EOS:
2838 if (PL_regeol != locinput)
b8c5462f 2839 sayNO;
d6a28714 2840 break;
ffc61ed2 2841 case SANY:
d6a28714 2842 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2843 sayNO;
f33976b4
DB
2844 if (do_utf8) {
2845 locinput += PL_utf8skip[nextchr];
2846 if (locinput > PL_regeol)
2847 sayNO;
2848 nextchr = UCHARAT(locinput);
2849 }
2850 else
2851 nextchr = UCHARAT(++locinput);
2852 break;
2853 case CANY:
2854 if (!nextchr && locinput >= PL_regeol)
2855 sayNO;
b8c5462f 2856 nextchr = UCHARAT(++locinput);
a0d0e21e 2857 break;
ffc61ed2 2858 case REG_ANY:
1aa99e6b
IH
2859 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2860 sayNO;
2861 if (do_utf8) {
b8c5462f 2862 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2863 if (locinput > PL_regeol)
2864 sayNO;
a0ed51b3 2865 nextchr = UCHARAT(locinput);
a0ed51b3 2866 }
1aa99e6b
IH
2867 else
2868 nextchr = UCHARAT(++locinput);
a0ed51b3 2869 break;
5b47454d 2870 case TRIE:
3dab1dad 2871 {
07be1b83 2872 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2873 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2874 trie_type = do_utf8 ?
2875 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2876 : trie_plain;
2877
2878 /* what trie are we using right now */
be8e71aa 2879 reg_trie_data * const trie
3dab1dad
YO
2880 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2881 U32 state = trie->startstate;
2882
2883 if (trie->bitmap && trie_type != trie_utf8_fold &&
2884 !TRIE_BITMAP_TEST(trie,*locinput)
2885 ) {
2886 if (trie->states[ state ].wordnum) {
2887 DEBUG_EXECUTE_r(
2888 PerlIO_printf(Perl_debug_log,
2889 "%*s %smatched empty string...%s\n",
2890 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2891 );
2892 break;
2893 } else {
2894 DEBUG_EXECUTE_r(
2895 PerlIO_printf(Perl_debug_log,
2896 "%*s %sfailed to match start class...%s\n",
2897 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2898 );
2899 sayNO_SILENT;
2900 }
2901 }
a3621e74 2902 {
07be1b83
YO
2903 /*
2904 traverse the TRIE keeping track of all accepting states
2905 we transition through until we get to a failing node.
2906 */
2907
a3621e74 2908 U8 *uc = ( U8* )locinput;
a3621e74
YO
2909 U16 charid = 0;
2910 U32 base = 0;
2911 UV uvc = 0;
2912 STRLEN len = 0;
2913 STRLEN foldlen = 0;
a3621e74
YO
2914 U8 *uscan = (U8*)NULL;
2915 STRLEN bufflen=0;
95b24440 2916 SV *sv_accept_buff = NULL;
5b47454d 2917
d8319b27 2918 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2919 result = 0;
a3621e74
YO
2920
2921 while ( state && uc <= (U8*)PL_regeol ) {
2922
5b47454d 2923 if (trie->states[ state ].wordnum) {
d8319b27 2924 if (!st->u.trie.accepted ) {
5b47454d
DM
2925 ENTER;
2926 SAVETMPS;
2927 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2928 sv_accept_buff=newSV(bufflen *
2929 sizeof(reg_trie_accepted) - 1);
2930 SvCUR_set(sv_accept_buff,
2931 sizeof(reg_trie_accepted));
2932 SvPOK_on(sv_accept_buff);
2933 sv_2mortal(sv_accept_buff);
d8319b27 2934 st->u.trie.accept_buff =
5b47454d
DM
2935 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2936 }
2937 else {
d8319b27 2938 if (st->u.trie.accepted >= bufflen) {
5b47454d 2939 bufflen *= 2;
d8319b27 2940 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2941 SvGROW(sv_accept_buff,
2942 bufflen * sizeof(reg_trie_accepted));
2943 }
2944 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2945 + sizeof(reg_trie_accepted));
2946 }
d8319b27
DM
2947 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2948 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2949 ++st->u.trie.accepted;
5b47454d 2950 }
a3621e74
YO
2951
2952 base = trie->states[ state ].trans.base;
2953
07be1b83
YO
2954 DEBUG_TRIE_EXECUTE_r({
2955 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2956 PerlIO_printf( Perl_debug_log,
e4584336 2957 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2958 2+PL_regindent * 2, "", PL_colors[4],
d8319b27 2959 (UV)state, (UV)base, (UV)st->u.trie.accepted );
07be1b83 2960 });
a3621e74
YO
2961
2962 if ( base ) {
5b47454d 2963 switch (trie_type) {
3dab1dad 2964 case trie_utf8_fold:
a3621e74
YO
2965 if ( foldlen>0 ) {
2966 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2967 foldlen -= len;
2968 uscan += len;
2969 len=0;
2970 } else {
1df70142 2971 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2972 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2973 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2974 foldlen -= UNISKIP( uvc );
2975 uscan = foldbuf + UNISKIP( uvc );
2976 }
5b47454d
DM
2977 break;
2978 case trie_utf8:
2979 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2980 &len, uniflags );
2981 break;
2982 case trie_plain:
e4584336 2983 uvc = (UV)*uc;
a3621e74
YO
2984 len = 1;
2985 }
2986
5b47454d
DM
2987 if (uvc < 256) {
2988 charid = trie->charmap[ uvc ];
2989 }
2990 else {
2991 charid = 0;
2992 if (trie->widecharmap) {
3dab1dad 2993 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2994 (char*)&uvc, sizeof(UV), 0);
2995 if (svpp)
2996 charid = (U16)SvIV(*svpp);
2997 }
2998 }
a3621e74 2999
5b47454d
DM
3000 if (charid &&
3001 (base + charid > trie->uniquecharcount )
3002 && (base + charid - 1 - trie->uniquecharcount
3003 < trie->lasttrans)
3004 && trie->trans[base + charid - 1 -
3005 trie->uniquecharcount].check == state)
3006 {
3007 state = trie->trans[base + charid - 1 -
3008 trie->uniquecharcount ].next;
3009 }
3010 else {
3011 state = 0;
3012 }
3013 uc += len;
3014
3015 }
3016 else {
a3621e74
YO
3017 state = 0;
3018 }
3019 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3020 PerlIO_printf( Perl_debug_log,
3021 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3022 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3023 );
3024 }
d8319b27 3025 if (!st->u.trie.accepted )
a3621e74 3026 sayNO;
a3621e74
YO
3027
3028 /*
3029 There was at least one accepting state that we
3030 transitioned through. Presumably the number of accepting
3031 states is going to be low, typically one or two. So we
3032 simply scan through to find the one with lowest wordnum.
3033 Once we find it, we swap the last state into its place
3034 and decrement the size. We then try to match the rest of
3035 the pattern at the point where the word ends, if we
3036 succeed then we end the loop, otherwise the loop
3037 eventually terminates once all of the accepting states
3038 have been tried.
3039 */
a3621e74 3040
d8319b27 3041 if ( st->u.trie.accepted == 1 ) {
f2278c82 3042 DEBUG_EXECUTE_r({
be8e71aa 3043 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3044 ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
3045 : NULL;
3046 PerlIO_printf( Perl_debug_log,
3047 "%*s %sonly one match : #%d <%s>%s\n",
3048 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3049 st->u.trie.accept_buff[ 0 ].wordnum,
3050 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3051 PL_colors[5] );
3052 });
d8319b27 3053 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
3054 /* in this case we free tmps/leave before we call regmatch
3055 as we wont be using accept_buff again. */
3056 FREETMPS;
3057 LEAVE;
07be1b83 3058 /* do we need this? why dont we just do a break? */
95b24440
DM
3059 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
3060 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3061 } else {
3062 DEBUG_EXECUTE_r(
e4584336 3063 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 3064 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
3065 PL_colors[5] );
3066 );
d8319b27 3067 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
3068 U32 best = 0;
3069 U32 cur;
d8319b27 3070 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
3071 DEBUG_TRIE_EXECUTE_r(
3072 PerlIO_printf( Perl_debug_log,
3073 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3074 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
3075 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
3076 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 3077 );
a3621e74 3078
d8319b27
DM
3079 if (st->u.trie.accept_buff[cur].wordnum <
3080 st->u.trie.accept_buff[best].wordnum)
e822a8b4 3081 best = cur;
a3621e74 3082 }
f2278c82
YO
3083 DEBUG_EXECUTE_r({
3084 reg_trie_data * const trie = (reg_trie_data*)
3085 rex->data->data[ARG(scan)];
be8e71aa 3086 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3087 ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
3088 : NULL;
3089 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3090 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3091 st->u.trie.accept_buff[best].wordnum,
3092 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3093 PL_colors[5] );
3094 });
d8319b27
DM
3095 if ( best<st->u.trie.accepted ) {
3096 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
3097 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
3098 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
3099 best = st->u.trie.accepted;
a3621e74 3100 }
d8319b27 3101 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
3102
3103 /*
3104 as far as I can tell we only need the SAVETMPS/FREETMPS
3105 for re's with EVAL in them but I'm leaving them in for
3106 all until I can be sure.
3107 */
3108 SAVETMPS;
95b24440
DM
3109 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
3110 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3111 FREETMPS;
3112 }
3113 FREETMPS;
3114 LEAVE;
3115 }
3116
95b24440 3117 if (result) {
a3621e74
YO
3118 sayYES;
3119 } else {
3120 sayNO;
3121 }
3dab1dad 3122 }}
a3621e74 3123 /* unreached codepoint */
95b24440
DM
3124 case EXACT: {
3125 char *s = STRING(scan);
5d9a96ca 3126 st->ln = STR_LEN(scan);
eb160463 3127 if (do_utf8 != UTF) {
bc517b45 3128 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3129 char *l = locinput;
be8e71aa 3130 const char * const e = s + st->ln;
a72c7584 3131
5ff6fc6d
JH
3132 if (do_utf8) {
3133 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3134 while (s < e) {
a3b680e6 3135 STRLEN ulen;
1aa99e6b 3136 if (l >= PL_regeol)
5ff6fc6d
JH
3137 sayNO;
3138 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3139 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3140 uniflags))
5ff6fc6d 3141 sayNO;
bc517b45 3142 l += ulen;
5ff6fc6d 3143 s ++;
1aa99e6b 3144 }
5ff6fc6d
JH
3145 }
3146 else {
3147 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3148 while (s < e) {
a3b680e6 3149 STRLEN ulen;
1aa99e6b
IH
3150 if (l >= PL_regeol)
3151 sayNO;
5ff6fc6d 3152 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3153 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3154 uniflags))
1aa99e6b 3155 sayNO;
bc517b45 3156 s += ulen;
a72c7584 3157 l ++;
1aa99e6b 3158 }
5ff6fc6d 3159 }
1aa99e6b
IH
3160 locinput = l;
3161 nextchr = UCHARAT(locinput);
3162 break;
3163 }
bc517b45 3164 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3165 /* Inline the first character, for speed. */
3166 if (UCHARAT(s) != nextchr)
3167 sayNO;
5d9a96ca 3168 if (PL_regeol - locinput < st->ln)
d6a28714 3169 sayNO;
5d9a96ca 3170 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3171 sayNO;
5d9a96ca 3172 locinput += st->ln;
d6a28714
JH
3173 nextchr = UCHARAT(locinput);
3174 break;
95b24440 3175 }
d6a28714 3176 case EXACTFL:
b8c5462f
JH
3177 PL_reg_flags |= RF_tainted;
3178 /* FALL THROUGH */
95b24440 3179 case EXACTF: {
be8e71aa 3180 char * const s = STRING(scan);
5d9a96ca 3181 st->ln = STR_LEN(scan);
d6a28714 3182
d07ddd77
JH
3183 if (do_utf8 || UTF) {
3184 /* Either target or the pattern are utf8. */
be8e71aa 3185 const char * const l = locinput;
d07ddd77 3186 char *e = PL_regeol;
bc517b45 3187
5d9a96ca 3188 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3189 l, &e, 0, do_utf8)) {
5486206c
JH
3190 /* One more case for the sharp s:
3191 * pack("U0U*", 0xDF) =~ /ss/i,
3192 * the 0xC3 0x9F are the UTF-8
3193 * byte sequence for the U+00DF. */
3194 if (!(do_utf8 &&
3195 toLOWER(s[0]) == 's' &&
5d9a96ca 3196 st->ln >= 2 &&
5486206c
JH
3197 toLOWER(s[1]) == 's' &&
3198 (U8)l[0] == 0xC3 &&
3199 e - l >= 2 &&
3200 (U8)l[1] == 0x9F))
3201 sayNO;
3202 }
d07ddd77
JH
3203 locinput = e;
3204 nextchr = UCHARAT(locinput);
3205 break;
a0ed51b3 3206 }
d6a28714 3207
bc517b45
JH
3208 /* Neither the target and the pattern are utf8. */
3209
d6a28714
JH
3210 /* Inline the first character, for speed. */
3211 if (UCHARAT(s) != nextchr &&
3212 UCHARAT(s) != ((OP(scan) == EXACTF)
3213 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3214 sayNO;
5d9a96ca 3215 if (PL_regeol - locinput < st->ln)
b8c5462f 3216 sayNO;
5d9a96ca
DM
3217 if (st->ln > 1 && (OP(scan) == EXACTF
3218 ? ibcmp(s, locinput, st->ln)
3219 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3220 sayNO;
5d9a96ca 3221 locinput += st->ln;
d6a28714 3222 nextchr = UCHARAT(locinput);
a0d0e21e 3223 break;
95b24440 3224 }
d6a28714 3225 case ANYOF:
ffc61ed2 3226 if (do_utf8) {
9e55ce06
JH
3227 STRLEN inclasslen = PL_regeol - locinput;
3228
32fc9b6a 3229 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3230 sayNO_ANYOF;
ffc61ed2
JH
3231 if (locinput >= PL_regeol)
3232 sayNO;
0f0076b4 3233 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3234 nextchr = UCHARAT(locinput);
e0f9d4a8 3235 break;
ffc61ed2
JH
3236 }
3237 else {
3238 if (nextchr < 0)
3239 nextchr = UCHARAT(locinput);
32fc9b6a 3240 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3241 sayNO_ANYOF;
ffc61ed2
JH
3242 if (!nextchr && locinput >= PL_regeol)
3243 sayNO;
3244 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3245 break;
3246 }
3247 no_anyof:
3248 /* If we might have the case of the German sharp s
3249 * in a casefolding Unicode character class. */
3250
ebc501f0
JH
3251 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3252 locinput += SHARP_S_SKIP;
e0f9d4a8 3253 nextchr = UCHARAT(locinput);
ffc61ed2 3254 }
e0f9d4a8
JH
3255 else
3256 sayNO;
b8c5462f 3257 break;
d6a28714 3258 case ALNUML:
b8c5462f
JH
3259 PL_reg_flags |= RF_tainted;
3260 /* FALL THROUGH */
d6a28714 3261 case ALNUM:
b8c5462f 3262 if (!nextchr)
4633a7c4 3263 sayNO;
ffc61ed2 3264 if (do_utf8) {
1a4fad37 3265 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3266 if (!(OP(scan) == ALNUM
bb7a0f54 3267 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3268 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3269 {
3270 sayNO;
a0ed51b3 3271 }
b8c5462f 3272 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3273 nextchr = UCHARAT(locinput);
3274 break;
3275 }
ffc61ed2 3276 if (!(OP(scan) == ALNUM
d6a28714 3277 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3278 sayNO;
b8c5462f 3279 nextchr = UCHARAT(++locinput);
a0d0e21e 3280 break;
d6a28714 3281 case NALNUML:
b8c5462f
JH
3282 PL_reg_flags |= RF_tainted;
3283 /* FALL THROUGH */
d6a28714
JH
3284 case NALNUM:
3285 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3286 sayNO;
ffc61ed2 3287 if (do_utf8) {
1a4fad37 3288 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3289 if (OP(scan) == NALNUM
bb7a0f54 3290 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3291 : isALNUM_LC_utf8((U8*)locinput))
3292 {
b8c5462f 3293 sayNO;
d6a28714 3294 }
b8c5462f
JH
3295 locinput += PL_utf8skip[nextchr];
3296 nextchr = UCHARAT(locinput);
3297 break;
3298 }
ffc61ed2 3299 if (OP(scan) == NALNUM
d6a28714 3300 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3301 sayNO;
76e3520e 3302 nextchr = UCHARAT(++locinput);
a0d0e21e 3303 break;
d6a28714
JH
3304 case BOUNDL:
3305 case NBOUNDL:
3280af22 3306 PL_reg_flags |= RF_tainted;
bbce6d69 3307 /* FALL THROUGH */
d6a28714
JH
3308 case BOUND:
3309 case NBOUND:
3310 /* was last char in word? */
ffc61ed2 3311 if (do_utf8) {
12d33761 3312 if (locinput == PL_bostr)
5d9a96ca 3313 st->ln = '\n';
ffc61ed2 3314 else {
a3b680e6 3315 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3316
4ad0818d 3317 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3318 }
3319 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3320 st->ln = isALNUM_uni(st->ln);
1a4fad37 3321 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3322 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3323 }
3324 else {
5d9a96ca 3325 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3326 n = isALNUM_LC_utf8((U8*)locinput);
3327 }
a0ed51b3 3328 }
d6a28714 3329 else {
5d9a96ca 3330 st->ln = (locinput != PL_bostr) ?
12d33761 3331 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3332 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3333 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3334 n = isALNUM(nextchr);
3335 }
3336 else {
5d9a96ca 3337 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3338 n = isALNUM_LC(nextchr);
3339 }
d6a28714 3340 }
5d9a96ca 3341 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3342 OP(scan) == BOUNDL))
3343 sayNO;
a0ed51b3 3344 break;
d6a28714 3345 case SPACEL:
3280af22 3346 PL_reg_flags |= RF_tainted;
bbce6d69 3347 /* FALL THROUGH */
d6a28714 3348 case SPACE:
9442cb0e 3349 if (!nextchr)
4633a7c4 3350 sayNO;
1aa99e6b 3351 if (do_utf8) {
fd400ab9 3352 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3353 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3354 if (!(OP(scan) == SPACE
bb7a0f54 3355 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3356 : isSPACE_LC_utf8((U8*)locinput)))
3357 {
3358 sayNO;
3359 }
3360 locinput += PL_utf8skip[nextchr];
3361 nextchr = UCHARAT(locinput);
3362 break;
d6a28714 3363 }
ffc61ed2
JH
3364 if (!(OP(scan) == SPACE
3365 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3366 sayNO;
3367 nextchr = UCHARAT(++locinput);
3368 }
3369 else {
3370 if (!(OP(scan) == SPACE
3371 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3372 sayNO;
3373 nextchr = UCHARAT(++locinput);
a0ed51b3 3374 }
a0ed51b3 3375 break;
d6a28714 3376 case NSPACEL:
3280af22 3377 PL_reg_flags |= RF_tainted;
bbce6d69 3378 /* FALL THROUGH */
d6a28714 3379 case NSPACE:
9442cb0e 3380 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3381 sayNO;
1aa99e6b 3382 if (do_utf8) {
1a4fad37 3383 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3384 if (OP(scan) == NSPACE
bb7a0f54 3385 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3386 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3387 {
3388 sayNO;
3389 }
3390 locinput += PL_utf8skip[nextchr];
3391 nextchr = UCHARAT(locinput);
3392 break;
a0ed51b3 3393 }
ffc61ed2 3394 if (OP(scan) == NSPACE
d6a28714 3395 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3396 sayNO;
76e3520e 3397 nextchr = UCHARAT(++locinput);
a0d0e21e 3398 break;
d6a28714 3399 case DIGITL:
a0ed51b3
LW
3400 PL_reg_flags |= RF_tainted;
3401 /* FALL THROUGH */
d6a28714 3402 case DIGIT:
9442cb0e 3403 if (!nextchr)
a0ed51b3 3404 sayNO;
1aa99e6b 3405 if (do_utf8) {
1a4fad37 3406 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3407 if (!(OP(scan) == DIGIT
bb7a0f54 3408 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3409 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3410 {
a0ed51b3 3411 sayNO;
dfe13c55 3412 }
6f06b55f 3413 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3414 nextchr = UCHARAT(locinput);
3415 break;
3416 }
ffc61ed2 3417 if (!(OP(scan) == DIGIT
9442cb0e 3418 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3419 sayNO;
3420 nextchr = UCHARAT(++locinput);
3421 break;
d6a28714 3422 case NDIGITL:
b8c5462f
JH
3423 PL_reg_flags |= RF_tainted;
3424 /* FALL THROUGH */
d6a28714 3425 case NDIGIT:
9442cb0e 3426 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3427 sayNO;