This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
I thought I had fixed the spelling of DOES() in universal.t, but
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
163 int p;
164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
175 SSPUSHINT(p);
176 }
b1ce53c5 177/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
178 SSPUSHINT(PL_regsize);
179 SSPUSHINT(*PL_reglastparen);
a01268b5 180 SSPUSHINT(*PL_reglastcloseparen);
3280af22 181 SSPUSHPTR(PL_reginput);
41123dfd
JH
182#define REGCP_FRAME_ELEMS 2
183/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
184 * are needed for the regexp context stack bookkeeping. */
185 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 186 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 187
a0d0e21e
LW
188 return retval;
189}
190
c277df42 191/* These are needed since we do not localize EVAL nodes: */
a3621e74 192# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 193 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 194 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 195
a3621e74 196# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 197 PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 199 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 200
76e3520e 201STATIC char *
097eb12c 202S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 203{
97aff369 204 dVAR;
b1ce53c5 205 I32 i;
a0d0e21e 206 char *input;
b1ce53c5 207
a3621e74
YO
208 GET_RE_DEBUG_FLAGS_DECL;
209
b1ce53c5 210 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 211 i = SSPOPINT;
b1ce53c5
JH
212 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
213 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 214 input = (char *) SSPOPPTR;
a01268b5 215 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
216 *PL_reglastparen = SSPOPINT;
217 PL_regsize = SSPOPINT;
b1ce53c5
JH
218
219 /* Now restore the parentheses context. */
41123dfd
JH
220 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
221 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 222 I32 tmps;
097eb12c 223 U32 paren = (U32)SSPOPINT;
3280af22 224 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
225 PL_regstartp[paren] = SSPOPINT;
226 tmps = SSPOPINT;
3280af22
NIS
227 if (paren <= *PL_reglastparen)
228 PL_regendp[paren] = tmps;
a3621e74 229 DEBUG_EXECUTE_r(
c3464db5 230 PerlIO_printf(Perl_debug_log,
b900a521 231 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 232 (UV)paren, (IV)PL_regstartp[paren],
b900a521 233 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 234 (IV)PL_regendp[paren],
3280af22 235 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 236 );
a0d0e21e 237 }
a3621e74 238 DEBUG_EXECUTE_r(
bb7a0f54 239 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 240 PerlIO_printf(Perl_debug_log,
faccc32b 241 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 242 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
243 }
244 );
daf18116 245#if 1
dafc8851
JH
246 /* It would seem that the similar code in regtry()
247 * already takes care of this, and in fact it is in
248 * a better location to since this code can #if 0-ed out
249 * but the code in regtry() is needed or otherwise tests
250 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
251 * (as of patchlevel 7877) will fail. Then again,
252 * this code seems to be necessary or otherwise
253 * building DynaLoader will fail:
254 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
255 * --jhi */
bb7a0f54 256 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
257 if (i > PL_regsize)
258 PL_regstartp[i] = -1;
259 PL_regendp[i] = -1;
a0d0e21e 260 }
dafc8851 261#endif
a0d0e21e
LW
262 return input;
263}
264
02db2b7b 265#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 266
95b24440 267#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
268 if (paren) { \
269 if (n) { \
270 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
271 PL_regendp[paren] = input - PL_bostr; \
272 } \
273 else \
274 PL_regendp[paren] = -1; \
275 } \
95b24440
DM
276 REGMATCH(next, where); \
277 if (result) \
29d1e993
HS
278 sayYES; \
279 if (paren && n) \
280 PL_regendp[paren] = -1; \
281}
282
283
a687059c 284/*
e50aee73 285 * pregexec and friends
a687059c
LW
286 */
287
76234dfb 288#ifndef PERL_IN_XSUB_RE
a687059c 289/*
c277df42 290 - pregexec - match a regexp against a string
a687059c 291 */
c277df42 292I32
864dbfa3 293Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 294 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
295/* strend: pointer to null at end of string */
296/* strbeg: real beginning of string */
297/* minend: end of match must be >=minend after stringarg. */
298/* nosave: For optimizations. */
299{
300 return
9041c2e3 301 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
302 nosave ? 0 : REXEC_COPY_STR);
303}
76234dfb 304#endif
22e551b9 305
9041c2e3 306/*
cad2e5aa
JH
307 * Need to implement the following flags for reg_anch:
308 *
309 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
310 * USE_INTUIT_ML
311 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
312 * INTUIT_AUTORITATIVE_ML
313 * INTUIT_ONCE_NOML - Intuit can match in one location only.
314 * INTUIT_ONCE_ML
315 *
316 * Another flag for this function: SECOND_TIME (so that float substrs
317 * with giant delta may be not rechecked).
318 */
319
320/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
321
3f7c398e 322/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
323 Otherwise, only SvCUR(sv) is used to get strbeg. */
324
325/* XXXX We assume that strpos is strbeg unless sv. */
326
6eb5f6b9
JH
327/* XXXX Some places assume that there is a fixed substring.
328 An update may be needed if optimizer marks as "INTUITable"
329 RExen without fixed substrings. Similarly, it is assumed that
330 lengths of all the strings are no more than minlen, thus they
331 cannot come from lookahead.
332 (Or minlen should take into account lookahead.) */
333
2c2d71f5
JH
334/* A failure to find a constant substring means that there is no need to make
335 an expensive call to REx engine, thus we celebrate a failure. Similarly,
336 finding a substring too deep into the string means that less calls to
30944b6d
IZ
337 regtry() should be needed.
338
339 REx compiler's optimizer found 4 possible hints:
340 a) Anchored substring;
341 b) Fixed substring;
342 c) Whether we are anchored (beginning-of-line or \G);
343 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 344 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
345 string which does not contradict any of them.
346 */
2c2d71f5 347
6eb5f6b9
JH
348/* Most of decisions we do here should have been done at compile time.
349 The nodes of the REx which we used for the search should have been
350 deleted from the finite automaton. */
351
cad2e5aa
JH
352char *
353Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
354 char *strend, U32 flags, re_scream_pos_data *data)
355{
97aff369 356 dVAR;
b7953727 357 register I32 start_shift = 0;
cad2e5aa 358 /* Should be nonnegative! */
b7953727 359 register I32 end_shift = 0;
2c2d71f5
JH
360 register char *s;
361 register SV *check;
a1933d95 362 char *strbeg;
cad2e5aa 363 char *t;
a3b680e6 364 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 365 I32 ml_anch;
bd61b366
SS
366 register char *other_last = NULL; /* other substr checked before this */
367 char *check_at = NULL; /* check substr found at this pos */
1df70142 368 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 369#ifdef DEBUGGING
890ce7af
AL
370 const char * const i_strpos = strpos;
371 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 372#endif
a3621e74
YO
373
374 GET_RE_DEBUG_FLAGS_DECL;
375
a30b2f1f 376 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 377
b8d68ded 378 if (prog->reganch & ROPT_UTF8) {
a3621e74 379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
380 "UTF-8 regex...\n"));
381 PL_reg_flags |= RF_utf8;
382 }
383
a3621e74 384 DEBUG_EXECUTE_r({
1df70142 385 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
386 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
387 strpos;
1df70142 388 const int len = PL_reg_match_utf8 ?
bb7a0f54 389 (int)strlen(s) : strend - strpos;
2a782b5b
JH
390 if (!PL_colorset)
391 reginitcolors();
b8d68ded 392 if (PL_reg_match_utf8)
a3621e74 393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 394 "UTF-8 target...\n"));
2a782b5b 395 PerlIO_printf(Perl_debug_log,
a0288114 396 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 397 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
398 prog->precomp,
399 PL_colors[1],
400 (strlen(prog->precomp) > 60 ? "..." : ""),
401 PL_colors[0],
402 (int)(len > 60 ? 60 : len),
403 s, PL_colors[1],
404 (len > 60 ? "..." : "")
405 );
406 });
cad2e5aa 407
c344f387
JH
408 /* CHR_DIST() would be more correct here but it makes things slow. */
409 if (prog->minlen > strend - strpos) {
a3621e74 410 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 411 "String too short... [re_intuit_start]\n"));
cad2e5aa 412 goto fail;
2c2d71f5 413 }
a1933d95 414 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 415 PL_regeol = strend;
33b8afdf
JH
416 if (do_utf8) {
417 if (!prog->check_utf8 && prog->check_substr)
418 to_utf8_substr(prog);
419 check = prog->check_utf8;
420 } else {
421 if (!prog->check_substr && prog->check_utf8)
422 to_byte_substr(prog);
423 check = prog->check_substr;
424 }
425 if (check == &PL_sv_undef) {
a3621e74 426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
427 "Non-utf string cannot match utf check string\n"));
428 goto fail;
429 }
2c2d71f5 430 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
431 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
432 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 433 && !multiline ) ); /* Check after \n? */
cad2e5aa 434
7e25d62c
JH
435 if (!ml_anch) {
436 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
437 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 438 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
439 && sv && !SvROK(sv)
440 && (strpos != strbeg)) {
a3621e74 441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
442 goto fail;
443 }
444 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 445 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 446 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
447 I32 slen;
448
1aa99e6b 449 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
450 if (SvTAIL(check)) {
451 slen = SvCUR(check); /* >= 1 */
cad2e5aa 452
9041c2e3 453 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 454 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 456 goto fail_finish;
cad2e5aa
JH
457 }
458 /* Now should match s[0..slen-2] */
459 slen--;
3f7c398e 460 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 461 || (slen > 1
3f7c398e 462 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 463 report_neq:
a3621e74 464 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
465 goto fail_finish;
466 }
cad2e5aa 467 }
3f7c398e 468 else if (*SvPVX_const(check) != *s
653099ff 469 || ((slen = SvCUR(check)) > 1
3f7c398e 470 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 471 goto report_neq;
c315bfe8 472 check_at = s;
2c2d71f5 473 goto success_at_start;
7e25d62c 474 }
cad2e5aa 475 }
2c2d71f5 476 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 477 s = strpos;
2c2d71f5 478 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 479 end_shift = prog->minlen - start_shift -
653099ff 480 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 481 if (!ml_anch) {
a3b680e6 482 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 483 - (SvTAIL(check) != 0);
a3b680e6 484 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
485
486 if (end_shift < eshift)
487 end_shift = eshift;
488 }
cad2e5aa 489 }
2c2d71f5 490 else { /* Can match at random position */
cad2e5aa
JH
491 ml_anch = 0;
492 s = strpos;
2c2d71f5
JH
493 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
494 /* Should be nonnegative! */
495 end_shift = prog->minlen - start_shift -
653099ff 496 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
497 }
498
2c2d71f5 499#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 500 if (end_shift < 0)
6bbae5e6 501 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
502#endif
503
2c2d71f5
JH
504 restart:
505 /* Find a possible match in the region s..strend by looking for
506 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 507 if (flags & REXEC_SCREAM) {
cad2e5aa 508 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 509 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 510
2c2d71f5
JH
511 if (PL_screamfirst[BmRARE(check)] >= 0
512 || ( BmRARE(check) == '\n'
513 && (BmPREVIOUS(check) == SvCUR(check) - 1)
514 && SvTAIL(check) ))
9041c2e3 515 s = screaminstr(sv, check,
2c2d71f5 516 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 517 else
2c2d71f5 518 goto fail_finish;
4addbd3b
HS
519 /* we may be pointing at the wrong string */
520 if (s && RX_MATCH_COPIED(prog))
3f7c398e 521 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
522 if (data)
523 *data->scream_olds = s;
524 }
f33976b4 525 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
526 s = fbm_instr((U8*)(s + start_shift),
527 (U8*)(strend - end_shift),
7fba1cd6 528 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 529 else
1aa99e6b
IH
530 s = fbm_instr(HOP3(s, start_shift, strend),
531 HOP3(strend, -end_shift, strbeg),
7fba1cd6 532 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
533
534 /* Update the count-of-usability, remove useless subpatterns,
535 unshift s. */
2c2d71f5 536
a0288114 537 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 538 (s ? "Found" : "Did not find"),
33b8afdf 539 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 540 PL_colors[0],
7b0972df 541 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 542 SvPVX_const(check),
2c2d71f5
JH
543 PL_colors[1], (SvTAIL(check) ? "$" : ""),
544 (s ? " at offset " : "...\n") ) );
545
546 if (!s)
547 goto fail_finish;
548
6eb5f6b9
JH
549 check_at = s;
550
2c2d71f5 551 /* Finish the diagnostic message */
a3621e74 552 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
553
554 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
555 Start with the other substr.
556 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 557 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
558 *always* match. Probably should be marked during compile...
559 Probably it is right to do no SCREAM here...
560 */
561
33b8afdf 562 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 563 /* Take into account the "other" substring. */
2c2d71f5
JH
564 /* XXXX May be hopelessly wrong for UTF... */
565 if (!other_last)
6eb5f6b9 566 other_last = strpos;
33b8afdf 567 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
568 do_other_anchored:
569 {
890ce7af
AL
570 char * const last = HOP3c(s, -start_shift, strbeg);
571 char *last1, *last2;
be8e71aa 572 char * const saved_s = s;
33b8afdf 573 SV* must;
2c2d71f5 574
2c2d71f5
JH
575 t = s - prog->check_offset_max;
576 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 577 && (!do_utf8
0ce71af7 578 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 579 && t > strpos)))
6f207bd3 580 NOOP;
2c2d71f5
JH
581 else
582 t = strpos;
1aa99e6b 583 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
584 if (t < other_last) /* These positions already checked */
585 t = other_last;
1aa99e6b 586 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
587 if (last < last1)
588 last1 = last;
589 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
590 /* On end-of-str: see comment below. */
33b8afdf
JH
591 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
592 if (must == &PL_sv_undef) {
593 s = (char*)NULL;
a3621e74 594 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
595 }
596 else
597 s = fbm_instr(
598 (unsigned char*)t,
599 HOP3(HOP3(last1, prog->anchored_offset, strend)
600 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
601 must,
7fba1cd6 602 multiline ? FBMrf_MULTILINE : 0
33b8afdf 603 );
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 605 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
606 (s ? "Found" : "Contradicts"),
607 PL_colors[0],
33b8afdf
JH
608 (int)(SvCUR(must)
609 - (SvTAIL(must)!=0)),
3f7c398e 610 SvPVX_const(must),
33b8afdf 611 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
612 if (!s) {
613 if (last1 >= last2) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
615 ", giving up...\n"));
616 goto fail_finish;
617 }
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 619 ", trying floating at offset %ld...\n",
be8e71aa 620 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
621 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
622 s = HOP3c(last, 1, strend);
2c2d71f5
JH
623 goto restart;
624 }
625 else {
a3621e74 626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 627 (long)(s - i_strpos)));
1aa99e6b
IH
628 t = HOP3c(s, -prog->anchored_offset, strbeg);
629 other_last = HOP3c(s, 1, strend);
be8e71aa 630 s = saved_s;
2c2d71f5
JH
631 if (t == strpos)
632 goto try_at_start;
2c2d71f5
JH
633 goto try_at_offset;
634 }
30944b6d 635 }
2c2d71f5
JH
636 }
637 else { /* Take into account the floating substring. */
33b8afdf 638 char *last, *last1;
be8e71aa 639 char * const saved_s = s;
33b8afdf
JH
640 SV* must;
641
642 t = HOP3c(s, -start_shift, strbeg);
643 last1 = last =
644 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
645 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
646 last = HOP3c(t, prog->float_max_offset, strend);
647 s = HOP3c(t, prog->float_min_offset, strend);
648 if (s < other_last)
649 s = other_last;
2c2d71f5 650 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
651 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
652 /* fbm_instr() takes into account exact value of end-of-str
653 if the check is SvTAIL(ed). Since false positives are OK,
654 and end-of-str is not later than strend we are OK. */
655 if (must == &PL_sv_undef) {
656 s = (char*)NULL;
a3621e74 657 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
658 }
659 else
2c2d71f5 660 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
661 (unsigned char*)last + SvCUR(must)
662 - (SvTAIL(must)!=0),
7fba1cd6 663 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
665 (s ? "Found" : "Contradicts"),
666 PL_colors[0],
667 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 668 SvPVX_const(must),
33b8afdf
JH
669 PL_colors[1], (SvTAIL(must) ? "$" : "")));
670 if (!s) {
671 if (last1 == last) {
a3621e74 672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
673 ", giving up...\n"));
674 goto fail_finish;
2c2d71f5 675 }
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 677 ", trying anchored starting at offset %ld...\n",
be8e71aa 678 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
679 other_last = last;
680 s = HOP3c(t, 1, strend);
681 goto restart;
682 }
683 else {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
685 (long)(s - i_strpos)));
686 other_last = s; /* Fix this later. --Hugo */
be8e71aa 687 s = saved_s;
33b8afdf
JH
688 if (t == strpos)
689 goto try_at_start;
690 goto try_at_offset;
691 }
2c2d71f5 692 }
cad2e5aa 693 }
2c2d71f5
JH
694
695 t = s - prog->check_offset_max;
2c2d71f5 696 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 697 && (!do_utf8
0ce71af7 698 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 699 && t > strpos))) {
2c2d71f5
JH
700 /* Fixed substring is found far enough so that the match
701 cannot start at strpos. */
702 try_at_offset:
cad2e5aa 703 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
704 /* Eventually fbm_*() should handle this, but often
705 anchored_offset is not 0, so this check will not be wasted. */
706 /* XXXX In the code below we prefer to look for "^" even in
707 presence of anchored substrings. And we search even
708 beyond the found float position. These pessimizations
709 are historical artefacts only. */
710 find_anchor:
2c2d71f5 711 while (t < strend - prog->minlen) {
cad2e5aa 712 if (*t == '\n') {
4ee3650e 713 if (t < check_at - prog->check_offset_min) {
33b8afdf 714 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
715 /* Since we moved from the found position,
716 we definitely contradict the found anchored
30944b6d
IZ
717 substr. Due to the above check we do not
718 contradict "check" substr.
719 Thus we can arrive here only if check substr
720 is float. Redo checking for "other"=="fixed".
721 */
9041c2e3 722 strpos = t + 1;
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 724 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
725 goto do_other_anchored;
726 }
4ee3650e
GS
727 /* We don't contradict the found floating substring. */
728 /* XXXX Why not check for STCLASS? */
cad2e5aa 729 s = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
732 goto set_useful;
733 }
4ee3650e
GS
734 /* Position contradicts check-string */
735 /* XXXX probably better to look for check-string
736 than for "\n", so one should lower the limit for t? */
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 739 other_last = strpos = s = t + 1;
cad2e5aa
JH
740 goto restart;
741 }
742 t++;
743 }
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 745 PL_colors[0], PL_colors[1]));
2c2d71f5 746 goto fail_finish;
cad2e5aa 747 }
f5952150 748 else {
a3621e74 749 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 750 PL_colors[0], PL_colors[1]));
f5952150 751 }
cad2e5aa
JH
752 s = t;
753 set_useful:
33b8afdf 754 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
755 }
756 else {
f5952150 757 /* The found string does not prohibit matching at strpos,
2c2d71f5 758 - no optimization of calling REx engine can be performed,
f5952150
GS
759 unless it was an MBOL and we are not after MBOL,
760 or a future STCLASS check will fail this. */
2c2d71f5
JH
761 try_at_start:
762 /* Even in this situation we may use MBOL flag if strpos is offset
763 wrt the start of the string. */
05b4157f 764 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 765 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
766 /* May be due to an implicit anchor of m{.*foo} */
767 && !(prog->reganch & ROPT_IMPLICIT))
768 {
cad2e5aa
JH
769 t = strpos;
770 goto find_anchor;
771 }
a3621e74 772 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 773 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 774 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 775 );
2c2d71f5 776 success_at_start:
30944b6d 777 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
778 && (do_utf8 ? (
779 prog->check_utf8 /* Could be deleted already */
780 && --BmUSEFUL(prog->check_utf8) < 0
781 && (prog->check_utf8 == prog->float_utf8)
782 ) : (
783 prog->check_substr /* Could be deleted already */
784 && --BmUSEFUL(prog->check_substr) < 0
785 && (prog->check_substr == prog->float_substr)
786 )))
66e933ab 787 {
cad2e5aa 788 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
790 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
791 if (do_utf8 ? prog->check_substr : prog->check_utf8)
792 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
793 prog->check_substr = prog->check_utf8 = NULL; /* disable */
794 prog->float_substr = prog->float_utf8 = NULL; /* clear */
795 check = NULL; /* abort */
cad2e5aa 796 s = strpos;
3cf5c195
IZ
797 /* XXXX This is a remnant of the old implementation. It
798 looks wasteful, since now INTUIT can use many
6eb5f6b9 799 other heuristics. */
cad2e5aa
JH
800 prog->reganch &= ~RE_USE_INTUIT;
801 }
802 else
803 s = strpos;
804 }
805
6eb5f6b9
JH
806 /* Last resort... */
807 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 808 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
809 /* minlen == 0 is possible if regstclass is \b or \B,
810 and the fixed substr is ''$.
811 Since minlen is already taken into account, s+1 is before strend;
812 accidentally, minlen >= 1 guaranties no false positives at s + 1
813 even for \b or \B. But (minlen? 1 : 0) below assumes that
814 regstclass does not come from lookahead... */
815 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
816 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 817 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 818 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 819 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 820 : 1);
07be1b83 821 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 822 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 823 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
824 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
825 cl_l, strend)
826 : strend);
07be1b83
YO
827 /*if (OP(prog->regstclass) == TRIE)
828 endpos++;*/
6eb5f6b9 829 t = s;
3b0527fe 830 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
831 if (!s) {
832#ifdef DEBUGGING
cbbf8932 833 const char *what = NULL;
6eb5f6b9
JH
834#endif
835 if (endpos == strend) {
a3621e74 836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
837 "Could not match STCLASS...\n") );
838 goto fail;
839 }
a3621e74 840 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 841 "This position contradicts STCLASS...\n") );
653099ff
GS
842 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
843 goto fail;
6eb5f6b9 844 /* Contradict one of substrings */
33b8afdf
JH
845 if (prog->anchored_substr || prog->anchored_utf8) {
846 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 847 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 848 hop_and_restart:
1aa99e6b 849 s = HOP3c(t, 1, strend);
66e933ab
GS
850 if (s + start_shift + end_shift > strend) {
851 /* XXXX Should be taken into account earlier? */
a3621e74 852 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
853 "Could not match STCLASS...\n") );
854 goto fail;
855 }
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
860 what, (long)(s + start_shift - i_strpos)) );
861 goto restart;
862 }
66e933ab 863 /* Have both, check_string is floating */
6eb5f6b9
JH
864 if (t + start_shift >= check_at) /* Contradicts floating=check */
865 goto retry_floating_check;
866 /* Recheck anchored substring, but not floating... */
9041c2e3 867 s = check_at;
5e39e1e5
HS
868 if (!check)
869 goto giveup;
a3621e74 870 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 871 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
872 (long)(other_last - i_strpos)) );
873 goto do_other_anchored;
874 }
60e71179
GS
875 /* Another way we could have checked stclass at the
876 current position only: */
877 if (ml_anch) {
878 s = t = t + 1;
5e39e1e5
HS
879 if (!check)
880 goto giveup;
a3621e74 881 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 882 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 883 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 884 goto try_at_offset;
66e933ab 885 }
33b8afdf 886 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 887 goto fail;
6eb5f6b9
JH
888 /* Check is floating subtring. */
889 retry_floating_check:
890 t = check_at - start_shift;
a3621e74 891 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
892 goto hop_and_restart;
893 }
b7953727 894 if (t != s) {
a3621e74 895 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 896 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
897 (long)(t - i_strpos), (long)(s - i_strpos))
898 );
899 }
900 else {
a3621e74 901 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
902 "Does not contradict STCLASS...\n");
903 );
904 }
6eb5f6b9 905 }
5e39e1e5 906 giveup:
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
908 PL_colors[4], (check ? "Guessed" : "Giving up"),
909 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 910 return s;
2c2d71f5
JH
911
912 fail_finish: /* Substring not found */
33b8afdf
JH
913 if (prog->check_substr || prog->check_utf8) /* could be removed already */
914 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 915 fail:
a3621e74 916 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 917 PL_colors[4], PL_colors[5]));
bd61b366 918 return NULL;
cad2e5aa 919}
9661b544 920
6eb5f6b9 921/* We know what class REx starts with. Try to find this position... */
3b0527fe 922/* if reginfo is NULL, its a dryrun */
07be1b83
YO
923/* annoyingly all the vars in this routine have different names from their counterparts
924 in regmatch. /grrr */
3b0527fe 925
3c3eec57 926STATIC char *
07be1b83
YO
927S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
928 const char *strend, const regmatch_info *reginfo)
a687059c 929{
27da23d5 930 dVAR;
1df70142 931 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 932 char *m;
d8093b23 933 STRLEN ln;
5dab1207 934 STRLEN lnc;
078c425b 935 register STRLEN uskip;
d8093b23
G
936 unsigned int c1;
937 unsigned int c2;
6eb5f6b9
JH
938 char *e;
939 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 940 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 941
6eb5f6b9
JH
942 /* We know what class it must start with. */
943 switch (OP(c)) {
6eb5f6b9 944 case ANYOF:
388cc4de 945 if (do_utf8) {
078c425b 946 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
947 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
948 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
949 reginclass(prog, c, (U8*)s, 0, do_utf8) :
950 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 951 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
952 goto got_it;
953 else
954 tmp = doevery;
955 }
956 else
957 tmp = 1;
078c425b 958 s += uskip;
388cc4de
HS
959 }
960 }
961 else {
962 while (s < strend) {
963 STRLEN skip = 1;
964
32fc9b6a 965 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
966 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
967 /* The assignment of 2 is intentional:
968 * for the folded sharp s, the skip is 2. */
969 (skip = SHARP_S_SKIP))) {
3b0527fe 970 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
971 goto got_it;
972 else
973 tmp = doevery;
974 }
975 else
976 tmp = 1;
977 s += skip;
978 }
a0d0e21e 979 }
6eb5f6b9 980 break;
f33976b4
DB
981 case CANY:
982 while (s < strend) {
3b0527fe 983 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
984 goto got_it;
985 else
986 tmp = doevery;
987 s++;
988 }
989 break;
6eb5f6b9 990 case EXACTF:
5dab1207
NIS
991 m = STRING(c);
992 ln = STR_LEN(c); /* length to match in octets/bytes */
993 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 994 if (UTF) {
a2a2844f 995 STRLEN ulen1, ulen2;
5dab1207 996 U8 *sm = (U8 *) m;
89ebb4a3
JH
997 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
998 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 999 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1000
1001 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1002 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1003
89ebb4a3 1004 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1005 0, uniflags);
89ebb4a3 1006 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1007 0, uniflags);
5dab1207
NIS
1008 lnc = 0;
1009 while (sm < ((U8 *) m + ln)) {
1010 lnc++;
1011 sm += UTF8SKIP(sm);
1012 }
1aa99e6b
IH
1013 }
1014 else {
1015 c1 = *(U8*)m;
1016 c2 = PL_fold[c1];
1017 }
6eb5f6b9
JH
1018 goto do_exactf;
1019 case EXACTFL:
5dab1207
NIS
1020 m = STRING(c);
1021 ln = STR_LEN(c);
1022 lnc = (I32) ln;
d8093b23 1023 c1 = *(U8*)m;
6eb5f6b9
JH
1024 c2 = PL_fold_locale[c1];
1025 do_exactf:
db12adc6 1026 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1027
3b0527fe 1028 if (!reginfo && e < s)
6eb5f6b9 1029 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1030
60a8b682
JH
1031 /* The idea in the EXACTF* cases is to first find the
1032 * first character of the EXACTF* node and then, if
1033 * necessary, case-insensitively compare the full
1034 * text of the node. The c1 and c2 are the first
1035 * characters (though in Unicode it gets a bit
1036 * more complicated because there are more cases
7f16dd3d
JH
1037 * than just upper and lower: one needs to use
1038 * the so-called folding case for case-insensitive
1039 * matching (called "loose matching" in Unicode).
1040 * ibcmp_utf8() will do just that. */
60a8b682 1041
1aa99e6b 1042 if (do_utf8) {
575cac57 1043 UV c, f;
89ebb4a3 1044 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1045 STRLEN len, foldlen;
4ad0818d 1046 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1047 if (c1 == c2) {
5dab1207
NIS
1048 /* Upper and lower of 1st char are equal -
1049 * probably not a "letter". */
1aa99e6b 1050 while (s <= e) {
89ebb4a3 1051 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1052 uniflags);
80aecb99
JH
1053 if ( c == c1
1054 && (ln == len ||
3dab1dad
YO
1055 ibcmp_utf8(s, NULL, 0, do_utf8,
1056 m, NULL, ln, (bool)UTF))
3b0527fe 1057 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1058 goto got_it;
80aecb99 1059 else {
1df70142 1060 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1061 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1062 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1063 if ( f != c
1064 && (f == c1 || f == c2)
1065 && (ln == foldlen ||
66423254 1066 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1067 NULL, foldlen, do_utf8,
d07ddd77 1068 m,
3dab1dad 1069 NULL, ln, (bool)UTF))
3b0527fe 1070 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1071 goto got_it;
1072 }
1aa99e6b
IH
1073 s += len;
1074 }
09091399
JH
1075 }
1076 else {
1aa99e6b 1077 while (s <= e) {
89ebb4a3 1078 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1079 uniflags);
80aecb99 1080
60a8b682 1081 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1082 * Note that not all the possible combinations
1083 * are handled here: some of them are handled
1084 * by the standard folding rules, and some of
1085 * them (the character class or ANYOF cases)
1086 * are handled during compiletime in
1087 * regexec.c:S_regclass(). */
880bd946
JH
1088 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1089 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1090 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1091
1092 if ( (c == c1 || c == c2)
1093 && (ln == len ||
3dab1dad
YO
1094 ibcmp_utf8(s, NULL, 0, do_utf8,
1095 m, NULL, ln, (bool)UTF))
3b0527fe 1096 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1097 goto got_it;
80aecb99 1098 else {
1df70142 1099 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1100 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1101 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1102 if ( f != c
1103 && (f == c1 || f == c2)
1104 && (ln == foldlen ||
a6872d42 1105 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1106 NULL, foldlen, do_utf8,
d07ddd77 1107 m,
3dab1dad 1108 NULL, ln, (bool)UTF))
3b0527fe 1109 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1110 goto got_it;
1111 }
1aa99e6b
IH
1112 s += len;
1113 }
09091399 1114 }
1aa99e6b
IH
1115 }
1116 else {
1117 if (c1 == c2)
1118 while (s <= e) {
1119 if ( *(U8*)s == c1
1120 && (ln == 1 || !(OP(c) == EXACTF
1121 ? ibcmp(s, m, ln)
1122 : ibcmp_locale(s, m, ln)))
3b0527fe 1123 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1124 goto got_it;
1125 s++;
1126 }
1127 else
1128 while (s <= e) {
1129 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1130 && (ln == 1 || !(OP(c) == EXACTF
1131 ? ibcmp(s, m, ln)
1132 : ibcmp_locale(s, m, ln)))
3b0527fe 1133 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1134 goto got_it;
1135 s++;
1136 }
b3c9acc1
IZ
1137 }
1138 break;
bbce6d69 1139 case BOUNDL:
3280af22 1140 PL_reg_flags |= RF_tainted;
bbce6d69 1141 /* FALL THROUGH */
a0d0e21e 1142 case BOUND:
ffc61ed2 1143 if (do_utf8) {
12d33761 1144 if (s == PL_bostr)
ffc61ed2
JH
1145 tmp = '\n';
1146 else {
6136c704 1147 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1148 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1149 }
1150 tmp = ((OP(c) == BOUND ?
9041c2e3 1151 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1152 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1153 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1154 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1155 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1156 isALNUM_LC_utf8((U8*)s)))
1157 {
1158 tmp = !tmp;
3b0527fe 1159 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1160 goto got_it;
1161 }
078c425b 1162 s += uskip;
a687059c 1163 }
a0d0e21e 1164 }
667bb95a 1165 else {
12d33761 1166 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1167 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1168 while (s < strend) {
1169 if (tmp ==
1170 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1171 tmp = !tmp;
3b0527fe 1172 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1173 goto got_it;
1174 }
1175 s++;
a0ed51b3 1176 }
a0ed51b3 1177 }
3b0527fe 1178 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1179 goto got_it;
1180 break;
bbce6d69 1181 case NBOUNDL:
3280af22 1182 PL_reg_flags |= RF_tainted;
bbce6d69 1183 /* FALL THROUGH */
a0d0e21e 1184 case NBOUND:
ffc61ed2 1185 if (do_utf8) {
12d33761 1186 if (s == PL_bostr)
ffc61ed2
JH
1187 tmp = '\n';
1188 else {
6136c704 1189 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1190 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1191 }
1192 tmp = ((OP(c) == NBOUND ?
9041c2e3 1193 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1194 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1195 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1196 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1197 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1198 isALNUM_LC_utf8((U8*)s)))
1199 tmp = !tmp;
3b0527fe 1200 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1201 goto got_it;
078c425b 1202 s += uskip;
ffc61ed2 1203 }
a0d0e21e 1204 }
667bb95a 1205 else {
12d33761 1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1207 tmp = ((OP(c) == NBOUND ?
1208 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1209 while (s < strend) {
1210 if (tmp ==
1211 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1212 tmp = !tmp;
3b0527fe 1213 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1214 goto got_it;
1215 s++;
1216 }
a0ed51b3 1217 }
3b0527fe 1218 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1219 goto got_it;
1220 break;
a0d0e21e 1221 case ALNUM:
ffc61ed2 1222 if (do_utf8) {
1a4fad37 1223 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1224 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1225 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1226 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1227 goto got_it;
1228 else
1229 tmp = doevery;
1230 }
bbce6d69 1231 else
ffc61ed2 1232 tmp = 1;
078c425b 1233 s += uskip;
bbce6d69 1234 }
bbce6d69 1235 }
ffc61ed2
JH
1236 else {
1237 while (s < strend) {
1238 if (isALNUM(*s)) {
3b0527fe 1239 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1240 goto got_it;
1241 else
1242 tmp = doevery;
1243 }
a0ed51b3 1244 else
ffc61ed2
JH
1245 tmp = 1;
1246 s++;
a0ed51b3 1247 }
a0ed51b3
LW
1248 }
1249 break;
bbce6d69 1250 case ALNUML:
3280af22 1251 PL_reg_flags |= RF_tainted;
ffc61ed2 1252 if (do_utf8) {
078c425b 1253 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1254 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1255 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1256 goto got_it;
1257 else
1258 tmp = doevery;
1259 }
a687059c 1260 else
ffc61ed2 1261 tmp = 1;
078c425b 1262 s += uskip;
a0d0e21e 1263 }
a0d0e21e 1264 }
ffc61ed2
JH
1265 else {
1266 while (s < strend) {
1267 if (isALNUM_LC(*s)) {
3b0527fe 1268 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1269 goto got_it;
1270 else
1271 tmp = doevery;
1272 }
a0ed51b3 1273 else
ffc61ed2
JH
1274 tmp = 1;
1275 s++;
a0ed51b3 1276 }
a0ed51b3
LW
1277 }
1278 break;
a0d0e21e 1279 case NALNUM:
ffc61ed2 1280 if (do_utf8) {
1a4fad37 1281 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1282 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1283 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1284 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
bbce6d69 1289 else
ffc61ed2 1290 tmp = 1;
078c425b 1291 s += uskip;
bbce6d69 1292 }
bbce6d69 1293 }
ffc61ed2
JH
1294 else {
1295 while (s < strend) {
1296 if (!isALNUM(*s)) {
3b0527fe 1297 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1298 goto got_it;
1299 else
1300 tmp = doevery;
1301 }
a0ed51b3 1302 else
ffc61ed2
JH
1303 tmp = 1;
1304 s++;
a0ed51b3 1305 }
a0ed51b3
LW
1306 }
1307 break;
bbce6d69 1308 case NALNUML:
3280af22 1309 PL_reg_flags |= RF_tainted;
ffc61ed2 1310 if (do_utf8) {
078c425b 1311 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1312 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1313 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
a687059c 1318 else
ffc61ed2 1319 tmp = 1;
078c425b 1320 s += uskip;
a687059c 1321 }
a0d0e21e 1322 }
ffc61ed2
JH
1323 else {
1324 while (s < strend) {
1325 if (!isALNUM_LC(*s)) {
3b0527fe 1326 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1327 goto got_it;
1328 else
1329 tmp = doevery;
1330 }
a0ed51b3 1331 else
ffc61ed2
JH
1332 tmp = 1;
1333 s++;
a0ed51b3 1334 }
a0ed51b3
LW
1335 }
1336 break;
a0d0e21e 1337 case SPACE:
ffc61ed2 1338 if (do_utf8) {
1a4fad37 1339 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1340 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1341 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1342 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a0d0e21e 1347 else
ffc61ed2 1348 tmp = 1;
078c425b 1349 s += uskip;
2304df62 1350 }
a0d0e21e 1351 }
ffc61ed2
JH
1352 else {
1353 while (s < strend) {
1354 if (isSPACE(*s)) {
3b0527fe 1355 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1356 goto got_it;
1357 else
1358 tmp = doevery;
1359 }
a0ed51b3 1360 else
ffc61ed2
JH
1361 tmp = 1;
1362 s++;
a0ed51b3 1363 }
a0ed51b3
LW
1364 }
1365 break;
bbce6d69 1366 case SPACEL:
3280af22 1367 PL_reg_flags |= RF_tainted;
ffc61ed2 1368 if (do_utf8) {
078c425b 1369 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1370 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1371 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
bbce6d69 1376 else
ffc61ed2 1377 tmp = 1;
078c425b 1378 s += uskip;
bbce6d69 1379 }
bbce6d69 1380 }
ffc61ed2
JH
1381 else {
1382 while (s < strend) {
1383 if (isSPACE_LC(*s)) {
3b0527fe 1384 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1385 goto got_it;
1386 else
1387 tmp = doevery;
1388 }
a0ed51b3 1389 else
ffc61ed2
JH
1390 tmp = 1;
1391 s++;
a0ed51b3 1392 }
a0ed51b3
LW
1393 }
1394 break;
a0d0e21e 1395 case NSPACE:
ffc61ed2 1396 if (do_utf8) {
1a4fad37 1397 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1398 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1399 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1400 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
a0d0e21e 1405 else
ffc61ed2 1406 tmp = 1;
078c425b 1407 s += uskip;
a687059c 1408 }
a0d0e21e 1409 }
ffc61ed2
JH
1410 else {
1411 while (s < strend) {
1412 if (!isSPACE(*s)) {
3b0527fe 1413 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1414 goto got_it;
1415 else
1416 tmp = doevery;
1417 }
a0ed51b3 1418 else
ffc61ed2
JH
1419 tmp = 1;
1420 s++;
a0ed51b3 1421 }
a0ed51b3
LW
1422 }
1423 break;
bbce6d69 1424 case NSPACEL:
3280af22 1425 PL_reg_flags |= RF_tainted;
ffc61ed2 1426 if (do_utf8) {
078c425b 1427 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1428 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1429 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
bbce6d69 1434 else
ffc61ed2 1435 tmp = 1;
078c425b 1436 s += uskip;
bbce6d69 1437 }
bbce6d69 1438 }
ffc61ed2
JH
1439 else {
1440 while (s < strend) {
1441 if (!isSPACE_LC(*s)) {
3b0527fe 1442 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1443 goto got_it;
1444 else
1445 tmp = doevery;
1446 }
a0ed51b3 1447 else
ffc61ed2
JH
1448 tmp = 1;
1449 s++;
a0ed51b3 1450 }
a0ed51b3
LW
1451 }
1452 break;
a0d0e21e 1453 case DIGIT:
ffc61ed2 1454 if (do_utf8) {
1a4fad37 1455 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1456 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1457 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1458 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
a0d0e21e 1463 else
ffc61ed2 1464 tmp = 1;
078c425b 1465 s += uskip;
2b69d0c2 1466 }
a0d0e21e 1467 }
ffc61ed2
JH
1468 else {
1469 while (s < strend) {
1470 if (isDIGIT(*s)) {
3b0527fe 1471 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1472 goto got_it;
1473 else
1474 tmp = doevery;
1475 }
a0ed51b3 1476 else
ffc61ed2
JH
1477 tmp = 1;
1478 s++;
a0ed51b3 1479 }
a0ed51b3
LW
1480 }
1481 break;
b8c5462f
JH
1482 case DIGITL:
1483 PL_reg_flags |= RF_tainted;
ffc61ed2 1484 if (do_utf8) {
078c425b 1485 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1486 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1487 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
b8c5462f 1492 else
ffc61ed2 1493 tmp = 1;
078c425b 1494 s += uskip;
b8c5462f 1495 }
b8c5462f 1496 }
ffc61ed2
JH
1497 else {
1498 while (s < strend) {
1499 if (isDIGIT_LC(*s)) {
3b0527fe 1500 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1501 goto got_it;
1502 else
1503 tmp = doevery;
1504 }
b8c5462f 1505 else
ffc61ed2
JH
1506 tmp = 1;
1507 s++;
b8c5462f 1508 }
b8c5462f
JH
1509 }
1510 break;
a0d0e21e 1511 case NDIGIT:
ffc61ed2 1512 if (do_utf8) {
1a4fad37 1513 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1514 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1515 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1516 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
a0d0e21e 1521 else
ffc61ed2 1522 tmp = 1;
078c425b 1523 s += uskip;
a687059c 1524 }
a0d0e21e 1525 }
ffc61ed2
JH
1526 else {
1527 while (s < strend) {
1528 if (!isDIGIT(*s)) {
3b0527fe 1529 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1530 goto got_it;
1531 else
1532 tmp = doevery;
1533 }
a0ed51b3 1534 else
ffc61ed2
JH
1535 tmp = 1;
1536 s++;
a0ed51b3 1537 }
a0ed51b3
LW
1538 }
1539 break;
b8c5462f
JH
1540 case NDIGITL:
1541 PL_reg_flags |= RF_tainted;
ffc61ed2 1542 if (do_utf8) {
078c425b 1543 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1544 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1545 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
b8c5462f 1550 else
ffc61ed2 1551 tmp = 1;
078c425b 1552 s += uskip;
b8c5462f 1553 }
a0ed51b3 1554 }
ffc61ed2
JH
1555 else {
1556 while (s < strend) {
1557 if (!isDIGIT_LC(*s)) {
3b0527fe 1558 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1559 goto got_it;
1560 else
1561 tmp = doevery;
1562 }
cf93c79d 1563 else
ffc61ed2
JH
1564 tmp = 1;
1565 s++;
b8c5462f 1566 }
b8c5462f
JH
1567 }
1568 break;
07be1b83
YO
1569 case TRIE:
1570 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1571 {
1572 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1573 trie_type = do_utf8 ?
1574 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1575 : trie_plain;
1576 /* what trie are we using right now */
1577 reg_ac_data *aho
1578 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1579 reg_trie_data *trie=aho->trie;
1580
1581 const char *last_start = strend - trie->minlen;
1582 const char *real_start = s;
1583 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1584 SV *sv_points;
1585 U8 **points; /* map of where we were in the input string
1586 when reading a given string. For ASCII this
1587 is unnecessary overhead as the relationship
1588 is always 1:1, but for unicode, especially
1589 case folded unicode this is not true. */
07be1b83
YO
1590
1591 GET_RE_DEBUG_FLAGS_DECL;
1592
be8e71aa
YO
1593 /* We can't just allocate points here. We need to wrap it in
1594 * an SV so it gets freed properly if there is a croak while
1595 * running the match */
1596 ENTER;
1597 SAVETMPS;
1598 sv_points=newSV(maxlen * sizeof(U8 *));
1599 SvCUR_set(sv_points,
1600 maxlen * sizeof(U8 *));
1601 SvPOK_on(sv_points);
1602 sv_2mortal(sv_points);
1603 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1604
1605 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1606 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1607 s++;
1608 }
1609 }
1610
1611 while (s <= last_start) {
1612 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1613 U8 *uc = (U8*)s;
1614 U16 charid = 0;
1615 U32 base = 1;
1616 U32 state = 1;
1617 UV uvc = 0;
1618 STRLEN len = 0;
1619 STRLEN foldlen = 0;
1620 U8 *uscan = (U8*)NULL;
1621 U8 *leftmost = NULL;
1622
1623 U32 pointpos = 0;
1624
1625 while ( state && uc <= (U8*)strend ) {
1626 int failed=0;
1627 if (aho->states[ state ].wordnum) {
1628 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1629 if (!leftmost || lpos < leftmost)
1630 leftmost= lpos;
1631 if (base==0) break;
1632 }
1633 points[pointpos++ % maxlen]= uc;
1634 switch (trie_type) {
1635 case trie_utf8_fold:
1636 if ( foldlen>0 ) {
1637 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1638 foldlen -= len;
1639 uscan += len;
1640 len=0;
1641 } else {
1642 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1643 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1644 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1645 foldlen -= UNISKIP( uvc );
1646 uscan = foldbuf + UNISKIP( uvc );
1647 }
1648 break;
1649 case trie_utf8:
1650 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1651 &len, uniflags );
1652 break;
1653 case trie_plain:
1654 uvc = (UV)*uc;
1655 len = 1;
1656 }
1657
1658 if (uvc < 256) {
1659 charid = trie->charmap[ uvc ];
1660 }
1661 else {
1662 charid = 0;
1663 if (trie->widecharmap) {
1664 SV** const svpp = hv_fetch(trie->widecharmap,
1665 (char*)&uvc, sizeof(UV), 0);
1666 if (svpp)
1667 charid = (U16)SvIV(*svpp);
1668 }
1669 }
1670 DEBUG_TRIE_EXECUTE_r(
1671 PerlIO_printf(Perl_debug_log,
1672 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1673 (int)((const char*)uc - real_start), charid, uvc)
1674 );
1675 uc += len;
1676
1677 do {
1678 U32 word = aho->states[ state ].wordnum;
1679 base = aho->states[ state ].trans.base;
1680
1681 DEBUG_TRIE_EXECUTE_r(
1682 PerlIO_printf( Perl_debug_log,
1683 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1684 failed ? "Fail transition to " : "",
1685 state, base, uvc, word)
1686 );
1687 if ( base ) {
1688 U32 tmp;
1689 if (charid &&
1690 (base + charid > trie->uniquecharcount )
1691 && (base + charid - 1 - trie->uniquecharcount
1692 < trie->lasttrans)
1693 && trie->trans[base + charid - 1 -
1694 trie->uniquecharcount].check == state
1695 && (tmp=trie->trans[base + charid - 1 -
1696 trie->uniquecharcount ].next))
1697 {
1698 state = tmp;
1699 break;
1700 }
1701 else {
1702 failed++;
1703 if ( state == 1 )
1704 break;
1705 else
1706 state = aho->fail[state];
1707 }
1708 }
1709 else {
1710 /* we must be accepting here */
1711 failed++;
1712 break;
1713 }
1714 } while(state);
1715 if (failed) {
1716 if (leftmost)
1717 break;
1718 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1719 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1720 uc++;
1721 }
1722 }
1723 }
1724 }
1725 if ( aho->states[ state ].wordnum ) {
1726 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1727 if (!leftmost || lpos < leftmost)
1728 leftmost = lpos;
1729 }
1730 DEBUG_TRIE_EXECUTE_r(
1731 PerlIO_printf( Perl_debug_log,
1732 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1733 "All done: ",
1734 state, base, uvc)
1735 );
1736 if (leftmost) {
1737 s = (char*)leftmost;
be8e71aa
YO
1738 if (!reginfo || regtry(reginfo, s)) {
1739 FREETMPS;
1740 LEAVE;
07be1b83 1741 goto got_it;
be8e71aa 1742 }
07be1b83
YO
1743 s = HOPc(s,1);
1744 } else {
1745 break;
1746 }
1747 }
be8e71aa
YO
1748 FREETMPS;
1749 LEAVE;
07be1b83
YO
1750 }
1751 break;
b3c9acc1 1752 default:
3c3eec57
GS
1753 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1754 break;
d6a28714 1755 }
6eb5f6b9
JH
1756 return 0;
1757 got_it:
1758 return s;
1759}
1760
1761/*
1762 - regexec_flags - match a regexp against a string
1763 */
1764I32
1765Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1766 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1767/* strend: pointer to null at end of string */
1768/* strbeg: real beginning of string */
1769/* minend: end of match must be >=minend after stringarg. */
1770/* data: May be used for some additional optimizations. */
1771/* nosave: For optimizations. */
1772{
97aff369 1773 dVAR;
6eb5f6b9
JH
1774 register char *s;
1775 register regnode *c;
1776 register char *startpos = stringarg;
6eb5f6b9
JH
1777 I32 minlen; /* must match at least this many chars */
1778 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1779 I32 end_shift = 0; /* Same for the end. */ /* CC */
1780 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1781 char *scream_olds = NULL;
3dab1dad 1782 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1783 const bool do_utf8 = DO_UTF8(sv);
2757e526 1784 I32 multiline;
2a782b5b 1785#ifdef DEBUGGING
2757e526
JH
1786 SV* dsv0;
1787 SV* dsv1;
2a782b5b 1788#endif
3b0527fe 1789 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1790
1791 GET_RE_DEBUG_FLAGS_DECL;
1792
9d4ba2ae 1793 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1794
1795 /* Be paranoid... */
1796 if (prog == NULL || startpos == NULL) {
1797 Perl_croak(aTHX_ "NULL regexp parameter");
1798 return 0;
1799 }
1800
2757e526 1801 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1802 reginfo.prog = prog;
2757e526
JH
1803
1804#ifdef DEBUGGING
1805 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1806 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1807#endif
1808
bac06658
JH
1809 RX_MATCH_UTF8_set(prog, do_utf8);
1810
6eb5f6b9 1811 minlen = prog->minlen;
61a36c01 1812 if (strend - startpos < minlen) {
a3621e74 1813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1814 "String too short [regexec_flags]...\n"));
1815 goto phooey;
1aa99e6b 1816 }
6eb5f6b9 1817
6eb5f6b9
JH
1818 /* Check validity of program. */
1819 if (UCHARAT(prog->program) != REG_MAGIC) {
1820 Perl_croak(aTHX_ "corrupted regexp program");
1821 }
1822
1823 PL_reg_flags = 0;
1824 PL_reg_eval_set = 0;
1825 PL_reg_maxiter = 0;
1826
1827 if (prog->reganch & ROPT_UTF8)
1828 PL_reg_flags |= RF_utf8;
1829
1830 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1831 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1832 PL_bostr = strbeg;
3b0527fe 1833 reginfo.sv = sv;
6eb5f6b9
JH
1834
1835 /* Mark end of line for $ (and such) */
1836 PL_regeol = strend;
1837
1838 /* see how far we have to get to not match where we matched before */
3b0527fe 1839 reginfo.till = startpos+minend;
6eb5f6b9 1840
6eb5f6b9
JH
1841 /* If there is a "must appear" string, look for it. */
1842 s = startpos;
1843
3b0527fe 1844 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1845 MAGIC *mg;
1846
1847 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1848 reginfo.ganch = startpos;
6eb5f6b9
JH
1849 else if (sv && SvTYPE(sv) >= SVt_PVMG
1850 && SvMAGIC(sv)
14befaf4
DM
1851 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1852 && mg->mg_len >= 0) {
3b0527fe 1853 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1854 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1855 if (s > reginfo.ganch)
6eb5f6b9 1856 goto phooey;
3b0527fe 1857 s = reginfo.ganch;
6eb5f6b9
JH
1858 }
1859 }
1860 else /* pos() not defined */
3b0527fe 1861 reginfo.ganch = strbeg;
6eb5f6b9
JH
1862 }
1863
a0714e2c 1864 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1865 re_scream_pos_data d;
1866
1867 d.scream_olds = &scream_olds;
1868 d.scream_pos = &scream_pos;
1869 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1870 if (!s) {
a3621e74 1871 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1872 goto phooey; /* not present */
3fa9c3d7 1873 }
6eb5f6b9
JH
1874 }
1875
a3621e74 1876 DEBUG_EXECUTE_r({
1df70142
AL
1877 const char * const s0 = UTF
1878 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1879 UNI_DISPLAY_REGEX)
1880 : prog->precomp;
bb7a0f54 1881 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1882 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1883 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1884 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1885 if (!PL_colorset)
1886 reginitcolors();
1887 PerlIO_printf(Perl_debug_log,
a0288114 1888 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1889 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1890 len0, len0, s0,
2a782b5b 1891 PL_colors[1],
9e55ce06 1892 len0 > 60 ? "..." : "",
2a782b5b 1893 PL_colors[0],
9e55ce06
JH
1894 (int)(len1 > 60 ? 60 : len1),
1895 s1, PL_colors[1],
1896 (len1 > 60 ? "..." : "")
2a782b5b
JH
1897 );
1898 });
6eb5f6b9
JH
1899
1900 /* Simplest case: anchored match need be tried only once. */
1901 /* [unless only anchor is BOL and multiline is set] */
1902 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1903 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1904 goto got_it;
7fba1cd6 1905 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1906 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1907 {
1908 char *end;
1909
1910 if (minlen)
1911 dontbother = minlen - 1;
1aa99e6b 1912 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1913 /* for multiline we only have to try after newlines */
33b8afdf 1914 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1915 if (s == startpos)
1916 goto after_try;
1917 while (1) {
3b0527fe 1918 if (regtry(&reginfo, s))
6eb5f6b9
JH
1919 goto got_it;
1920 after_try:
1921 if (s >= end)
1922 goto phooey;
1923 if (prog->reganch & RE_USE_INTUIT) {
1924 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1925 if (!s)
1926 goto phooey;
1927 }
1928 else
1929 s++;
1930 }
1931 } else {
1932 if (s > startpos)
1933 s--;
1934 while (s < end) {
1935 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1936 if (regtry(&reginfo, s))
6eb5f6b9
JH
1937 goto got_it;
1938 }
1939 }
1940 }
1941 }
1942 goto phooey;
1943 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1944 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1945 goto got_it;
1946 goto phooey;
1947 }
1948
1949 /* Messy cases: unanchored match. */
33b8afdf 1950 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1951 /* we have /x+whatever/ */
1952 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1953 char ch;
bf93d4cc
GS
1954#ifdef DEBUGGING
1955 int did_match = 0;
1956#endif
33b8afdf
JH
1957 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1958 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1959 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1960
1aa99e6b 1961 if (do_utf8) {
6eb5f6b9
JH
1962 while (s < strend) {
1963 if (*s == ch) {
a3621e74 1964 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1965 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1966 s += UTF8SKIP(s);
1967 while (s < strend && *s == ch)
1968 s += UTF8SKIP(s);
1969 }
1970 s += UTF8SKIP(s);
1971 }
1972 }
1973 else {
1974 while (s < strend) {
1975 if (*s == ch) {
a3621e74 1976 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1977 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1978 s++;
1979 while (s < strend && *s == ch)
1980 s++;
1981 }
1982 s++;
1983 }
1984 }
a3621e74 1985 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1986 PerlIO_printf(Perl_debug_log,
b7953727
JH
1987 "Did not find anchored character...\n")
1988 );
6eb5f6b9 1989 }
a0714e2c
SS
1990 else if (prog->anchored_substr != NULL
1991 || prog->anchored_utf8 != NULL
1992 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1993 && prog->float_max_offset < strend - s)) {
1994 SV *must;
1995 I32 back_max;
1996 I32 back_min;
1997 char *last;
6eb5f6b9 1998 char *last1; /* Last position checked before */
bf93d4cc
GS
1999#ifdef DEBUGGING
2000 int did_match = 0;
2001#endif
33b8afdf
JH
2002 if (prog->anchored_substr || prog->anchored_utf8) {
2003 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2004 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2005 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2006 back_max = back_min = prog->anchored_offset;
2007 } else {
2008 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2009 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2010 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2011 back_max = prog->float_max_offset;
2012 back_min = prog->float_min_offset;
2013 }
2014 if (must == &PL_sv_undef)
2015 /* could not downgrade utf8 check substring, so must fail */
2016 goto phooey;
2017
2018 last = HOP3c(strend, /* Cannot start after this */
2019 -(I32)(CHR_SVLEN(must)
2020 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2021
2022 if (s > PL_bostr)
2023 last1 = HOPc(s, -1);
2024 else
2025 last1 = s - 1; /* bogus */
2026
a0288114 2027 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2028 check_substr==must. */
2029 scream_pos = -1;
2030 dontbother = end_shift;
2031 strend = HOPc(strend, -dontbother);
2032 while ( (s <= last) &&
9041c2e3 2033 ((flags & REXEC_SCREAM)
1aa99e6b 2034 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2035 end_shift, &scream_pos, 0))
1aa99e6b 2036 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2037 (unsigned char*)strend, must,
7fba1cd6 2038 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2039 /* we may be pointing at the wrong string */
2040 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2041 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2042 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2043 if (HOPc(s, -back_max) > last1) {
2044 last1 = HOPc(s, -back_min);
2045 s = HOPc(s, -back_max);
2046 }
2047 else {
52657f30 2048 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2049
2050 last1 = HOPc(s, -back_min);
52657f30 2051 s = t;
6eb5f6b9 2052 }
1aa99e6b 2053 if (do_utf8) {
6eb5f6b9 2054 while (s <= last1) {
3b0527fe 2055 if (regtry(&reginfo, s))
6eb5f6b9
JH
2056 goto got_it;
2057 s += UTF8SKIP(s);
2058 }
2059 }
2060 else {
2061 while (s <= last1) {
3b0527fe 2062 if (regtry(&reginfo, s))
6eb5f6b9
JH
2063 goto got_it;
2064 s++;
2065 }
2066 }
2067 }
a3621e74 2068 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2069 PerlIO_printf(Perl_debug_log,
a0288114 2070 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2071 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2072 ? "anchored" : "floating"),
2073 PL_colors[0],
2074 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2075 SvPVX_const(must),
b7953727
JH
2076 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2077 );
6eb5f6b9
JH
2078 goto phooey;
2079 }
155aba94 2080 else if ((c = prog->regstclass)) {
f14c76ed 2081 if (minlen) {
be8e71aa 2082 const OPCODE op = OP(prog->regstclass);
66e933ab 2083 /* don't bother with what can't match */
07be1b83 2084 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2085 strend = HOPc(strend, -(minlen - 1));
2086 }
a3621e74 2087 DEBUG_EXECUTE_r({
be8e71aa 2088 SV * const prop = sv_newmortal();
cfd0369c
NC
2089 const char *s0;
2090 const char *s1;
9e55ce06
JH
2091 int len0;
2092 int len1;
2093
32fc9b6a 2094 regprop(prog, prop, c);
9e55ce06 2095 s0 = UTF ?
3f7c398e 2096 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2097 UNI_DISPLAY_REGEX) :
cfd0369c 2098 SvPVX_const(prop);
9e55ce06
JH
2099 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2100 s1 = UTF ?
c728cb41 2101 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2102 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2103 PerlIO_printf(Perl_debug_log,
07be1b83 2104 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2105 len0, len0, s0,
07be1b83 2106 len1, len1, s1, (int)(strend - s));
ffc61ed2 2107 });
3b0527fe 2108 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2109 goto got_it;
07be1b83 2110 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2111 }
2112 else {
2113 dontbother = 0;
a0714e2c 2114 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2115 /* Trim the end. */
d6a28714 2116 char *last;
33b8afdf
JH
2117 SV* float_real;
2118
2119 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2120 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2121 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2122
2123 if (flags & REXEC_SCREAM) {
33b8afdf 2124 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2125 end_shift, &scream_pos, 1); /* last one */
2126 if (!last)
ffc61ed2 2127 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2128 /* we may be pointing at the wrong string */
2129 else if (RX_MATCH_COPIED(prog))
3f7c398e 2130 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2131 }
d6a28714
JH
2132 else {
2133 STRLEN len;
cfd0369c 2134 const char * const little = SvPV_const(float_real, len);
d6a28714 2135
33b8afdf 2136 if (SvTAIL(float_real)) {
d6a28714
JH
2137 if (memEQ(strend - len + 1, little, len - 1))
2138 last = strend - len + 1;
7fba1cd6 2139 else if (!multiline)
9041c2e3 2140 last = memEQ(strend - len, little, len)
bd61b366 2141 ? strend - len : NULL;
b8c5462f 2142 else
d6a28714
JH
2143 goto find_last;
2144 } else {
2145 find_last:
9041c2e3 2146 if (len)
d6a28714 2147 last = rninstr(s, strend, little, little + len);
b8c5462f 2148 else
a0288114 2149 last = strend; /* matching "$" */
b8c5462f 2150 }
b8c5462f 2151 }
bf93d4cc 2152 if (last == NULL) {
a3621e74 2153 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2154 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2155 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2156 goto phooey; /* Should not happen! */
2157 }
d6a28714
JH
2158 dontbother = strend - last + prog->float_min_offset;
2159 }
2160 if (minlen && (dontbother < minlen))
2161 dontbother = minlen - 1;
2162 strend -= dontbother; /* this one's always in bytes! */
2163 /* We don't know much -- general case. */
1aa99e6b 2164 if (do_utf8) {
d6a28714 2165 for (;;) {
3b0527fe 2166 if (regtry(&reginfo, s))
d6a28714
JH
2167 goto got_it;
2168 if (s >= strend)
2169 break;
b8c5462f 2170 s += UTF8SKIP(s);
d6a28714
JH
2171 };
2172 }
2173 else {
2174 do {
3b0527fe 2175 if (regtry(&reginfo, s))
d6a28714
JH
2176 goto got_it;
2177 } while (s++ < strend);
2178 }
2179 }
2180
2181 /* Failure. */
2182 goto phooey;
2183
2184got_it:
2185 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2186
2187 if (PL_reg_eval_set) {
2188 /* Preserve the current value of $^R */
2189 if (oreplsv != GvSV(PL_replgv))
2190 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2191 restored, the value remains
2192 the same. */
4f639d21 2193 restore_pos(aTHX_ prog);
d6a28714
JH
2194 }
2195
2196 /* make sure $`, $&, $', and $digit will work later */
2197 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2198 RX_MATCH_COPY_FREE(prog);
d6a28714 2199 if (flags & REXEC_COPY_STR) {
be8e71aa 2200 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2201#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2202 if ((SvIsCOW(sv)
2203 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2204 if (DEBUG_C_TEST) {
2205 PerlIO_printf(Perl_debug_log,
2206 "Copy on write: regexp capture, type %d\n",
2207 (int) SvTYPE(sv));
2208 }
2209 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2210 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2211 assert (SvPOKp(prog->saved_copy));
2212 } else
2213#endif
2214 {
2215 RX_MATCH_COPIED_on(prog);
2216 s = savepvn(strbeg, i);
2217 prog->subbeg = s;
2218 }
d6a28714 2219 prog->sublen = i;
d6a28714
JH
2220 }
2221 else {
2222 prog->subbeg = strbeg;
2223 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2224 }
2225 }
9041c2e3 2226
d6a28714
JH
2227 return 1;
2228
2229phooey:
a3621e74 2230 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2231 PL_colors[4], PL_colors[5]));
d6a28714 2232 if (PL_reg_eval_set)
4f639d21 2233 restore_pos(aTHX_ prog);
d6a28714
JH
2234 return 0;
2235}
2236
2237/*
2238 - regtry - try match at specific point
2239 */
2240STATIC I32 /* 0 failure, 1 success */
3b0527fe 2241S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2242{
97aff369 2243 dVAR;
d6a28714
JH
2244 register I32 *sp;
2245 register I32 *ep;
2246 CHECKPOINT lastcp;
3b0527fe 2247 regexp *prog = reginfo->prog;
a3621e74 2248 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2249
02db2b7b
IZ
2250#ifdef DEBUGGING
2251 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2252#endif
d6a28714
JH
2253 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2254 MAGIC *mg;
2255
2256 PL_reg_eval_set = RS_init;
a3621e74 2257 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2258 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2259 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2260 ));
e8347627 2261 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2262 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2263 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2264 SAVETMPS;
2265 /* Apparently this is not needed, judging by wantarray. */
e8347627 2266 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2267 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2268
3b0527fe 2269 if (reginfo->sv) {
d6a28714 2270 /* Make $_ available to executed code. */
3b0527fe 2271 if (reginfo->sv != DEFSV) {
59f00321 2272 SAVE_DEFSV;
3b0527fe 2273 DEFSV = reginfo->sv;
b8c5462f 2274 }
d6a28714 2275
3b0527fe
DM
2276 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2277 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2278 /* prepare for quick setting of pos */
d300d9fa
NC
2279#ifdef PERL_OLD_COPY_ON_WRITE
2280 if (SvIsCOW(sv))
2281 sv_force_normal_flags(sv, 0);
2282#endif
3dab1dad 2283 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2284 &PL_vtbl_mglob, NULL, 0);
d6a28714 2285 mg->mg_len = -1;
b8c5462f 2286 }
d6a28714
JH
2287 PL_reg_magic = mg;
2288 PL_reg_oldpos = mg->mg_len;
4f639d21 2289 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2290 }
09687e5a 2291 if (!PL_reg_curpm) {
a02a5408 2292 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2293#ifdef USE_ITHREADS
2294 {
be8e71aa 2295 SV* const repointer = newSViv(0);
577e12cc 2296 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2297 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2298 av_push(PL_regex_padav,repointer);
2299 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2300 PL_regex_pad = AvARRAY(PL_regex_padav);
2301 }
2302#endif
2303 }
aaa362c4 2304 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2305 PL_reg_oldcurpm = PL_curpm;
2306 PL_curpm = PL_reg_curpm;
2307 if (RX_MATCH_COPIED(prog)) {
2308 /* Here is a serious problem: we cannot rewrite subbeg,
2309 since it may be needed if this match fails. Thus
2310 $` inside (?{}) could fail... */
2311 PL_reg_oldsaved = prog->subbeg;
2312 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2313#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2314 PL_nrs = prog->saved_copy;
2315#endif
d6a28714
JH
2316 RX_MATCH_COPIED_off(prog);
2317 }
2318 else
bd61b366 2319 PL_reg_oldsaved = NULL;
d6a28714
JH
2320 prog->subbeg = PL_bostr;
2321 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2322 }
973dddac 2323 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2324 PL_reginput = startpos;
2325 PL_regstartp = prog->startp;
2326 PL_regendp = prog->endp;
2327 PL_reglastparen = &prog->lastparen;
a01268b5 2328 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2329 prog->lastparen = 0;
03994de8 2330 prog->lastcloseparen = 0;
d6a28714 2331 PL_regsize = 0;
a3621e74 2332 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2333 if (PL_reg_start_tmpl <= prog->nparens) {
2334 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2335 if(PL_reg_start_tmp)
2336 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2337 else
a02a5408 2338 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2339 }
2340
2341 /* XXXX What this code is doing here?!!! There should be no need
2342 to do this again and again, PL_reglastparen should take care of
3dd2943c 2343 this! --ilya*/
dafc8851
JH
2344
2345 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2346 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2347 * PL_reglastparen), is not needed at all by the test suite
2348 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2349 * enough, for building DynaLoader, or otherwise this
2350 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2351 * will happen. Meanwhile, this code *is* needed for the
2352 * above-mentioned test suite tests to succeed. The common theme
2353 * on those tests seems to be returning null fields from matches.
2354 * --jhi */
dafc8851 2355#if 1
d6a28714
JH
2356 sp = prog->startp;
2357 ep = prog->endp;
2358 if (prog->nparens) {
097eb12c 2359 register I32 i;
eb160463 2360 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2361 *++sp = -1;
2362 *++ep = -1;
2363 }
2364 }
dafc8851 2365#endif
02db2b7b 2366 REGCP_SET(lastcp);
3b0527fe 2367 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2368 prog->endp[0] = PL_reginput - PL_bostr;
2369 return 1;
2370 }
02db2b7b 2371 REGCP_UNWIND(lastcp);
d6a28714
JH
2372 return 0;
2373}
2374
02db2b7b
IZ
2375#define RE_UNWIND_BRANCH 1
2376#define RE_UNWIND_BRANCHJ 2
2377
2378union re_unwind_t;
2379
2380typedef struct { /* XX: makes sense to enlarge it... */
2381 I32 type;
2382 I32 prev;
2383 CHECKPOINT lastcp;
2384} re_unwind_generic_t;
2385
2386typedef struct {
2387 I32 type;
2388 I32 prev;
2389 CHECKPOINT lastcp;
2390 I32 lastparen;
2391 regnode *next;
2392 char *locinput;
2393 I32 nextchr;
3a2830be 2394 int minmod;
02db2b7b
IZ
2395#ifdef DEBUGGING
2396 int regindent;
2397#endif
2398} re_unwind_branch_t;
2399
2400typedef union re_unwind_t {
2401 I32 type;
2402 re_unwind_generic_t generic;
2403 re_unwind_branch_t branch;
2404} re_unwind_t;
2405
8ba1375e
MJD
2406#define sayYES goto yes
2407#define sayNO goto no
e0f9d4a8 2408#define sayNO_ANYOF goto no_anyof
8ba1375e 2409#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2410#define sayNO_FINAL goto no_final
2411#define sayNO_SILENT goto do_no
2412#define saySAME(x) if (x) goto yes; else goto no
2413
3ab3c9b4
HS
2414#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2415#define POSCACHE_SEEN 1 /* we know what we're caching */
2416#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2417
3ab3c9b4 2418#define CACHEsayYES STMT_START { \
d8319b27 2419 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2420 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2421 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2422 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2423 } \
2424 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2425 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2426 } \
2427 else { \
3ab3c9b4
HS
2428 /* cache records failure, but this is success */ \
2429 DEBUG_r( \
2430 PerlIO_printf(Perl_debug_log, \
2431 "%*s (remove success from failure cache)\n", \
2432 REPORT_CODE_OFF+PL_regindent*2, "") \
2433 ); \
d8319b27 2434 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2435 } \
2436 } \
2437 sayYES; \
2438} STMT_END
7409bbd3 2439
3ab3c9b4 2440#define CACHEsayNO STMT_START { \
d8319b27 2441 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2442 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2443 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2444 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2445 } \
2446 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2447 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2448 } \
2449 else { \
3ab3c9b4
HS
2450 /* cache records success, but this is failure */ \
2451 DEBUG_r( \
2452 PerlIO_printf(Perl_debug_log, \
2453 "%*s (remove failure from success cache)\n", \
2454 REPORT_CODE_OFF+PL_regindent*2, "") \
2455 ); \
d8319b27 2456 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2457 } \
2458 } \
2459 sayNO; \
2460} STMT_END
2461
a3621e74
YO
2462/* this is used to determine how far from the left messages like
2463 'failed...' are printed. Currently 29 makes these messages line
2464 up with the opcode they refer to. Earlier perls used 25 which
2465 left these messages outdented making reviewing a debug output
2466 quite difficult.
2467*/
2468#define REPORT_CODE_OFF 29
2469
2470
2471/* Make sure there is a test for this +1 options in re_tests */
2472#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2473
9e137952
DM
2474/* this value indiciates that the c1/c2 "next char" test should be skipped */
2475#define CHRTEST_VOID -1000
2476
86545054
DM
2477#define SLAB_FIRST(s) (&(s)->states[0])
2478#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2479
5d9a96ca
DM
2480/* grab a new slab and return the first slot in it */
2481
2482STATIC regmatch_state *
2483S_push_slab(pTHX)
2484{
54df2634
NC
2485#if PERL_VERSION < 9
2486 dMY_CXT;
2487#endif
5d9a96ca
DM
2488 regmatch_slab *s = PL_regmatch_slab->next;
2489 if (!s) {
2490 Newx(s, 1, regmatch_slab);
2491 s->prev = PL_regmatch_slab;
2492 s->next = NULL;
2493 PL_regmatch_slab->next = s;
2494 }
2495 PL_regmatch_slab = s;
86545054 2496 return SLAB_FIRST(s);
5d9a96ca 2497}
5b47454d 2498
95b24440
DM
2499/* simulate a recursive call to regmatch */
2500
2501#define REGMATCH(ns, where) \
5d9a96ca
DM
2502 st->scan = scan; \
2503 scan = (ns); \
2504 st->resume_state = resume_##where; \
95b24440
DM
2505 goto start_recurse; \
2506 resume_point_##where:
2507
aa283a38
DM
2508
2509/* push a new regex state. Set newst to point to it */
2510
2511#define PUSH_STATE(newst, resume) \
2512 depth++; \
2513 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2514 st->scan = scan; \
2515 st->next = next; \
2516 st->n = n; \
2517 st->locinput = locinput; \
2518 st->resume_state = resume; \
2519 newst = st+1; \
86545054 2520 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2521 newst = S_push_slab(aTHX); \
2522 PL_regmatch_state = newst; \
2523 newst->cc = 0; \
2524 newst->minmod = 0; \
2525 newst->sw = 0; \
2526 newst->logical = 0; \
2527 newst->unwind = 0; \
2528 locinput = PL_reginput; \
2529 nextchr = UCHARAT(locinput);
2530
2531#define POP_STATE \
2532 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2533 depth--; \
2534 st--; \
86545054 2535 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2536 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2537 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2538 } \
2539 PL_regmatch_state = st; \
2540 scan = st->scan; \
2541 next = st->next; \
2542 n = st->n; \
2543 locinput = st->locinput; \
2544 nextchr = UCHARAT(locinput);
2545
d6a28714
JH
2546/*
2547 - regmatch - main matching routine
2548 *
2549 * Conceptually the strategy is simple: check to see whether the current
2550 * node matches, call self recursively to see whether the rest matches,
2551 * and then act accordingly. In practice we make some effort to avoid
2552 * recursion, in particular by going through "ordinary" nodes (that don't
2553 * need to know whether the rest of the match failed) by a loop instead of
2554 * by recursion.
2555 */
2556/* [lwall] I've hoisted the register declarations to the outer block in order to
2557 * maybe save a little bit of pushing and popping on the stack. It also takes
2558 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2559 *
2560 * This function used to be heavily recursive, but since this had the
2561 * effect of blowing the CPU stack on complex regexes, it has been
2562 * restructured to be iterative, and to save state onto the heap rather
2563 * than the stack. Essentially whereever regmatch() used to be called, it
2564 * pushes the current state, notes where to return, then jumps back into
2565 * the main loop.
2566 *
2567 * Originally the structure of this function used to look something like
2568
2569 S_regmatch() {
2570 int a = 1, b = 2;
2571 ...
2572 while (scan != NULL) {
5d9a96ca 2573 a++; // do stuff with a and b
95b24440
DM
2574 ...
2575 switch (OP(scan)) {
2576 case FOO: {
2577 int local = 3;
2578 ...
2579 if (regmatch(...)) // recurse
2580 goto yes;
2581 }
2582 ...
2583 }
2584 }
2585 yes:
2586 return 1;
2587 }
2588
2589 * Now it looks something like this:
2590
5d9a96ca 2591 typedef struct {
95b24440
DM
2592 int a, b, local;
2593 int resume_state;
5d9a96ca 2594 } regmatch_state;
95b24440
DM
2595
2596 S_regmatch() {
5d9a96ca
DM
2597 regmatch_state *st = new();
2598 int depth=0;
2599 st->a++; // do stuff with a and b
95b24440
DM
2600 ...
2601 while (scan != NULL) {
2602 ...
2603 switch (OP(scan)) {
2604 case FOO: {
5d9a96ca 2605 st->local = 3;
95b24440 2606 ...
5d9a96ca
DM
2607 st->scan = scan;
2608 scan = ...;
2609 st->resume_state = resume_FOO;
2610 goto start_recurse; // recurse
95b24440 2611
5d9a96ca
DM
2612 resume_point_FOO:
2613 if (result)
95b24440
DM
2614 goto yes;
2615 }
2616 ...
2617 }
5d9a96ca
DM
2618 start_recurse:
2619 st = new(); push a new state
2620 st->a = 1; st->b = 2;
2621 depth++;
95b24440 2622 }
5d9a96ca 2623 yes:
95b24440 2624 result = 1;
5d9a96ca
DM
2625 if (depth--) {
2626 st = pop();
95b24440
DM
2627 switch (resume_state) {
2628 case resume_FOO:
2629 goto resume_point_FOO;
2630 ...
2631 }
2632 }
2633 return result
2634 }
2635
2636 * WARNING: this means that any line in this function that contains a
2637 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2638 * regmatch() using gotos instead. Thus the values of any local variables
2639 * not saved in the regmatch_state structure will have been lost when
2640 * execution resumes on the next line .
5d9a96ca
DM
2641 *
2642 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2643 * PL_regmatch_state always points to the currently active state, and
2644 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2645 * The first time regmatch is called, the first slab is allocated, and is
2646 * never freed until interpreter desctruction. When the slab is full,
2647 * a new one is allocated chained to the end. At exit from regmatch, slabs
2648 * allocated since entry are freed.
d6a28714 2649 */
95b24440 2650
3dab1dad 2651#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2652
07be1b83
YO
2653#ifdef DEBUGGING
2654STATIC void
2655S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2656{
2657 const int docolor = *PL_colors[0];
2658 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2659 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2660 /* The part of the string before starttry has one color
2661 (pref0_len chars), between starttry and current
2662 position another one (pref_len - pref0_len chars),
2663 after the current position the third one.
2664 We assume that pref0_len <= pref_len, otherwise we
2665 decrease pref0_len. */
2666 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2667 ? (5 + taill) - l : locinput - PL_bostr;
2668 int pref0_len;
2669
2670 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2671 pref_len++;
2672 pref0_len = pref_len - (locinput - PL_reg_starttry);
2673 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2674 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2675 ? (5 + taill) - pref_len : PL_regeol - locinput);
2676 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2677 l--;
2678 if (pref0_len < 0)
2679 pref0_len = 0;
2680 if (pref0_len > pref_len)
2681 pref0_len = pref_len;
2682 {
2683 const char * const s0 =
2684 do_utf8 && OP(scan) != CANY ?
2685 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2686 pref0_len, 60, UNI_DISPLAY_REGEX) :
2687 locinput - pref_len;
2688 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2689 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2690 pv_uni_display(PERL_DEBUG_PAD(1),
2691 (U8*)(locinput - pref_len + pref0_len),
2692 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2693 locinput - pref_len + pref0_len;
2694 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2695 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2696 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2697 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2698 locinput;
2699 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2700 PerlIO_printf(Perl_debug_log,
2701 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2702 (IV)(locinput - PL_bostr),
2703 PL_colors[4],
2704 len0, s0,
2705 PL_colors[5],
2706 PL_colors[2],
2707 len1, s1,
2708 PL_colors[3],
2709 (docolor ? "" : "> <"),
2710 PL_colors[0],
2711 len2, s2,
2712 PL_colors[1],
2713 15 - l - pref_len + 1,
2714 "");
2715 }
2716}
2717#endif
2718
d6a28714 2719STATIC I32 /* 0 failure, 1 success */
3b0527fe 2720S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2721{
54df2634
NC
2722#if PERL_VERSION < 9
2723 dMY_CXT;
2724#endif
27da23d5 2725 dVAR;
95b24440 2726 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2727 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2728
3b0527fe
DM
2729 regexp *rex = reginfo->prog;
2730
5d9a96ca
DM
2731 regmatch_slab *orig_slab;
2732 regmatch_state *orig_state;
a3621e74 2733
5d9a96ca
DM
2734 /* the current state. This is a cached copy of PL_regmatch_state */
2735 register regmatch_state *st;
95b24440 2736
5d9a96ca
DM
2737 /* cache heavy used fields of st in registers */
2738 register regnode *scan;
2739 register regnode *next;
2740 register I32 n = 0; /* initialize to shut up compiler warning */
2741 register char *locinput = PL_reginput;
95b24440 2742
5d9a96ca
DM
2743 /* these variables are NOT saved during a recusive RFEGMATCH: */
2744 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2745 bool result; /* return value of S_regmatch */
2746 regnode *inner; /* Next node in internal branch. */
2747 int depth = 0; /* depth of recursion */
aa283a38 2748 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2749 regmatch_state *yes_state = NULL; /* state to pop to on success of
2750 subpattern */
95b24440
DM
2751
2752#ifdef DEBUGGING
ab74612d 2753 SV *re_debug_flags = NULL;
a3621e74 2754 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2755 PL_regindent++;
2756#endif
2757
5d9a96ca
DM
2758 /* on first ever call to regmatch, allocate first slab */
2759 if (!PL_regmatch_slab) {
2760 Newx(PL_regmatch_slab, 1, regmatch_slab);
2761 PL_regmatch_slab->prev = NULL;
2762 PL_regmatch_slab->next = NULL;
86545054 2763 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2764 }
2765
2766 /* remember current high-water mark for exit */
2767 /* XXX this should be done with SAVE* instead */
2768 orig_slab = PL_regmatch_slab;
2769 orig_state = PL_regmatch_state;
2770
2771 /* grab next free state slot */
2772 st = ++PL_regmatch_state;
86545054 2773 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2774 st = PL_regmatch_state = S_push_slab(aTHX);
2775
2776 st->minmod = 0;
2777 st->sw = 0;
2778 st->logical = 0;
2779 st->unwind = 0;
2780 st->cc = NULL;
d6a28714
JH
2781 /* Note that nextchr is a byte even in UTF */
2782 nextchr = UCHARAT(locinput);
2783 scan = prog;
2784 while (scan != NULL) {
8ba1375e 2785
a3621e74 2786 DEBUG_EXECUTE_r( {
6136c704 2787 SV * const prop = sv_newmortal();
07be1b83 2788 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2789 regprop(rex, prop, scan);
07be1b83
YO
2790
2791 PerlIO_printf(Perl_debug_log,
2792 "%3"IVdf":%*s%s(%"IVdf")\n",
2793 (IV)(scan - rex->program), PL_regindent*2, "",
2794 SvPVX_const(prop),
2795 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2796 });
d6a28714
JH
2797
2798 next = scan + NEXT_OFF(scan);
2799 if (next == scan)
2800 next = NULL;
2801
2802 switch (OP(scan)) {
2803 case BOL:
7fba1cd6 2804 if (locinput == PL_bostr)
d6a28714 2805 {
3b0527fe 2806 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2807 break;
2808 }
d6a28714
JH
2809 sayNO;
2810 case MBOL:
12d33761
HS
2811 if (locinput == PL_bostr ||
2812 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2813 {
b8c5462f
JH
2814 break;
2815 }
d6a28714
JH
2816 sayNO;
2817 case SBOL:
c2a73568 2818 if (locinput == PL_bostr)
b8c5462f 2819 break;
d6a28714
JH
2820 sayNO;
2821 case GPOS:
3b0527fe 2822 if (locinput == reginfo->ganch)
d6a28714
JH
2823 break;
2824 sayNO;
2825 case EOL:
d6a28714
JH
2826 goto seol;
2827 case MEOL:
d6a28714 2828 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2829 sayNO;
b8c5462f 2830 break;
d6a28714
JH
2831 case SEOL:
2832 seol:
2833 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2834 sayNO;
d6a28714 2835 if (PL_regeol - locinput > 1)
b8c5462f 2836 sayNO;
b8c5462f 2837 break;
d6a28714
JH
2838 case EOS:
2839 if (PL_regeol != locinput)
b8c5462f 2840 sayNO;
d6a28714 2841 break;
ffc61ed2 2842 case SANY:
d6a28714 2843 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2844 sayNO;
f33976b4
DB
2845 if (do_utf8) {
2846 locinput += PL_utf8skip[nextchr];
2847 if (locinput > PL_regeol)
2848 sayNO;
2849 nextchr = UCHARAT(locinput);
2850 }
2851 else
2852 nextchr = UCHARAT(++locinput);
2853 break;
2854 case CANY:
2855 if (!nextchr && locinput >= PL_regeol)
2856 sayNO;
b8c5462f 2857 nextchr = UCHARAT(++locinput);
a0d0e21e 2858 break;
ffc61ed2 2859 case REG_ANY:
1aa99e6b
IH
2860 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2861 sayNO;
2862 if (do_utf8) {
b8c5462f 2863 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2864 if (locinput > PL_regeol)
2865 sayNO;
a0ed51b3 2866 nextchr = UCHARAT(locinput);
a0ed51b3 2867 }
1aa99e6b
IH
2868 else
2869 nextchr = UCHARAT(++locinput);
a0ed51b3 2870 break;
5b47454d 2871 case TRIE:
3dab1dad 2872 {
07be1b83 2873 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2874 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2875 trie_type = do_utf8 ?
2876 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2877 : trie_plain;
2878
2879 /* what trie are we using right now */
be8e71aa 2880 reg_trie_data * const trie
3dab1dad
YO
2881 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2882 U32 state = trie->startstate;
2883
2884 if (trie->bitmap && trie_type != trie_utf8_fold &&
2885 !TRIE_BITMAP_TEST(trie,*locinput)
2886 ) {
2887 if (trie->states[ state ].wordnum) {
2888 DEBUG_EXECUTE_r(
2889 PerlIO_printf(Perl_debug_log,
2890 "%*s %smatched empty string...%s\n",
2891 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2892 );
2893 break;
2894 } else {
2895 DEBUG_EXECUTE_r(
2896 PerlIO_printf(Perl_debug_log,
2897 "%*s %sfailed to match start class...%s\n",
2898 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2899 );
2900 sayNO_SILENT;
2901 }
2902 }
a3621e74 2903 {
07be1b83
YO
2904 /*
2905 traverse the TRIE keeping track of all accepting states
2906 we transition through until we get to a failing node.
2907 */
2908
a3621e74 2909 U8 *uc = ( U8* )locinput;
a3621e74
YO
2910 U16 charid = 0;
2911 U32 base = 0;
2912 UV uvc = 0;
2913 STRLEN len = 0;
2914 STRLEN foldlen = 0;
a3621e74
YO
2915 U8 *uscan = (U8*)NULL;
2916 STRLEN bufflen=0;
95b24440 2917 SV *sv_accept_buff = NULL;
5b47454d 2918
d8319b27 2919 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2920 result = 0;
a3621e74
YO
2921
2922 while ( state && uc <= (U8*)PL_regeol ) {
2923
5b47454d 2924 if (trie->states[ state ].wordnum) {
d8319b27 2925 if (!st->u.trie.accepted ) {
5b47454d
DM
2926 ENTER;
2927 SAVETMPS;
2928 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2929 sv_accept_buff=newSV(bufflen *
2930 sizeof(reg_trie_accepted) - 1);
2931 SvCUR_set(sv_accept_buff,
2932 sizeof(reg_trie_accepted));
2933 SvPOK_on(sv_accept_buff);
2934 sv_2mortal(sv_accept_buff);
d8319b27 2935 st->u.trie.accept_buff =
5b47454d
DM
2936 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2937 }
2938 else {
d8319b27 2939 if (st->u.trie.accepted >= bufflen) {
5b47454d 2940 bufflen *= 2;
d8319b27 2941 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2942 SvGROW(sv_accept_buff,
2943 bufflen * sizeof(reg_trie_accepted));
2944 }
2945 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2946 + sizeof(reg_trie_accepted));
2947 }
d8319b27
DM
2948 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2949 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2950 ++st->u.trie.accepted;
5b47454d 2951 }
a3621e74
YO
2952
2953 base = trie->states[ state ].trans.base;
2954
07be1b83
YO
2955 DEBUG_TRIE_EXECUTE_r({
2956 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2957 PerlIO_printf( Perl_debug_log,
e4584336 2958 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2959 2+PL_regindent * 2, "", PL_colors[4],
d8319b27 2960 (UV)state, (UV)base, (UV)st->u.trie.accepted );
07be1b83 2961 });
a3621e74
YO
2962
2963 if ( base ) {
5b47454d 2964 switch (trie_type) {
3dab1dad 2965 case trie_utf8_fold:
a3621e74
YO
2966 if ( foldlen>0 ) {
2967 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2968 foldlen -= len;
2969 uscan += len;
2970 len=0;
2971 } else {
1df70142 2972 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2973 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2974 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2975 foldlen -= UNISKIP( uvc );
2976 uscan = foldbuf + UNISKIP( uvc );
2977 }
5b47454d
DM
2978 break;
2979 case trie_utf8:
2980 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2981 &len, uniflags );
2982 break;
2983 case trie_plain:
e4584336 2984 uvc = (UV)*uc;
a3621e74
YO
2985 len = 1;
2986 }
2987
5b47454d
DM
2988 if (uvc < 256) {
2989 charid = trie->charmap[ uvc ];
2990 }
2991 else {
2992 charid = 0;
2993 if (trie->widecharmap) {
3dab1dad 2994 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2995 (char*)&uvc, sizeof(UV), 0);
2996 if (svpp)
2997 charid = (U16)SvIV(*svpp);
2998 }
2999 }
a3621e74 3000
5b47454d
DM
3001 if (charid &&
3002 (base + charid > trie->uniquecharcount )
3003 && (base + charid - 1 - trie->uniquecharcount
3004 < trie->lasttrans)
3005 && trie->trans[base + charid - 1 -
3006 trie->uniquecharcount].check == state)
3007 {
3008 state = trie->trans[base + charid - 1 -
3009 trie->uniquecharcount ].next;
3010 }
3011 else {
3012 state = 0;
3013 }
3014 uc += len;
3015
3016 }
3017 else {
a3621e74
YO
3018 state = 0;
3019 }
3020 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3021 PerlIO_printf( Perl_debug_log,
3022 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3023 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3024 );
3025 }
d8319b27 3026 if (!st->u.trie.accepted )
a3621e74 3027 sayNO;
a3621e74
YO
3028
3029 /*
3030 There was at least one accepting state that we
3031 transitioned through. Presumably the number of accepting
3032 states is going to be low, typically one or two. So we
3033 simply scan through to find the one with lowest wordnum.
3034 Once we find it, we swap the last state into its place
3035 and decrement the size. We then try to match the rest of
3036 the pattern at the point where the word ends, if we
3037 succeed then we end the loop, otherwise the loop
3038 eventually terminates once all of the accepting states
3039 have been tried.
3040 */
a3621e74 3041
d8319b27 3042 if ( st->u.trie.accepted == 1 ) {
f2278c82 3043 DEBUG_EXECUTE_r({
be8e71aa 3044 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3045 ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
3046 : NULL;
3047 PerlIO_printf( Perl_debug_log,
3048 "%*s %sonly one match : #%d <%s>%s\n",
3049 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3050 st->u.trie.accept_buff[ 0 ].wordnum,
3051 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3052 PL_colors[5] );
3053 });
d8319b27 3054 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
3055 /* in this case we free tmps/leave before we call regmatch
3056 as we wont be using accept_buff again. */
3057 FREETMPS;
3058 LEAVE;
07be1b83 3059 /* do we need this? why dont we just do a break? */
95b24440
DM
3060 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
3061 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3062 } else {
3063 DEBUG_EXECUTE_r(
e4584336 3064 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 3065 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
3066 PL_colors[5] );
3067 );
d8319b27 3068 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
3069 U32 best = 0;
3070 U32 cur;
d8319b27 3071 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
3072 DEBUG_TRIE_EXECUTE_r(
3073 PerlIO_printf( Perl_debug_log,
3074 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3075 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
3076 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
3077 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 3078 );
a3621e74 3079
d8319b27
DM
3080 if (st->u.trie.accept_buff[cur].wordnum <
3081 st->u.trie.accept_buff[best].wordnum)
e822a8b4 3082 best = cur;
a3621e74 3083 }
f2278c82
YO
3084 DEBUG_EXECUTE_r({
3085 reg_trie_data * const trie = (reg_trie_data*)
3086 rex->data->data[ARG(scan)];
be8e71aa 3087 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3088 ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
3089 : NULL;
3090 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3091 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3092 st->u.trie.accept_buff[best].wordnum,
3093 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3094 PL_colors[5] );
3095 });
d8319b27
DM
3096 if ( best<st->u.trie.accepted ) {
3097 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
3098 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
3099 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
3100 best = st->u.trie.accepted;
a3621e74 3101 }
d8319b27 3102 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
3103
3104 /*
3105 as far as I can tell we only need the SAVETMPS/FREETMPS
3106 for re's with EVAL in them but I'm leaving them in for
3107 all until I can be sure.
3108 */
3109 SAVETMPS;
95b24440
DM
3110 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
3111 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3112 FREETMPS;
3113 }
3114 FREETMPS;
3115 LEAVE;
3116 }
3117
95b24440 3118 if (result) {
a3621e74
YO
3119 sayYES;
3120 } else {
3121 sayNO;
3122 }
3dab1dad 3123 }}
a3621e74 3124 /* unreached codepoint */
95b24440
DM
3125 case EXACT: {
3126 char *s = STRING(scan);
5d9a96ca 3127 st->ln = STR_LEN(scan);
eb160463 3128 if (do_utf8 != UTF) {
bc517b45 3129 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3130 char *l = locinput;
be8e71aa 3131 const char * const e = s + st->ln;
a72c7584 3132
5ff6fc6d
JH
3133 if (do_utf8) {
3134 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3135 while (s < e) {
a3b680e6 3136 STRLEN ulen;
1aa99e6b 3137 if (l >= PL_regeol)
5ff6fc6d
JH
3138 sayNO;
3139 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3140 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3141 uniflags))
5ff6fc6d 3142 sayNO;
bc517b45 3143 l += ulen;
5ff6fc6d 3144 s ++;
1aa99e6b 3145 }
5ff6fc6d
JH
3146 }
3147 else {
3148 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3149 while (s < e) {
a3b680e6 3150 STRLEN ulen;
1aa99e6b
IH
3151 if (l >= PL_regeol)
3152 sayNO;
5ff6fc6d 3153 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3154 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3155 uniflags))
1aa99e6b 3156 sayNO;
bc517b45 3157 s += ulen;
a72c7584 3158 l ++;
1aa99e6b 3159 }
5ff6fc6d 3160 }
1aa99e6b
IH
3161 locinput = l;
3162 nextchr = UCHARAT(locinput);
3163 break;
3164 }
bc517b45 3165 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3166 /* Inline the first character, for speed. */
3167 if (UCHARAT(s) != nextchr)
3168 sayNO;
5d9a96ca 3169 if (PL_regeol - locinput < st->ln)
d6a28714 3170 sayNO;
5d9a96ca 3171 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3172 sayNO;
5d9a96ca 3173 locinput += st->ln;
d6a28714
JH
3174 nextchr = UCHARAT(locinput);
3175 break;
95b24440 3176 }
d6a28714 3177 case EXACTFL:
b8c5462f
JH
3178 PL_reg_flags |= RF_tainted;
3179 /* FALL THROUGH */
95b24440 3180 case EXACTF: {
be8e71aa 3181 char * const s = STRING(scan);
5d9a96ca 3182 st->ln = STR_LEN(scan);
d6a28714 3183
d07ddd77
JH
3184 if (do_utf8 || UTF) {
3185 /* Either target or the pattern are utf8. */
be8e71aa 3186 const char * const l = locinput;
d07ddd77 3187 char *e = PL_regeol;
bc517b45 3188
5d9a96ca 3189 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3190 l, &e, 0, do_utf8)) {
5486206c
JH
3191 /* One more case for the sharp s:
3192 * pack("U0U*", 0xDF) =~ /ss/i,
3193 * the 0xC3 0x9F are the UTF-8
3194 * byte sequence for the U+00DF. */
3195 if (!(do_utf8 &&
3196 toLOWER(s[0]) == 's' &&
5d9a96ca 3197 st->ln >= 2 &&
5486206c
JH
3198 toLOWER(s[1]) == 's' &&
3199 (U8)l[0] == 0xC3 &&
3200 e - l >= 2 &&
3201 (U8)l[1] == 0x9F))
3202 sayNO;
3203 }
d07ddd77
JH
3204 locinput = e;
3205 nextchr = UCHARAT(locinput);
3206 break;
a0ed51b3 3207 }
d6a28714 3208
bc517b45
JH
3209 /* Neither the target and the pattern are utf8. */
3210
d6a28714
JH
3211 /* Inline the first character, for speed. */
3212 if (UCHARAT(s) != nextchr &&
3213 UCHARAT(s) != ((OP(scan) == EXACTF)
3214 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3215 sayNO;
5d9a96ca 3216 if (PL_regeol - locinput < st->ln)
b8c5462f 3217 sayNO;
5d9a96ca
DM
3218 if (st->ln > 1 && (OP(scan) == EXACTF
3219 ? ibcmp(s, locinput, st->ln)
3220 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3221 sayNO;
5d9a96ca 3222 locinput += st->ln;
d6a28714 3223 nextchr = UCHARAT(locinput);
a0d0e21e 3224 break;
95b24440 3225 }
d6a28714 3226 case ANYOF:
ffc61ed2 3227 if (do_utf8) {
9e55ce06
JH
3228 STRLEN inclasslen = PL_regeol - locinput;
3229
32fc9b6a 3230 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3231 sayNO_ANYOF;
ffc61ed2
JH
3232 if (locinput >= PL_regeol)
3233 sayNO;
0f0076b4 3234 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3235 nextchr = UCHARAT(locinput);
e0f9d4a8 3236 break;
ffc61ed2
JH
3237 }
3238 else {
3239 if (nextchr < 0)
3240 nextchr = UCHARAT(locinput);
32fc9b6a 3241 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3242 sayNO_ANYOF;
ffc61ed2
JH
3243 if (!nextchr && locinput >= PL_regeol)
3244 sayNO;
3245 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3246 break;
3247 }
3248 no_anyof:
3249 /* If we might have the case of the German sharp s
3250 * in a casefolding Unicode character class. */
3251
ebc501f0
JH
3252 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3253 locinput += SHARP_S_SKIP;
e0f9d4a8 3254 nextchr = UCHARAT(locinput);
ffc61ed2 3255 }
e0f9d4a8
JH
3256 else
3257 sayNO;
b8c5462f 3258 break;
d6a28714 3259 case ALNUML:
b8c5462f
JH
3260 PL_reg_flags |= RF_tainted;
3261 /* FALL THROUGH */
d6a28714 3262 case ALNUM:
b8c5462f 3263 if (!nextchr)
4633a7c4 3264 sayNO;
ffc61ed2 3265 if (do_utf8) {
1a4fad37 3266 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3267 if (!(OP(scan) == ALNUM
bb7a0f54 3268 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3269 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3270 {
3271 sayNO;
a0ed51b3 3272 }
b8c5462f 3273 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3274 nextchr = UCHARAT(locinput);
3275 break;
3276 }
ffc61ed2 3277 if (!(OP(scan) == ALNUM
d6a28714 3278 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3279 sayNO;
b8c5462f 3280 nextchr = UCHARAT(++locinput);
a0d0e21e 3281 break;
d6a28714 3282 case NALNUML:
b8c5462f
JH
3283 PL_reg_flags |= RF_tainted;
3284 /* FALL THROUGH */
d6a28714
JH
3285 case NALNUM:
3286 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3287 sayNO;
ffc61ed2 3288 if (do_utf8) {
1a4fad37 3289 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3290 if (OP(scan) == NALNUM
bb7a0f54 3291 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3292 : isALNUM_LC_utf8((U8*)locinput))
3293 {
b8c5462f 3294 sayNO;
d6a28714 3295 }
b8c5462f
JH
3296 locinput += PL_utf8skip[nextchr];
3297 nextchr = UCHARAT(locinput);
3298 break;
3299 }
ffc61ed2 3300 if (OP(scan) == NALNUM
d6a28714 3301 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3302 sayNO;
76e3520e 3303 nextchr = UCHARAT(++locinput);
a0d0e21e 3304 break;
d6a28714
JH
3305 case BOUNDL:
3306 case NBOUNDL:
3280af22 3307 PL_reg_flags |= RF_tainted;
bbce6d69 3308 /* FALL THROUGH */
d6a28714
JH
3309 case BOUND:
3310 case NBOUND:
3311 /* was last char in word? */
ffc61ed2 3312 if (do_utf8) {
12d33761 3313 if (locinput == PL_bostr)
5d9a96ca 3314 st->ln = '\n';
ffc61ed2 3315 else {
a3b680e6 3316 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3317
4ad0818d 3318 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3319 }
3320 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3321 st->ln = isALNUM_uni(st->ln);
1a4fad37 3322 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3323 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3324 }
3325 else {
5d9a96ca 3326 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3327 n = isALNUM_LC_utf8((U8*)locinput);
3328 }
a0ed51b3 3329 }
d6a28714 3330 else {
5d9a96ca 3331 st->ln = (locinput != PL_bostr) ?
12d33761 3332 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3333 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3334 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3335 n = isALNUM(nextchr);
3336 }
3337 else {
5d9a96ca 3338 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3339 n = isALNUM_LC(nextchr);
3340 }
d6a28714 3341 }
5d9a96ca 3342 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3343 OP(scan) == BOUNDL))
3344 sayNO;
a0ed51b3 3345 break;
d6a28714 3346 case SPACEL:
3280af22 3347 PL_reg_flags |= RF_tainted;
bbce6d69 3348 /* FALL THROUGH */
d6a28714 3349 case SPACE:
9442cb0e 3350 if (!nextchr)
4633a7c4 3351 sayNO;
1aa99e6b 3352 if (do_utf8) {
fd400ab9 3353 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3354 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3355 if (!(OP(scan) == SPACE
bb7a0f54 3356 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3357 : isSPACE_LC_utf8((U8*)locinput)))
3358 {
3359 sayNO;
3360 }
3361 locinput += PL_utf8skip[nextchr];
3362 nextchr = UCHARAT(locinput);
3363 break;
d6a28714 3364 }
ffc61ed2
JH
3365 if (!(OP(scan) == SPACE
3366 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3367 sayNO;
3368 nextchr = UCHARAT(++locinput);
3369 }
3370 else {
3371 if (!(OP(scan) == SPACE
3372 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3373 sayNO;
3374 nextchr = UCHARAT(++locinput);
a0ed51b3 3375 }
a0ed51b3 3376 break;
d6a28714 3377 case NSPACEL:
3280af22 3378 PL_reg_flags |= RF_tainted;
bbce6d69 3379 /* FALL THROUGH */
d6a28714 3380 case NSPACE:
9442cb0e 3381 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3382 sayNO;
1aa99e6b 3383 if (do_utf8) {
1a4fad37 3384 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3385 if (OP(scan) == NSPACE
bb7a0f54 3386 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3387 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3388 {
3389 sayNO;
3390 }
3391 locinput += PL_utf8skip[nextchr];
3392 nextchr = UCHARAT(locinput);
3393 break;
a0ed51b3 3394 }
ffc61ed2 3395 if (OP(scan) == NSPACE
d6a28714 3396 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3397 sayNO;
76e3520e 3398 nextchr = UCHARAT(++locinput);
a0d0e21e 3399 break;
d6a28714 3400 case DIGITL:
a0ed51b3
LW
3401 PL_reg_flags |= RF_tainted;
3402 /* FALL THROUGH */
d6a28714 3403 case DIGIT:
9442cb0e 3404 if (!nextchr)
a0ed51b3 3405 sayNO;
1aa99e6b 3406 if (do_utf8) {
1a4fad37 3407 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3408 if (!(OP(scan) == DIGIT
bb7a0f54 3409 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3410 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3411 {
a0ed51b3 3412 sayNO;
dfe13c55 3413 }
6f06b55f 3414 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3415 nextchr = UCHARAT(locinput);
3416 break;
3417 }
ffc61ed2 3418 if (!(OP(scan) == DIGIT
9442cb0e 3419 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3420 sayNO;
3421 nextchr = UCHARAT(++locinput);
3422 break;
d6a28714 3423 case NDIGITL:
b8c5462f
JH
3424 PL_reg_flags |= RF_tainted;
3425 /* FALL THROUGH */
d6a28714 3426 case NDIGIT:
9442cb0e 3427 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3428 sayNO;
1aa99e6b 3429 if (do_utf8) {
1a4fad37 3430 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3431 if (OP(scan) == NDIGIT
bb7a0f54 3432 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3433 : isDIGIT_LC_utf8((U8*)locinput))
3434 {
a0ed51b3 3435 sayNO;
9442cb0e 3436 }
6f06b55f 3437 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3438 nextchr = UCHARAT(locinput);
3439 break;
3440 }
ffc61ed2 3441 if (OP(scan) == NDIGIT
9442cb0e 3442 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3443 sayNO;
3444 nextchr = UCHARAT(++locinput);
3445 break;
3446 case CLUMP:
b7c83a7e 3447 if (locinput >= PL_regeol)
a0ed51b3 3448 sayNO;
b7c83a7e 3449 if (do_utf8) {
1a4fad37 3450 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3451 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3452 sayNO;
3453 locinput += PL_utf8skip[nextchr];
3454 while (locinput < PL_regeol &&
3455 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3456 locinput += UTF8SKIP(locinput);
3457 if (locinput > PL_regeol)
3458 sayNO;
eb08e2da
JH
3459 }
3460 else
3461 locinput++;
a0ed51b3
LW
3462 nextchr = UCHARAT(locinput);
3463 break;
c8756f30 3464 case REFFL:
3280af22 3465 PL_reg_flags |= RF_tainted;
c8756f30 3466 /* FALL THROUGH */
c277df42 3467 case REF:
95b24440
DM
3468 case REFF: {
3469 char *s;
c277df42 3470 n = ARG(scan); /* which paren pair */
5d9a96ca 3471 st->ln = PL_regstartp[n];
2c2d71f5 3472 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3473 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3474 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3475 if (st->ln == PL_regendp[n])
a0d0e21e 3476 break;
a0ed51b3 3477
5d9a96ca 3478 s = PL_bostr + st->ln;
1aa99e6b 3479 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3480 char *l = locinput;
a3b680e6 3481 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3482 /*
3483 * Note that we can't do the "other character" lookup trick as
3484 * in the 8-bit case (no pun intended) because in Unicode we
3485 * have to map both upper and title case to lower case.
3486 */
3487 if (OP(scan) == REFF) {
3488 while (s < e) {
a3b680e6
AL
3489 STRLEN ulen1, ulen2;
3490 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3491 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3492
a0ed51b3
LW
3493 if (l >= PL_regeol)
3494 sayNO;
a2a2844f
JH
3495 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3496 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3497 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3498 sayNO;
a2a2844f
JH
3499 s += ulen1;
3500 l += ulen2;
a0ed51b3
LW
3501 }
3502 }
3503 locinput = l;
3504 nextchr = UCHARAT(locinput);
3505 break;
3506 }
3507
a0d0e21e 3508 /* Inline the first character, for speed. */
76e3520e 3509 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3510 (OP(scan) == REF ||
3511 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3512 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3513 sayNO;
5d9a96ca
DM
3514 st->ln = PL_regendp[n] - st->ln;
3515 if (locinput + st->ln > PL_regeol)
4633a7c4 3516 sayNO;
5d9a96ca
DM
3517 if (st->ln > 1 && (OP(scan) == REF
3518 ? memNE(s, locinput, st->ln)
c8756f30 3519 : (OP(scan) == REFF
5d9a96ca
DM
3520 ? ibcmp(s, locinput, st->ln)
3521 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3522 sayNO;
5d9a96ca 3523 locinput += st->ln;
76e3520e 3524 nextchr = UCHARAT(locinput);
a0d0e21e 3525 break;
95b24440 3526 }
a0d0e21e
LW
3527
3528 case NOTHING:
c277df42 3529 case TAIL:
a0d0e21e
LW
3530 break;
3531 case BACK:
3532 break;
c277df42
IZ
3533 case EVAL:
3534 {
c277df42 3535 SV *ret;
8e5e9ebe 3536 {
4aabdb9b
DM
3537 /* execute the code in the {...} */
3538 dSP;
6136c704 3539 SV ** const before = SP;
4aabdb9b
DM
3540 OP_4tree * const oop = PL_op;
3541 COP * const ocurcop = PL_curcop;
3542 PAD *old_comppad;
4aabdb9b
DM
3543
3544 n = ARG(scan);
32fc9b6a 3545 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3546 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3547 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3548 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3549
8e5e9ebe
RGS
3550 CALLRUNOPS(aTHX); /* Scalar context. */
3551 SPAGAIN;
3552 if (SP == before)
075aa684 3553 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3554 else {
3555 ret = POPs;
3556 PUTBACK;
3557 }
4aabdb9b
DM
3558
3559 PL_op = oop;
3560 PAD_RESTORE_LOCAL(old_comppad);
3561 PL_curcop = ocurcop;
3562 if (!st->logical) {
3563 /* /(?{...})/ */
3564 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3565 break;
3566 }
8e5e9ebe 3567 }
4aabdb9b
DM
3568 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3569 regexp *re;
4aabdb9b 3570 {
4f639d21
DM
3571 /* extract RE object from returned value; compiling if
3572 * necessary */
3573
6136c704 3574 MAGIC *mg = NULL;
be8e71aa 3575 const SV *sv;
faf82a0b
AE
3576 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3577 mg = mg_find(sv, PERL_MAGIC_qr);
3578 else if (SvSMAGICAL(ret)) {
3579 if (SvGMAGICAL(ret))
3580 sv_unmagic(ret, PERL_MAGIC_qr);
3581 else
3582 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3583 }
faf82a0b 3584
0f5d15d6
IZ
3585 if (mg) {
3586 re = (regexp *)mg->mg_obj;
df0003d4 3587 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3588 }
3589 else {
3590 STRLEN len;
6136c704 3591 const char * const t = SvPV_const(ret, len);
0f5d15d6 3592 PMOP pm;
a3b680e6 3593 const I32 osize = PL_regsize;
0f5d15d6 3594
5fcd1c1b 3595 Zero(&pm, 1, PMOP);
4aabdb9b 3596 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3597 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3598 if (!(SvFLAGS(ret)
faf82a0b
AE
3599 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3600 | SVs_GMG)))
14befaf4
DM
3601 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3602 PERL_MAGIC_qr,0,0);
0f5d15d6 3603 PL_regsize = osize;
0f5d15d6 3604 }
4aabdb9b 3605 }
aa283a38
DM
3606
3607 /* run the pattern returned from (??{...}) */
3608
4aabdb9b
DM
3609 DEBUG_EXECUTE_r(
3610 PerlIO_printf(Perl_debug_log,
3611 "Entering embedded \"%s%.60s%s%s\"\n",
3612 PL_colors[0],
3613 re->precomp,
3614 PL_colors[1],
3615 (strlen(re->precomp) > 60 ? "..." : ""))
3616 );
2c2d71f5 3617
4aabdb9b
DM
3618 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3619 REGCP_SET(st->u.eval.lastcp);
4aabdb9b
DM
3620 *PL_reglastparen = 0;
3621 *PL_reglastcloseparen = 0;
4aabdb9b 3622 PL_reginput = locinput;
4aabdb9b
DM
3623
3624 /* XXXX This is too dramatic a measure... */
3625 PL_reg_maxiter = 0;
3626
5d9a96ca 3627 st->logical = 0;
aa283a38
DM
3628 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3629 ((re->reganch & ROPT_UTF8) != 0);
3630 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3631 st->u.eval.prev_rex = rex;
aa283a38 3632 rex = re;
aa283a38 3633
77cb431f
DM
3634 /* resume to current state on success */
3635 st->u.yes.prev_yes_state = yes_state;
3636 yes_state = st;
aa283a38
DM
3637 PUSH_STATE(newst, resume_EVAL);
3638 st = newst;
3639
3640 /* now continue from first node in postoned RE */
3641 next = re->program + 1;
3642 break;
4aabdb9b 3643 /* NOTREACHED */
a0ed51b3 3644 }
4aabdb9b
DM
3645 /* /(?(?{...})X|Y)/ */
3646 st->sw = SvTRUE(ret);
3647 st->logical = 0;
c277df42
IZ
3648 break;
3649 }
a0d0e21e 3650 case OPEN:
c277df42 3651 n = ARG(scan); /* which paren pair */
3280af22
NIS
3652 PL_reg_start_tmp[n] = locinput;
3653 if (n > PL_regsize)
3654 PL_regsize = n;
a0d0e21e
LW
3655 break;
3656 case CLOSE:
c277df42 3657 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3658 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3659 PL_regendp[n] = locinput - PL_bostr;
eb160463 3660 if (n > (I32)*PL_reglastparen)
3280af22 3661 *PL_reglastparen = n;
a01268b5 3662 *PL_reglastcloseparen = n;
a0d0e21e 3663 break;
c277df42
IZ
3664 case GROUPP:
3665 n = ARG(scan); /* which paren pair */
5d9a96ca 3666 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3667 break;
3668 case IFTHEN:
2c2d71f5 3669 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3670 if (st->sw)
c277df42
IZ
3671 next = NEXTOPER(NEXTOPER(scan));
3672 else {
3673 next = scan + ARG(scan);
3674 if (OP(next) == IFTHEN) /* Fake one. */
3675 next = NEXTOPER(NEXTOPER(next));
3676 }
3677 break;
3678 case LOGICAL:
5d9a96ca 3679 st->logical = scan->flags;
c277df42 3680 break;
2ab05381 3681/*******************************************************************
a0374537
DM
3682 cc points to the regmatch_state associated with the most recent CURLYX.
3683 This struct contains info about the innermost (...)* loop (an
3684 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3685
3686 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3687
95b24440 3688 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3689
a0374537 3690 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3691 with the starting point at WHILEM node;
2ab05381
IZ
3692
3693 3) Each hit of WHILEM node tries to match A and Z (in the order
3694 depending on the current iteration, min/max of {min,max} and
3695 greediness). The information about where are nodes for "A"
a0374537 3696 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3697 was already matched, and greediness.
3698
3699 4) After A matches, the same WHILEM node is hit again.
3700
95b24440 3701 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3702 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3703 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3704 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3705 of the external loop.
3706
a0374537 3707 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3708 and whatever it mentions via ->next, and additional attached trees
3709 corresponding to temporarily unset infoblocks as in "5" above.
3710
95b24440 3711 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3712 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3713 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3714 infoblocks are drawn below the "reset" infoblock.
3715
3716 In fact in the picture below we do not show failed matches for Z and T
3717 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3718 more obvious *why* one needs to *temporary* unset infoblocks.]
3719
3720 Matched REx position InfoBlocks Comment
3721 (Y(A)*?Z)*?T x
3722 Y(A)*?Z)*?T x <- O
3723 Y (A)*?Z)*?T x <- O
3724 Y A)*?Z)*?T x <- O <- I
3725 YA )*?Z)*?T x <- O <- I
3726 YA A)*?Z)*?T x <- O <- I
3727 YAA )*?Z)*?T x <- O <- I
3728 YAA Z)*?T x <- O # Temporary unset I
3729 I
3730
3731 YAAZ Y(A)*?Z)*?T x <- O
3732 I
3733
3734 YAAZY (A)*?Z)*?T x <- O
3735 I
3736
3737 YAAZY A)*?Z)*?T x <- O <- I
3738 I
3739
3740 YAAZYA )*?Z)*?T x <- O <- I
3741 I
3742
3743 YAAZYA Z)*?T x <- O # Temporary unset I
3744 I,I
3745
3746 YAAZYAZ )*?T x <- O
3747 I,I
3748
3749 YAAZYAZ T x # Temporary unset O
3750 O
3751 I,I
3752
3753 YAAZYAZT x
3754 O
3755 I,I
3756 *******************************************************************/
95b24440 3757
a0d0e21e 3758 case CURLYX: {
cb434fcc
IZ
3759 /* No need to save/restore up to this paren */
3760 I32 parenfloor = scan->flags;
c277df42 3761
c2b7afd3
NC
3762 /* Dave says:
3763
3764 CURLYX and WHILEM are always paired: they're the moral
3765 equivalent of pp_enteriter anbd pp_iter.
3766
3767 The only time next could be null is if the node tree is
3768 corrupt. This was mentioned on p5p a few days ago.
3769
3770 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3771 So we'll assert that this is true:
3772 */
3773 assert(next);
30b2893d 3774 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3775 next += ARG(next);
cb434fcc
IZ
3776 /* XXXX Probably it is better to teach regpush to support
3777 parenfloor > PL_regsize... */
eb160463 3778 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3779 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3780
d8319b27
DM
3781 st->u.curlyx.cp = PL_savestack_ix;
3782 st->u.curlyx.outercc = st->cc;
a0374537
DM
3783 st->cc = st;
3784 /* these fields contain the state of the current curly.
3785 * they are accessed by subsequent WHILEMs;
3786 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3787 st->u.curlyx.parenfloor = parenfloor;
3788 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3789 st->u.curlyx.min = ARG1(scan);
3790 st->u.curlyx.max = ARG2(scan);
3791 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3792 st->u.curlyx.lastloc = 0;
a0374537
DM
3793 /* st->next and st->minmod are also read by WHILEM */
3794
3280af22 3795 PL_reginput = locinput;
95b24440
DM
3796 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3797 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3798 regcpblow(st->u.curlyx.cp);
3799 st->cc = st->u.curlyx.outercc;
95b24440 3800 saySAME(result);
a0d0e21e 3801 }
5f66b61c 3802 /* NOTREACHED */
a0d0e21e
LW
3803 case WHILEM: {
3804 /*
3805 * This is really hard to understand, because after we match
3806 * what we're trying to match, we must make sure the rest of
2c2d71f5 3807 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3808 * to go back UP the parse tree by recursing ever deeper. And
3809 * if it fails, we have to reset our parent's current state
3810 * that we can try again after backing off.
3811 */
3812
c2b7afd3
NC
3813 /* Dave says:
3814
3815 st->cc gets initialised by CURLYX ready for use by WHILEM.
3816 So again, unless somethings been corrupted, st->cc cannot
3817 be null at that point in WHILEM.
3818
3819 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3820 So we'll assert that this is true:
3821 */
3822 assert(st->cc);
d8319b27
DM
3823 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3824 st->u.whilem.cache_offset = 0;
3825 st->u.whilem.cache_bit = 0;
c277df42 3826
d8319b27 3827 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3828 PL_reginput = locinput;
a0d0e21e 3829
a3621e74 3830 DEBUG_EXECUTE_r(
9041c2e3 3831 PerlIO_printf(Perl_debug_log,
91f3b821 3832 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3833 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3834 (long)n, (long)st->cc->u.curlyx.min,
3835 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3836 );
4633a7c4 3837
a0d0e21e
LW
3838 /* If degenerate scan matches "", assume scan done. */
3839
d8319b27
DM
3840 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3841 st->u.whilem.savecc = st->cc;
3842 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3843 if (st->cc)
d8319b27 3844 st->ln = st->cc->u.curlyx.cur;
a3621e74 3845 DEBUG_EXECUTE_r(
c3464db5
DD
3846 PerlIO_printf(Perl_debug_log,
3847 "%*s empty match detected, try continuation...\n",
3280af22 3848 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3849 );
d8319b27 3850 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3851 /*** all unsaved local vars undefined at this point */
d8319b27 3852 st->cc = st->u.whilem.savecc;
95b24440 3853 if (result)
4633a7c4 3854 sayYES;
d8319b27
DM
3855 if (st->cc->u.curlyx.outercc)
3856 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3857 sayNO;
a0d0e21e
LW
3858 }
3859
3860 /* First just match a string of min scans. */
3861
d8319b27
DM
3862 if (n < st->cc->u.curlyx.min) {
3863 st->cc->u.curlyx.cur = n;
3864 st->cc->u.curlyx.lastloc = locinput;
3865 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3866 /*** all unsaved local vars undefined at this point */
3867 if (result)
4633a7c4 3868 sayYES;
d8319b27
DM
3869 st->cc->u.curlyx.cur = n - 1;
3870 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3871 sayNO;
a0d0e21e
LW
3872 }
3873
2c2d71f5
JH
3874 if (scan->flags) {
3875 /* Check whether we already were at this position.
3876 Postpone detection until we know the match is not
3877 *that* much linear. */
3878 if (!PL_reg_maxiter) {
3879 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3880 /* possible overflow for long strings and many CURLYX's */
3881 if (PL_reg_maxiter < 0)
3882 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3883 PL_reg_leftiter = PL_reg_maxiter;
3884 }
3885 if (PL_reg_leftiter-- == 0) {
a3b680e6 3886 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3887 if (PL_reg_poscache) {
eb160463 3888 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3889 Renew(PL_reg_poscache, size, char);
3890 PL_reg_poscache_size = size;
3891 }
3892 Zero(PL_reg_poscache, size, char);
3893 }
3894 else {
3895 PL_reg_poscache_size = size;
a02a5408 3896 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3897 }
a3621e74 3898 DEBUG_EXECUTE_r(
2c2d71f5
JH
3899 PerlIO_printf(Perl_debug_log,
3900 "%sDetected a super-linear match, switching on caching%s...\n",
3901 PL_colors[4], PL_colors[5])
3902 );
3903 }
3904 if (PL_reg_leftiter < 0) {
d8319b27 3905 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3906
d8319b27
DM
3907 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3908 + st->u.whilem.cache_offset * (scan->flags>>4);
3909 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3910 st->u.whilem.cache_offset /= 8;
3911 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3912 DEBUG_EXECUTE_r(
2c2d71f5
JH
3913 PerlIO_printf(Perl_debug_log,
3914 "%*s already tried at this position...\n",
3915 REPORT_CODE_OFF+PL_regindent*2, "")
3916 );
3ab3c9b4
HS
3917 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3918 /* cache records success */
c2b0868c
HS
3919 sayYES;
3920 else
3ab3c9b4 3921 /* cache records failure */
c2b0868c 3922 sayNO_SILENT;
2c2d71f5 3923 }
2c2d71f5
JH
3924 }
3925 }
3926
a0d0e21e
LW
3927 /* Prefer next over scan for minimal matching. */
3928
5d9a96ca 3929 if (st->cc->minmod) {
d8319b27
DM
3930 st->u.whilem.savecc = st->cc;
3931 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3932 if (st->cc)
d8319b27
DM
3933 st->ln = st->cc->u.curlyx.cur;
3934 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3935 REGCP_SET(st->u.whilem.lastcp);
3936 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3937 /*** all unsaved local vars undefined at this point */
d8319b27 3938 st->cc = st->u.whilem.savecc;
95b24440 3939 if (result) {
d8319b27 3940 regcpblow(st->u.whilem.cp);
3ab3c9b4 3941 CACHEsayYES; /* All done. */
5f05dabc 3942 }
d8319b27 3943 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3944 regcppop(rex);
d8319b27
DM
3945 if (st->cc->u.curlyx.outercc)
3946 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3947
d8319b27 3948 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3949 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3950 && !(PL_reg_flags & RF_warned)) {
3951 PL_reg_flags |= RF_warned;
9014280d 3952 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3953 "Complex regular subexpression recursion",
3954 REG_INFTY - 1);
c277df42 3955 }
3ab3c9b4 3956 CACHEsayNO;
c277df42 3957 }
a687059c 3958
a3621e74 3959 DEBUG_EXECUTE_r(
c3464db5
DD
3960 PerlIO_printf(Perl_debug_log,
3961 "%*s trying longer...\n",
3280af22 3962 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3963 );
a0d0e21e 3964 /* Try scanning more and see if it helps. */
3280af22 3965 PL_reginput = locinput;
d8319b27
DM
3966 st->cc->u.curlyx.cur = n;
3967 st->cc->u.curlyx.lastloc = locinput;
3968 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3969 REGCP_SET(st->u.whilem.lastcp);
3970 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3971 /*** all unsaved local vars undefined at this point */
3972 if (result) {
d8319b27 3973 regcpblow(st->u.whilem.cp);
3ab3c9b4 3974 CACHEsayYES;
5f05dabc 3975 }
d8319b27 3976 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3977 regcppop(rex);
d8319b27
DM
3978 st->cc->u.curlyx.cur = n - 1;
3979 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3980 CACHEsayNO;
a0d0e21e
LW
3981 }
3982
3983 /* Prefer scan over next for maximal matching. */
3984
d8319b27
DM
3985 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3986 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3987 st->cc->u.curlyx.cur = n;
3988 st->cc->u.curlyx.lastloc = locinput;
3989 REGCP_SET(st->u.whilem.lastcp);
3990 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3991 /*** all unsaved local vars undefined at this point */
3992 if (result) {
d8319b27 3993 regcpblow(st->u.whilem.cp);
3ab3c9b4 3994 CACHEsayYES;
5f05dabc 3995 }
d8319b27 3996 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3997 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3998 PL_reginput = locinput;
a3621e74 3999 DEBUG_EXECUTE_r(
c3464db5
DD
4000 PerlIO_printf(Perl_debug_log,
4001 "%*s failed, try continuation...\n",
3280af22 4002 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
4003 );
4004 }
9041c2e3 4005 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 4006 && !(PL_reg_flags & RF_warned)) {
3280af22 4007 PL_reg_flags |= RF_warned;
9014280d 4008 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
4009 "Complex regular subexpression recursion",
4010 REG_INFTY - 1);
a0d0e21e
LW
4011 }
4012
4013 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
4014 st->u.whilem.savecc = st->cc;
4015 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 4016 if (st->cc)
d8319b27
DM
4017 st->ln = st->cc->u.curlyx.cur;
4018 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 4019 /*** all unsaved local vars undefined at this point */
d8319b27 4020 st->cc = st->u.whilem.savecc;
95b24440 4021 if (result)
3ab3c9b4 4022 CACHEsayYES;
d8319b27
DM
4023 if (st->cc->u.curlyx.outercc)
4024 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4025 st->cc->u.curlyx.cur = n - 1;
4026 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4027 CACHEsayNO;
a0d0e21e 4028 }
5f66b61c 4029 /* NOTREACHED */
9041c2e3 4030 case BRANCHJ:
c277df42
IZ
4031 next = scan + ARG(scan);
4032 if (next == scan)
4033 next = NULL;
4034 inner = NEXTOPER(NEXTOPER(scan));
4035 goto do_branch;
9041c2e3 4036 case BRANCH:
c277df42
IZ
4037 inner = NEXTOPER(scan);
4038 do_branch:
4039 {
be8e71aa 4040 const I32 type = OP(scan);
ae5031b3 4041 if (!next || OP(next) != type) /* No choice. */
c277df42 4042 next = inner; /* Avoid recursion. */
a0d0e21e 4043 else {
a3b680e6 4044 const I32 lastparen = *PL_reglastparen;
02db2b7b 4045 /* Put unwinding data on stack */
6136c704
AL
4046 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
4047 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
4048
5d9a96ca
DM
4049 uw->prev = st->unwind;
4050 st->unwind = unwind1;
e822a8b4 4051 uw->type = ((type == BRANCH)
02db2b7b
IZ
4052 ? RE_UNWIND_BRANCH
4053 : RE_UNWIND_BRANCHJ);
4054 uw->lastparen = lastparen;
4055 uw->next = next;
4056 uw->locinput = locinput;
4057 uw->nextchr = nextchr;
3a2830be 4058 uw->minmod = st->minmod;
02db2b7b
IZ
4059#ifdef DEBUGGING
4060 uw->regindent = ++PL_regindent;
4061#endif
c277df42 4062
02db2b7b
IZ
4063 REGCP_SET(uw->lastcp);
4064
4065 /* Now go into the first branch */
4066 next = inner;
a687059c 4067 }
a0d0e21e
LW
4068 }
4069 break;
4070 case MINMOD:
5d9a96ca 4071 st->minmod = 1;
a0d0e21e 4072 break;
c277df42
IZ
4073 case CURLYM:
4074 {
d8319b27 4075 st->u.curlym.l = st->u.curlym.matches = 0;
9041c2e3 4076
c277df42 4077 /* We suppose that the next guy does not need
0e788c72 4078 backtracking: in particular, it is of constant non-zero length,
c277df42 4079 and has no parenths to influence future backrefs. */
5d9a96ca 4080 st->ln = ARG1(scan); /* min to match */
c277df42 4081 n = ARG2(scan); /* max to match */
d8319b27
DM
4082 st->u.curlym.paren = scan->flags;
4083 if (st->u.curlym.paren) {
4084 if (st->u.curlym.paren > PL_regsize)
4085 PL_regsize = st->u.curlym.paren;
4086 if (st->u.curlym.paren > (I32)*PL_reglastparen)
4087 *PL_reglastparen = st->u.curlym.paren;
c277df42 4088 }
dc45a647 4089 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
d8319b27 4090 if (st->u.curlym.paren)
c277df42 4091 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 4092 PL_reginput = locinput;
d8319b27 4093 st->u.curlym.maxwanted = st->minmod ? st->ln : n;
0cadcf80
DM
4094 while (PL_reginput < PL_regeol && st->u.curlym.matches < st->u.curlym.maxwanted) {
4095 /* resume to current state on success */
4096 st->u.yes.prev_yes_state = yes_state;
4097 yes_state = st;
4098 REGMATCH(scan, CURLYM1);
4099 yes_state = st->u.yes.prev_yes_state;
4100 /*** all unsaved local vars undefined at this point */
4101 if (!result)
4102 break;
4103 /* on first match, determine length, u.curlym.l */
4104 if (!st->u.curlym.matches++) {
4105 if (PL_reg_match_utf8) {
4106 char *s = locinput;
4107 while (s < PL_reginput) {
4108 st->u.curlym.l++;
4109 s += UTF8SKIP(s);
6407bf3b
DM
4110 }
4111 }
0cadcf80
DM
4112 else {
4113 st->u.curlym.l = PL_reginput - locinput;
4114 }
4115 if (st->u.curlym.l == 0) {
4116 st->u.curlym.matches = st->u.curlym.maxwanted;
4117 break;
4118 }
6407bf3b 4119 }
0cadcf80 4120 locinput = PL_reginput;
6407bf3b
DM
4121 }
4122
4123 PL_reginput = locinput;
0cadcf80 4124 if (st->u.curlym.matches < st->ln) {
5d9a96ca 4125 st->minmod = 0;
0cadcf80
DM
4126 sayNO;
4127 }
5f80c4cf 4128
0cadcf80
DM
4129 DEBUG_EXECUTE_r(
4130 PerlIO_printf(Perl_debug_log,
4131 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
4132 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
4133 (IV) st->u.curlym.matches, (IV)st->u.curlym.l)
4134 );
4135
4136 /* calculate c1 and c1 for possible match of 1st char
4137 * following curly */
9e137952 4138 st->u.curlym.c1 = st->u.curlym.c2 = CHRTEST_VOID;
0cadcf80
DM
4139 if (HAS_TEXT(next) || JUMPABLE(next)) {
4140 regnode *text_node = next;
3dab1dad
YO
4141 if (! HAS_TEXT(text_node))
4142 FIND_NEXT_IMPT(text_node);
0cadcf80 4143 if (HAS_TEXT(text_node)
3dab1dad 4144 && PL_regkind[OP(text_node)] != REF)
0cadcf80
DM
4145 {
4146 st->u.curlym.c1 = (U8)*STRING(text_node);
4147 st->u.curlym.c2 =
4148 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4149 ? PL_fold[st->u.curlym.c1]
4150 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4151 ? PL_fold_locale[st->u.curlym.c1]
4152 : st->u.curlym.c1;
4153 }
4154 }
5f80c4cf 4155
0cadcf80
DM
4156 REGCP_SET(st->u.curlym.lastcp);
4157
4158 st->u.curlym.minmod = st->minmod;
4159 st->minmod = 0;
4160 while (st->u.curlym.matches >= st->ln
4161 && (st->u.curlym.matches <= n
4162 /* for REG_INFTY, ln could overflow to negative */
4163 || (n == REG_INFTY && st->u.curlym.matches >= 0)))
4164 {
4165 /* If it could work, try it. */
9e137952 4166 if (st->u.curlym.c1 == CHRTEST_VOID ||
0cadcf80
DM
4167 UCHARAT(PL_reginput) == st->u.curlym.c1 ||
4168 UCHARAT(PL_reginput) == st->u.curlym.c2)
4169 {
4170 DEBUG_EXECUTE_r(
4171 PerlIO_printf(Perl_debug_log,
4172 "%*s trying tail with matches=%"IVdf"...\n",
4173 (int)(REPORT_CODE_OFF+PL_regindent*2),
4174 "", (IV)st->u.curlym.matches)
4175 );
4176 if (st->u.curlym.paren) {
4177 if (st->u.curlym.matches) {
4178 PL_regstartp[st->u.curlym.paren]
4179 = HOPc(PL_reginput, -st->u.curlym.l) - PL_bostr;
4180 PL_regendp[st->u.curlym.paren] = PL_reginput - PL_bostr;
cca55fe3 4181 }
5f80c4cf 4182 else
0cadcf80 4183 PL_regendp[st->u.curlym.paren] = -1;
5f80c4cf 4184 }
0cadcf80
DM
4185 /* resume to current state on success */
4186 st->u.yes.prev_yes_state = yes_state;
4187 yes_state = st;
4188 REGMATCH(next, CURLYM2);
4189 yes_state = st->u.yes.prev_yes_state;
4190 /*** all unsaved local vars undefined at this point */
4191 if (result)
4192 /* XXX tmp sayYES; */
4193 sayYES_FINAL;
4194 REGCP_UNWIND(st->u.curlym.lastcp);
a0ed51b3 4195 }
0cadcf80
DM
4196 /* Couldn't or didn't -- move forward/backward. */
4197 if (st->u.curlym.minmod) {
3280af22 4198 PL_reginput = locinput;
dad79028
DM
4199 /* resume to current state on success */
4200 st->u.yes.prev_yes_state = yes_state;
4201 yes_state = st;
95b24440 4202 REGMATCH(scan, CURLYM3);
dad79028 4203 yes_state = st->u.yes.prev_yes_state;
95b24440
DM
4204 /*** all unsaved local vars undefined at this point */
4205 if (result) {
0cadcf80 4206 st->u.curlym.matches++;
3280af22 4207 locinput = PL_reginput;
c277df42
IZ
4208 }
4209 else
4210 sayNO;
4211 }
0cadcf80 4212 else {
d8319b27
DM
4213 st->u.curlym.matches--;
4214 locinput = HOPc(locinput, -st->u.curlym.l);
3280af22 4215 PL_reginput = locinput;
c277df42
IZ
4216 }
4217 }
4218 sayNO;
5f66b61c 4219 /* NOTREACHED */
c277df42
IZ
4220 break;
4221 }
4222 case CURLYN:
d8319b27
DM
4223 st->u.plus.paren = scan->flags; /* Which paren to set */
4224 if (st->u.plus.paren > PL_regsize)
4225 PL_regsize = st->u.plus.paren;
4226 if (st->u.plus.paren > (I32)*PL_reglastparen)
4227 *PL_reglastparen = st->u.plus.paren;
5d9a96ca 4228 st->ln = ARG1(scan); /* min to match */
c277df42 4229 n = ARG2(scan); /* max to match */
dc45a647 4230 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 4231 goto repeat;
a0d0e21e 4232 case CURLY:
d8319b27 4233 st->u.plus.paren = 0;
5d9a96ca 4234 st->ln = ARG1(scan); /* min to match */
a0d0e21e 4235 n = ARG2(scan); /* max to match */
dc45a647 4236 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
4237 goto repeat;
4238 case STAR:
5d9a96ca 4239 st->ln = 0;
c277df42 4240 n = REG_INFTY;
a0d0e21e 4241 scan = NEXTOPER(scan);
d8319b27 4242 st->u.plus.paren = 0;
a0d0e21e
LW
4243 goto repeat;
4244 case PLUS:
5d9a96ca 4245 st->ln = 1;
c277df42
IZ
4246 n = REG_INFTY;
4247 scan = NEXTOPER(scan);
d8319b27 4248 st->u.plus.paren = 0;
c277df42 4249 repeat:
a0d0e21e
LW
4250 /*
4251 * Lookahead to avoid useless match attempts
4252 * when we know what character comes next.
4253 */
5f80c4cf
JP
4254
4255 /*
4256 * Used to only do .*x and .*?x, but now it allows
4257 * for )'s, ('s and (?{ ... })'s to be in the way
4258 * of the quantifier and the EXACT-like node. -- japhy
4259 */
4260
cca55fe3 4261 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4262 U8 *s;
4263 regnode *text_node = next;
4264
3dab1dad
YO
4265 if (! HAS_TEXT(text_node))
4266 FIND_NEXT_IMPT(text_node);
5f80c4cf 4267
9e137952
DM
4268 if (! HAS_TEXT(text_node))
4269 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
5f80c4cf 4270 else {
3dab1dad 4271 if (PL_regkind[OP(text_node)] == REF) {
9e137952 4272 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
44a68960 4273 goto assume_ok_easy;
cca55fe3 4274 }
be8e71aa
YO
4275 else
4276 s = (U8*)STRING(text_node);
5f80c4cf
JP
4277
4278 if (!UTF) {
d8319b27 4279 st->u.plus.c2 = st->u.plus.c1 = *s;
f65d3ee7 4280 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 4281 st->u.plus.c2 = PL_fold[st->u.plus.c1];
f65d3ee7 4282 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 4283 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
1aa99e6b 4284 }
5f80c4cf 4285 else { /* UTF */
f65d3ee7 4286 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4287 STRLEN ulen1, ulen2;
89ebb4a3
JH
4288 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4289 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4290
4291 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4292 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4293
d8319b27 4294 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4295 uniflags);
d8319b27 4296 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4297 uniflags);
5f80c4cf
JP
4298 }
4299 else {
d8319b27 4300 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4301 uniflags);
5f80c4cf 4302 }
1aa99e6b
IH
4303 }
4304 }
bbce6d69 4305 }
a0d0e21e 4306 else
9e137952 4307 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
cca55fe3 4308 assume_ok_easy:
3280af22 4309 PL_reginput = locinput;
5d9a96ca
DM
4310 if (st->minmod) {
4311 st->minmod = 0;
32fc9b6a 4312 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4633a7c4 4313 sayNO;
a0ed51b3 4314 locinput = PL_reginput;
d8319b27 4315 REGCP_SET(st->u.plus.lastcp);
9e137952 4316 if (st->u.plus.c1 != CHRTEST_VOID) {
d8319b27
DM
4317 st->u.plus.old = locinput;
4318 st->u.plus.count = 0;
0fe9bf95 4319
1aa99e6b 4320 if (n == REG_INFTY) {
d8319b27 4321 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4322 if (do_utf8)
d8319b27
DM
4323 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4324 st->u.plus.e--;
1aa99e6b
IH
4325 }
4326 else if (do_utf8) {
5d9a96ca 4327 int m = n - st->ln;
d8319b27
DM
4328 for (st->u.plus.e = locinput;
4329 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4330 st->u.plus.e += UTF8SKIP(st->u.plus.e);
1aa99e6b
IH
4331 }
4332 else {
d8319b27
DM
4333 st->u.plus.e = locinput + n - st->ln;
4334 if (st->u.plus.e >= PL_regeol)
4335 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4336 }
0fe9bf95
IZ
4337 while (1) {
4338 /* Find place 'next' could work */
1aa99e6b 4339 if (!do_utf8) {
d8319b27
DM
4340 if (st->u.plus.c1 == st->u.plus.c2) {
4341 while (locinput <= st->u.plus.e &&
4342 UCHARAT(locinput) != st->u.plus.c1)
1aa99e6b
IH
4343 locinput++;
4344 } else {
d8319b27
DM
4345 while (locinput <= st->u.plus.e
4346 && UCHARAT(locinput) != st->u.plus.c1
4347 && UCHARAT(locinput) != st->u.plus.c2)
1aa99e6b
IH
4348 locinput++;
4349 }
d8319b27 4350 st->u.plus.count = locinput - st->u.plus.old;
1aa99e6b
IH
4351 }
4352 else {
d8319b27 4353 if (st->u.plus.c1 == st->u.plus.c2) {
a3b680e6 4354 STRLEN len;
872c91ae
JH
4355 /* count initialised to
4356 * utf8_distance(old, locinput) */
d8319b27 4357 while (locinput <= st->u.plus.e &&
872c91ae 4358 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4359 UTF8_MAXBYTES, &len,
d8319b27 4360 uniflags) != (UV)st->u.plus.c1) {
1aa99e6b 4361 locinput += len;
d8319b27 4362 st->u.plus.count++;
b2f2f093 4363 }
1aa99e6b 4364 } else {
872c91ae
JH
4365 /* count initialised to
4366 * utf8_distance(old, locinput) */
d8319b27 4367 while (locinput <= st->u.plus.e) {
c4fd8992
AL
4368 STRLEN len;
4369 const UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4370 UTF8_MAXBYTES, &len,
041457d9 4371 uniflags);
d8319b27 4372 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
1aa99e6b 4373 break;
b2f2f093 4374 locinput += len;
d8319b27 4375 st->u.plus.count++;
1aa99e6b
IH
4376 }
4377 }
0fe9bf95 4378 }
d8319b27 4379 if (locinput > st->u.plus.e)
0fe9bf95
IZ
4380 sayNO;
4381 /* PL_reginput == old now */
d8319b27 4382 if (locinput != st->u.plus.old) {
5d9a96ca 4383 st->ln = 1; /* Did some */
32fc9b6a 4384 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
0fe9bf95
IZ
4385 sayNO;
4386 }
4387 /* PL_reginput == locinput now */
d8319b27 4388 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
95b24440 4389 /*** all unsaved local vars undefined at this point */
0fe9bf95 4390 PL_reginput = locinput; /* Could be reset... */
d8319b27 4391 REGCP_UNWIND(st->u.plus.lastcp);
0fe9bf95 4392 /* Couldn't or didn't -- move forward. */
d8319b27 4393 st->u.plus.old = locinput;
1aa99e6b
IH
4394 if (do_utf8)
4395 locinput += UTF8SKIP(locinput);
4396 else
4397 locinput++;
d8319b27 4398 st->u.plus.count = 1;
0fe9bf95
IZ
4399 }
4400 }
4401 else
5d9a96ca 4402 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
1aa99e6b 4403 UV c;
9e137952 4404 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4405 if (do_utf8)
872c91ae 4406 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4407 UTF8_MAXBYTES, 0,
041457d9 4408 uniflags);
1aa99e6b 4409 else
9041c2e3 4410 c = UCHARAT(PL_reginput);
2390ecbc 4411 /* If it could work, try it. */
be8e71aa 4412 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
d8319b27 4413 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
95b24440 4414 /*** all unsaved local vars undefined at this point */
d8319b27 4415 REGCP_UNWIND(st->u.plus.lastcp);
2390ecbc 4416 }
1aa99e6b 4417 }
a0d0e21e 4418 /* If it could work, try it. */
be8e71aa 4419 else if (st->u.plus.c1 == CHRTEST_VOID) {
d8319b27 4420 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
95b24440 4421 /*** all unsaved local vars undefined at this point */
d8319b27 4422 REGCP_UNWIND(st->u.plus.lastcp);
bbce6d69 4423 }
c277df42 4424 /* Couldn't or didn't -- move forward. */
a0ed51b3 4425 PL_reginput = locinput;
32fc9b6a 4426 if (regrepeat(rex, scan, 1)) {
5d9a96ca 4427 st->ln++;
a0ed51b3
LW
4428 locinput = PL_reginput;
4429 }
4430 else
4633a7c4 4431 sayNO;
a0d0e21e
LW
4432 }
4433 }
4434 else {
32fc9b6a 4435 n = regrepeat(rex, scan, n);
a0ed51b3 4436 locinput = PL_reginput;
3dab1dad
YO
4437 if ((st->ln < n) && (PL_regkind[OP(next)] == EOL) &&
4438 (OP(next) != MEOL || OP(next) == SEOL || OP(next) == EOS))
15272685 4439 {
5d9a96ca 4440 st->ln = n; /* why back off? */
1aeab75a
GS
4441 /* ...because $ and \Z can match before *and* after
4442 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4443 We should back off by one in this case. */
4444 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
5d9a96ca 4445 st->ln--;
1aeab75a 4446 }
d8319b27 4447 REGCP_SET(st->u.plus.lastcp);
1d5c262f 4448 {
5d9a96ca 4449 while (n >= st->ln) {
be8e71aa 4450 UV c = 0;
9e137952 4451 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4452 if (do_utf8)
872c91ae 4453 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4454 UTF8_MAXBYTES, 0,
041457d9 4455 uniflags);
1aa99e6b 4456 else
9041c2e3 4457 c = UCHARAT(PL_reginput);
1aa99e6b 4458 }
c277df42 4459 /* If it could work, try it. */
be8e71aa
YO
4460 if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
4461 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
4462 /*** all unsaved local vars undefined at this point */
4463 REGCP_UNWIND(st->u.plus.lastcp);
4464 }
c277df42
IZ
4465 /* Couldn't or didn't -- back up. */
4466 n--;
dfe13c55 4467 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4468 }
a0d0e21e
LW
4469 }
4470 }
4633a7c4 4471 sayNO;
c277df42 4472 break;
a0d0e21e 4473 case END:
3b0527fe 4474 if (locinput < reginfo->till) {
a3621e74 4475 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4476 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4477 PL_colors[4],
4478 (long)(locinput - PL_reg_starttry),
3b0527fe 4479 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4480 PL_colors[5]));
4481 sayNO_FINAL; /* Cannot match: too short. */
4482 }
4483 PL_reginput = locinput; /* put where regtry can find it */
4484 sayYES_FINAL; /* Success! */
dad79028
DM
4485
4486 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4487 DEBUG_EXECUTE_r(
4488 PerlIO_printf(Perl_debug_log,
4489 "%*s %ssubpattern success...%s\n",
4490 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4491 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4492 sayYES_FINAL; /* Success! */
4493
4494 case SUSPEND: /* (?>FOO) */
4495 st->u.ifmatch.wanted = 1;
9fe1d20c 4496 PL_reginput = locinput;
9041c2e3 4497 goto do_ifmatch;
dad79028
DM
4498
4499 case UNLESSM: /* -ve lookaround: (?!FOO), or with flags, (?<!foo) */
4500 st->u.ifmatch.wanted = 0;
4501 goto ifmatch_trivial_fail_test;
4502
4503 case IFMATCH: /* +ve lookaround: (?=FOO), or with flags, (?<=foo) */
4504 st->u.ifmatch.wanted = 1;
4505 ifmatch_trivial_fail_test:
a0ed51b3 4506 if (scan->flags) {
52657f30 4507 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4508 if (!s) {
4509 /* trivial fail */
4510 if (st->logical) {
4511 st->logical = 0;
3dab1dad 4512 st->sw = 1 - (bool)st->u.ifmatch.wanted;
dad79028
DM
4513 }
4514 else if (st->u.ifmatch.wanted)
4515 sayNO;
4516 next = scan + ARG(scan);
4517 if (next == scan)
4518 next = NULL;
4519 break;
4520 }
efb30f32 4521 PL_reginput = s;
a0ed51b3
LW
4522 }
4523 else
4524 PL_reginput = locinput;
4525
c277df42 4526 do_ifmatch:
dad79028
DM
4527 /* resume to current state on success */
4528 st->u.yes.prev_yes_state = yes_state;
4529 yes_state = st;
4530 PUSH_STATE(newst, resume_IFMATCH);
4531 st = newst;
4532 next = NEXTOPER(NEXTOPER(scan));
4533 break;
4534
c277df42 4535 case LONGJMP:
c277df42
IZ
4536 next = scan + ARG(scan);
4537 if (next == scan)
4538 next = NULL;
a0d0e21e
LW
4539 break;
4540 default:
b900a521 4541 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4542 PTR2UV(scan), OP(scan));
cea2e8a9 4543 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4544 }
95b24440 4545
02db2b7b 4546 reenter:
a0d0e21e 4547 scan = next;
95b24440
DM
4548 continue;
4549 /* NOTREACHED */
4550
4551 /* simulate recursively calling regmatch(), but without actually
4552 * recursing - ie save the current state on the heap rather than on
4553 * the stack, then re-enter the loop. This avoids complex regexes
4554 * blowing the processor stack */
4555
4556 start_recurse:
4557 {
5d9a96ca
DM
4558 /* push new state */
4559 regmatch_state *oldst = st;
4560
4561 depth++;
4562
4563 /* grab the next free state slot */
4564 st++;
86545054 4565 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4566 st = S_push_slab(aTHX);
4567 PL_regmatch_state = st;
4568
4569 oldst->next = next;
4570 oldst->n = n;
4571 oldst->locinput = locinput;
5d9a96ca
DM
4572
4573 st->cc = oldst->cc;
95b24440
DM
4574 locinput = PL_reginput;
4575 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4576 st->minmod = 0;
4577 st->sw = 0;
4578 st->logical = 0;
4579 st->unwind = 0;
95b24440
DM
4580#ifdef DEBUGGING
4581 PL_regindent++;
4582#endif
4583 }
a0d0e21e 4584 }
a687059c 4585
aa283a38
DM
4586
4587
a0d0e21e
LW
4588 /*
4589 * We get here only if there's trouble -- normally "case END" is
4590 * the terminating point.
4591 */
cea2e8a9 4592 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4593 /*NOTREACHED*/
4633a7c4
LW
4594 sayNO;
4595
7821416a 4596yes_final:
77cb431f
DM
4597
4598 if (yes_state) {
4599 /* we have successfully completed a subexpression, but we must now
4600 * pop to the state marked by yes_state and continue from there */
4601
dad79028 4602 /*XXX tmp for CURLYM*/
c4fd8992
AL
4603 regmatch_slab * const oslab = PL_regmatch_slab;
4604 regmatch_state * const ost = st;
4605 regmatch_state * const oys = yes_state;
dad79028
DM
4606 int odepth = depth;
4607
77cb431f
DM
4608 assert(st != yes_state);
4609 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4610 || yes_state > SLAB_LAST(PL_regmatch_slab))
4611 {
4612 /* not in this slab, pop slab */
4613 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4614 PL_regmatch_slab = PL_regmatch_slab->prev;
4615 st = SLAB_LAST(PL_regmatch_slab);
4616 }
4617 depth -= (st - yes_state);
dad79028 4618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE TO (%d)\n", depth));
77cb431f
DM
4619 st = yes_state;
4620 yes_state = st->u.yes.prev_yes_state;
4621 PL_regmatch_state = st;
4622
4623 switch (st->resume_state) {
4624 case resume_EVAL:
4625 if (st->u.eval.toggleutf)
4626 PL_reg_flags ^= RF_utf8;
4627 ReREFCNT_dec(rex);
4628 rex = st->u.eval.prev_rex;
4629 /* XXXX This is too dramatic a measure... */
4630 PL_reg_maxiter = 0;
4631 /* Restore parens of the caller without popping the
4632 * savestack */
4633 {
c4fd8992 4634 const I32 tmp = PL_savestack_ix;
77cb431f
DM
4635 PL_savestack_ix = st->u.eval.lastcp;
4636 regcppop(rex);
4637 PL_savestack_ix = tmp;
4638 }
4639 PL_reginput = locinput;
4640 /* continue at the node following the (??{...}) */
4641 next = st->next;
4642 goto reenter;
4643
dad79028
DM
4644 case resume_IFMATCH:
4645 if (st->logical) {
4646 st->logical = 0;
3dab1dad 4647 st->sw = (bool)st->u.ifmatch.wanted;
dad79028
DM
4648 }
4649 else if (!st->u.ifmatch.wanted)
4650 sayNO;
4651
4652 if (OP(st->scan) == SUSPEND)
4653 locinput = PL_reginput;
4654 else {
4655 locinput = PL_reginput = st->locinput;
4656 nextchr = UCHARAT(locinput);
4657 }
4658 next = st->scan + ARG(st->scan);
4659 if (next == st->scan)
4660 next = NULL;
4661 goto reenter;
4662
4663 /* XXX tmp don't handle yes_state yet */
4664 case resume_CURLYM1:
4665 case resume_CURLYM2:
4666 case resume_CURLYM3:
dad79028
DM
4667 PL_regmatch_slab =oslab;
4668 st = ost;
4669 PL_regmatch_state = st;
4670 depth = odepth;
4671 yes_state = oys;
4672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "XXX revering a CURLYM\n"));
4673 goto yes;
4674
77cb431f
DM
4675 default:
4676 Perl_croak(aTHX_ "unexpected yes reume state");
4677 }
4678 }
4679
a3621e74 4680 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4681 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4682yes:
4683#ifdef DEBUGGING
3280af22 4684 PL_regindent--;
4633a7c4 4685#endif
02db2b7b 4686
95b24440 4687 result = 1;
aa283a38 4688 /* XXX this is duplicate(ish) code to that in the do_no section.
77cb431f
DM
4689 * eventually a yes should just pop the stack back to the current
4690 * yes_state */
aa283a38
DM
4691 if (depth) {
4692 /* restore previous state and re-enter */
4693 POP_STATE;
4694
4695 switch (st->resume_state) {
4696 case resume_TRIE1:
4697 goto resume_point_TRIE1;
4698 case resume_TRIE2:
4699 goto resume_point_TRIE2;
aa283a38
DM
4700 case resume_CURLYX:
4701 goto resume_point_CURLYX;
4702 case resume_WHILEM1:
4703 goto resume_point_WHILEM1;
4704 case resume_WHILEM2:
4705 goto resume_point_WHILEM2;
4706 case resume_WHILEM3:
4707 goto resume_point_WHILEM3;
4708 case resume_WHILEM4:
4709 goto resume_point_WHILEM4;
4710 case resume_WHILEM5:
4711 goto resume_point_WHILEM5;
4712 case resume_WHILEM6:
4713 goto resume_point_WHILEM6;
4714 case resume_CURLYM1:
4715 goto resume_point_CURLYM1;
4716 case resume_CURLYM2:
4717 goto resume_point_CURLYM2;
4718 case resume_CURLYM3:
4719 goto resume_point_CURLYM3;
aa283a38
DM
4720 case resume_PLUS1:
4721 goto resume_point_PLUS1;
4722 case resume_PLUS2:
4723 goto resume_point_PLUS2;
4724 case resume_PLUS3:
4725 goto resume_point_PLUS3;
4726 case resume_PLUS4:
4727 goto resume_point_PLUS4;
77cb431f 4728
dad79028 4729 case resume_IFMATCH:
77cb431f 4730 case resume_EVAL:
aa283a38
DM
4731 default:
4732 Perl_croak(aTHX_ "regexp resume memory corruption");
4733 }
4734 }
4735 goto final_exit;
4633a7c4
LW
4736
4737no:
a3621e74 4738 DEBUG_EXECUTE_r(
7821416a
IZ
4739 PerlIO_printf(Perl_debug_log,
4740 "%*s %sfailed...%s\n",
e4584336 4741 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4742 );
4743 goto do_no;
4744no_final:
4745do_no:
5d9a96ca
DM
4746 if (st->unwind) {
4747 re_unwind_t * const uw = SSPTRt(st->unwind,re_unwind_t);
02db2b7b
IZ
4748
4749 switch (uw->type) {
4750 case RE_UNWIND_BRANCH:
4751 case RE_UNWIND_BRANCHJ:
4752 {
6136c704 4753 re_unwind_branch_t * const uwb = &(uw->branch);
a3b680e6 4754 const I32 lastparen = uwb->lastparen;
9041c2e3 4755
02db2b7b
IZ
4756 REGCP_UNWIND(uwb->lastcp);
4757 for (n = *PL_reglastparen; n > lastparen; n--)
4758 PL_regendp[n] = -1;
4759 *PL_reglastparen = n;
4760 scan = next = uwb->next;
3a2830be 4761 st->minmod = uwb->minmod;
9041c2e3
NIS
4762 if ( !scan ||
4763 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b 4764 ? BRANCH : BRANCHJ) ) { /* Failure */
5d9a96ca 4765 st->unwind = uwb->prev;
02db2b7b
IZ
4766#ifdef DEBUGGING
4767 PL_regindent--;
4768#endif
4769 goto do_no;
4770 }
4771 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4772 if ((n = (uwb->type == RE_UNWIND_BRANCH
4773 ? NEXT_OFF(next) : ARG(next))))
4774 next += n;
4775 else
4776 next = NULL; /* XXXX Needn't unwinding in this case... */
4777 uwb->next = next;
4778 next = NEXTOPER(scan);
4779 if (uwb->type == RE_UNWIND_BRANCHJ)
4780 next = NEXTOPER(next);
4781 locinput = uwb->locinput;
4782 nextchr = uwb->nextchr;
4783#ifdef DEBUGGING
4784 PL_regindent = uwb->regindent;
4785#endif
4786
4787 goto reenter;
4788 }
5f66b61c 4789 /* NOTREACHED */
02db2b7b
IZ
4790 default:
4791 Perl_croak(aTHX_ "regexp unwind memory corruption");
4792 }
5f66b61c 4793 /* NOTREACHED */
02db2b7b 4794 }
aa283a38 4795
4633a7c4 4796#ifdef DEBUGGING
3280af22 4797 PL_regindent--;
4633a7c4 4798#endif
95b24440 4799 result = 0;
5d9a96ca 4800
aa283a38
DM
4801 if (depth) {
4802 /* there's a previous state to backtrack to */
4803 POP_STATE;
5d9a96ca 4804 switch (st->resume_state) {
95b24440
DM
4805 case resume_TRIE1:
4806 goto resume_point_TRIE1;
4807 case resume_TRIE2:
4808 goto resume_point_TRIE2;
aa283a38
DM
4809 case resume_EVAL:
4810 /* we have failed an (??{...}). Restore state to the outer re
4811 * then re-throw the failure */
4812 if (st->u.eval.toggleutf)
4813 PL_reg_flags ^= RF_utf8;
4814 ReREFCNT_dec(rex);
4815 rex = st->u.eval.prev_rex;
77cb431f 4816 yes_state = st->u.yes.prev_yes_state;
aa283a38
DM
4817
4818 /* XXXX This is too dramatic a measure... */
4819 PL_reg_maxiter = 0;
4820
4821 PL_reginput = locinput;
4822 REGCP_UNWIND(st->u.eval.lastcp);
4823 regcppop(rex);
4824 goto do_no;
4825
95b24440
DM
4826 case resume_CURLYX:
4827 goto resume_point_CURLYX;
4828 case resume_WHILEM1:
4829 goto resume_point_WHILEM1;
4830 case resume_WHILEM2:
4831 goto resume_point_WHILEM2;
4832 case resume_WHILEM3:
4833 goto resume_point_WHILEM3;
4834 case resume_WHILEM4:
4835 goto resume_point_WHILEM4;
4836 case resume_WHILEM5:
4837 goto resume_point_WHILEM5;
4838 case resume_WHILEM6:
4839 goto resume_point_WHILEM6;
4840 case resume_CURLYM1:
4841 goto resume_point_CURLYM1;
4842 case resume_CURLYM2:
4843 goto resume_point_CURLYM2;
4844 case resume_CURLYM3:
4845 goto resume_point_CURLYM3;
95b24440 4846 case resume_IFMATCH:
dad79028
DM
4847 yes_state = st->u.yes.prev_yes_state;
4848 if (st->logical) {
4849 st->logical = 0;
4850 st->sw = !st->u.ifmatch.wanted;
4851 }
4852 else if (st->u.ifmatch.wanted)
4853 sayNO;
4854
4855 assert(OP(scan) != SUSPEND); /* XXX DAPM tmp */
4856 locinput = PL_reginput = st->locinput;
4857 nextchr = UCHARAT(locinput);
4858 next = scan + ARG(scan);
4859 if (next == scan)
4860 next = NULL;
4861 goto reenter;
4862
95b24440
DM
4863 case resume_PLUS1:
4864 goto resume_point_PLUS1;
4865 case resume_PLUS2:
4866 goto resume_point_PLUS2;
4867 case resume_PLUS3:
4868 goto resume_point_PLUS3;
4869 case resume_PLUS4:
4870 goto resume_point_PLUS4;
95b24440
DM
4871 default:
4872 Perl_croak(aTHX_ "regexp resume memory corruption");
4873 }
95b24440 4874 }
aa283a38
DM
4875
4876final_exit:
4877
5d9a96ca
DM
4878 /* restore original high-water mark */
4879 PL_regmatch_slab = orig_slab;
4880 PL_regmatch_state = orig_state;
4881
4882 /* free all slabs above current one */
4883 if (orig_slab->next) {
c4fd8992 4884 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4885 orig_slab->next = NULL;
4886 while (sl) {
c4fd8992 4887 regmatch_slab * const osl = sl;
5d9a96ca 4888 sl = sl->next;
ad65c075 4889 Safefree(osl);
5d9a96ca
DM
4890 }
4891 }
4892
95b24440
DM
4893 return result;
4894
a687059c
LW
4895}
4896
4897/*
4898 - regrepeat - repeatedly match something simple, report how many
4899 */
4900/*
4901 * [This routine now assumes that it will only match on things of length 1.
4902 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4903 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4904 */
76e3520e 4905STATIC I32
32fc9b6a 4906S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4907{
27da23d5 4908 dVAR;
a0d0e21e 4909 register char *scan;
a0d0e21e 4910 register I32 c;
3280af22 4911 register char *loceol = PL_regeol;
a0ed51b3 4912 register I32 hardcount = 0;
53c4c00c 4913 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4914
3280af22 4915 scan = PL_reginput;
faf11cac
HS
4916 if (max == REG_INFTY)
4917 max = I32_MAX;
4918 else if (max < loceol - scan)
7f596f4c 4919 loceol = scan + max;
a0d0e21e 4920 switch (OP(p)) {
22c35a8c 4921 case REG_ANY:
1aa99e6b 4922 if (do_utf8) {
ffc61ed2 4923 loceol = PL_regeol;
1aa99e6b 4924 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4925 scan += UTF8SKIP(scan);
4926 hardcount++;
4927 }
4928 } else {
4929 while (scan < loceol && *scan != '\n')
4930 scan++;
a0ed51b3
LW
4931 }
4932 break;
ffc61ed2 4933 case SANY:
def8e4ea
JH
4934 if (do_utf8) {
4935 loceol = PL_regeol;
a0804c9e 4936 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4937 scan += UTF8SKIP(scan);
4938 hardcount++;
4939 }
4940 }
4941 else
4942 scan = loceol;
a0ed51b3 4943 break;
f33976b4
DB
4944 case CANY:
4945 scan = loceol;
4946 break;
090f7165
JH
4947 case EXACT: /* length of string is 1 */
4948 c = (U8)*STRING(p);
4949 while (scan < loceol && UCHARAT(scan) == c)
4950 scan++;
bbce6d69 4951 break;
4952 case EXACTF: /* length of string is 1 */
cd439c50 4953 c = (U8)*STRING(p);
bbce6d69 4954 while (scan < loceol &&
22c35a8c 4955 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4956 scan++;
4957 break;
4958 case EXACTFL: /* length of string is 1 */
3280af22 4959 PL_reg_flags |= RF_tainted;
cd439c50 4960 c = (U8)*STRING(p);
bbce6d69 4961 while (scan < loceol &&
22c35a8c 4962 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4963 scan++;
4964 break;
4965 case ANYOF:
ffc61ed2
JH
4966 if (do_utf8) {
4967 loceol = PL_regeol;
cfc92286 4968 while (hardcount < max && scan < loceol &&
32fc9b6a 4969 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4970 scan += UTF8SKIP(scan);
4971 hardcount++;
4972 }
4973 } else {
32fc9b6a 4974 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4975 scan++;
4976 }
a0d0e21e
LW
4977 break;
4978 case ALNUM:
1aa99e6b 4979 if (do_utf8) {
ffc61ed2 4980 loceol = PL_regeol;
1a4fad37 4981 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4982 while (hardcount < max && scan < loceol &&
3568d838 4983 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4984 scan += UTF8SKIP(scan);
4985 hardcount++;
4986 }
4987 } else {
4988 while (scan < loceol && isALNUM(*scan))
4989 scan++;
a0ed51b3
LW
4990 }
4991 break;
bbce6d69 4992 case ALNUML:
3280af22 4993 PL_reg_flags |= RF_tainted;
1aa99e6b 4994 if (do_utf8) {
ffc61ed2 4995 loceol = PL_regeol;
1aa99e6b
IH
4996 while (hardcount < max && scan < loceol &&
4997 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4998 scan += UTF8SKIP(scan);
4999 hardcount++;
5000 }
5001 } else {
5002 while (scan < loceol && isALNUM_LC(*scan))
5003 scan++;
a0ed51b3
LW
5004 }
5005 break;
a0d0e21e 5006 case NALNUM:
1aa99e6b 5007 if (do_utf8) {
ffc61ed2 5008 loceol = PL_regeol;
1a4fad37 5009 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5010 while (hardcount < max && scan < loceol &&
3568d838 5011 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5012 scan += UTF8SKIP(scan);
5013 hardcount++;
5014 }
5015 } else {
5016 while (scan < loceol && !isALNUM(*scan))
5017 scan++;
a0ed51b3
LW
5018 }
5019 break;
bbce6d69 5020 case NALNUML:
3280af22 5021 PL_reg_flags |= RF_tainted;
1aa99e6b 5022 if (do_utf8) {
ffc61ed2 5023 loceol = PL_regeol;
1aa99e6b
IH
5024 while (hardcount < max && scan < loceol &&
5025 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5026 scan += UTF8SKIP(scan);
5027 hardcount++;
5028 }
5029 } else {
5030 while (scan < loceol && !isALNUM_LC(*scan))
5031 scan++;
a0ed51b3
LW
5032 }
5033 break;
a0d0e21e 5034 case SPACE:
1aa99e6b 5035 if (do_utf8) {
ffc61ed2 5036 loceol = PL_regeol;
1a4fad37 5037 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5038 while (hardcount < max && scan < loceol &&
3568d838
JH
5039 (*scan == ' ' ||
5040 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5041 scan += UTF8SKIP(scan);
5042 hardcount++;
5043 }
5044 } else {
5045 while (scan < loceol && isSPACE(*scan))
5046 scan++;
a0ed51b3
LW
5047 }
5048 break;
bbce6d69 5049 case SPACEL:
3280af22 5050 PL_reg_flags |= RF_tainted;
1aa99e6b 5051 if (do_utf8) {
ffc61ed2 5052 loceol = PL_regeol;
1aa99e6b 5053 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5054 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5055 scan += UTF8SKIP(scan);
5056 hardcount++;
5057 }
5058 } else {
5059 while (scan < loceol && isSPACE_LC(*scan))
5060 scan++;
a0ed51b3
LW
5061 }
5062 break;
a0d0e21e 5063 case NSPACE:
1aa99e6b 5064 if (do_utf8) {
ffc61ed2 5065 loceol = PL_regeol;
1a4fad37 5066 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5067 while (hardcount < max && scan < loceol &&
3568d838
JH
5068 !(*scan == ' ' ||
5069 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5070 scan += UTF8SKIP(scan);
5071 hardcount++;
5072 }
5073 } else {
5074 while (scan < loceol && !isSPACE(*scan))
5075 scan++;
5076 break;
a0ed51b3 5077 }
bbce6d69 5078 case NSPACEL:
3280af22 5079 PL_reg_flags |= RF_tainted;
1aa99e6b 5080 if (do_utf8) {
ffc61ed2 5081 loceol = PL_regeol;
1aa99e6b 5082 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5083 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5084 scan += UTF8SKIP(scan);
5085 hardcount++;
5086 }
5087 } else {
5088 while (scan < loceol && !isSPACE_LC(*scan))
5089 scan++;
a0ed51b3
LW
5090 }
5091 break;
a0d0e21e 5092 case DIGIT:
1aa99e6b 5093 if (do_utf8) {
ffc61ed2 5094 loceol = PL_regeol;
1a4fad37 5095 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5096 while (hardcount < max && scan < loceol &&
3568d838 5097 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5098 scan += UTF8SKIP(scan);
5099 hardcount++;
5100 }
5101 } else {
5102 while (scan < loceol && isDIGIT(*scan))
5103 scan++;
a0ed51b3
LW
5104 }
5105 break;
a0d0e21e 5106 case NDIGIT:
1aa99e6b 5107 if (do_utf8) {
ffc61ed2 5108 loceol = PL_regeol;
1a4fad37 5109 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5110 while (hardcount < max && scan < loceol &&
3568d838 5111 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5112 scan += UTF8SKIP(scan);
5113 hardcount++;
5114 }
5115 } else {
5116 while (scan < loceol && !isDIGIT(*scan))
5117 scan++;
a0ed51b3
LW
5118 }
5119 break;
a0d0e21e
LW
5120 default: /* Called on something of 0 width. */
5121 break; /* So match right here or not at all. */
5122 }
a687059c 5123
a0ed51b3
LW
5124 if (hardcount)
5125 c = hardcount;
5126 else
5127 c = scan - PL_reginput;
3280af22 5128 PL_reginput = scan;
a687059c 5129
a3621e74 5130 DEBUG_r({
be8e71aa
YO
5131 SV *re_debug_flags = NULL;
5132 SV * const prop = sv_newmortal();
5133 GET_RE_DEBUG_FLAGS;
5134 DEBUG_EXECUTE_r({
5135 regprop(prog, prop, p);
5136 PerlIO_printf(Perl_debug_log,
5137 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5138 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5139 });
be8e71aa 5140 });
9041c2e3 5141
a0d0e21e 5142 return(c);
a687059c
LW
5143}
5144
c277df42 5145
be8e71aa 5146#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5147/*
ffc61ed2
JH
5148- regclass_swash - prepare the utf8 swash
5149*/
5150
5151SV *
32fc9b6a 5152Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5153{
97aff369 5154 dVAR;
9e55ce06
JH
5155 SV *sw = NULL;
5156 SV *si = NULL;
5157 SV *alt = NULL;
3dab1dad 5158 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5159
4f639d21 5160 if (data && data->count) {
a3b680e6 5161 const U32 n = ARG(node);
ffc61ed2 5162
4f639d21
DM
5163 if (data->what[n] == 's') {
5164 SV * const rv = (SV*)data->data[n];
890ce7af 5165 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5166 SV **const ary = AvARRAY(av);
9e55ce06 5167 SV **a, **b;
9041c2e3 5168
711a919c 5169 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5170 * documentation of these array elements. */
5171
b11f357e 5172 si = *ary;
8f7f7219 5173 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5174 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5175
ffc61ed2
JH
5176 if (a)
5177 sw = *a;
5178 else if (si && doinit) {
5179 sw = swash_init("utf8", "", si, 1, 0);
5180 (void)av_store(av, 1, sw);
5181 }
9e55ce06
JH
5182 if (b)
5183 alt = *b;
ffc61ed2
JH
5184 }
5185 }
5186
9e55ce06
JH
5187 if (listsvp)
5188 *listsvp = si;
5189 if (altsvp)
5190 *altsvp = alt;
ffc61ed2
JH
5191
5192 return sw;
5193}
76234dfb 5194#endif
ffc61ed2
JH
5195
5196/*
ba7b4546 5197 - reginclass - determine if a character falls into a character class
832705d4
JH
5198
5199 The n is the ANYOF regnode, the p is the target string, lenp
5200 is pointer to the maximum length of how far to go in the p
5201 (if the lenp is zero, UTF8SKIP(p) is used),
5202 do_utf8 tells whether the target string is in UTF-8.
5203
bbce6d69 5204 */
5205
76e3520e 5206STATIC bool
32fc9b6a 5207S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5208{
27da23d5 5209 dVAR;
a3b680e6 5210 const char flags = ANYOF_FLAGS(n);
bbce6d69 5211 bool match = FALSE;
cc07378b 5212 UV c = *p;
ae9ddab8 5213 STRLEN len = 0;
9e55ce06 5214 STRLEN plen;
1aa99e6b 5215
19f67299
TS
5216 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5217 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5218 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5219 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5220 if (len == (STRLEN)-1)
5221 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5222 }
bbce6d69 5223
0f0076b4 5224 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5225 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5226 if (lenp)
5227 *lenp = 0;
ffc61ed2 5228 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5229 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5230 match = TRUE;
bbce6d69 5231 }
3568d838 5232 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5233 match = TRUE;
ffc61ed2 5234 if (!match) {
9e55ce06 5235 AV *av;
32fc9b6a 5236 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5237
5238 if (sw) {
3568d838 5239 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5240 match = TRUE;
5241 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5242 if (!match && lenp && av) {
5243 I32 i;
9e55ce06 5244 for (i = 0; i <= av_len(av); i++) {
890ce7af 5245 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5246 STRLEN len;
890ce7af 5247 const char * const s = SvPV_const(sv, len);
9e55ce06 5248
061b10df 5249 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5250 *lenp = len;
5251 match = TRUE;
5252 break;
5253 }
5254 }
5255 }
5256 if (!match) {
89ebb4a3 5257 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5258 STRLEN tmplen;
5259
9e55ce06
JH
5260 to_utf8_fold(p, tmpbuf, &tmplen);
5261 if (swash_fetch(sw, tmpbuf, do_utf8))
5262 match = TRUE;
5263 }
ffc61ed2
JH
5264 }
5265 }
bbce6d69 5266 }
9e55ce06 5267 if (match && lenp && *lenp == 0)
0f0076b4 5268 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5269 }
1aa99e6b 5270 if (!match && c < 256) {
ffc61ed2
JH
5271 if (ANYOF_BITMAP_TEST(n, c))
5272 match = TRUE;
5273 else if (flags & ANYOF_FOLD) {
eb160463 5274 U8 f;
a0ed51b3 5275
ffc61ed2
JH
5276 if (flags & ANYOF_LOCALE) {
5277 PL_reg_flags |= RF_tainted;
5278 f = PL_fold_locale[c];
5279 }
5280 else
5281 f = PL_fold[c];
5282 if (f != c && ANYOF_BITMAP_TEST(n, f))
5283 match = TRUE;
5284 }
5285
5286 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5287 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5288 if (
5289 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5290 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5291 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5292 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5293 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5294 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5295 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5296 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5297 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5298 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5299 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5300 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5301 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5302 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5303 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5304 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5305 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5306 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5307 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5308 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5309 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5310 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5311 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5312 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5313 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5314 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5315 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5316 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5317 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5318 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5319 ) /* How's that for a conditional? */
5320 {
5321 match = TRUE;
5322 }
a0ed51b3 5323 }
a0ed51b3
LW
5324 }
5325
a0ed51b3
LW
5326 return (flags & ANYOF_INVERT) ? !match : match;
5327}
161b471a 5328
dfe13c55 5329STATIC U8 *
0ce71af7 5330S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5331{
97aff369 5332 dVAR;
a0ed51b3 5333 if (off >= 0) {
1aa99e6b 5334 while (off-- && s < lim) {
ffc61ed2 5335 /* XXX could check well-formedness here */
a0ed51b3 5336 s += UTF8SKIP(s);
ffc61ed2 5337 }
a0ed51b3
LW
5338 }
5339 else {
5340 while (off++) {
1aa99e6b 5341 if (s > lim) {
a0ed51b3 5342 s--;
ffc61ed2 5343 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5344 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5345 s--;
ffc61ed2
JH
5346 }
5347 /* XXX could check well-formedness here */
a0ed51b3
LW
5348 }
5349 }
5350 }
5351 return s;
5352}
161b471a 5353
dfe13c55 5354STATIC U8 *
0ce71af7 5355S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5356{
97aff369 5357 dVAR;
a0ed51b3 5358 if (off >= 0) {
1aa99e6b 5359 while (off-- && s < lim) {
ffc61ed2 5360 /* XXX could check well-formedness here */
a0ed51b3 5361 s += UTF8SKIP(s);
ffc61ed2 5362 }
a0ed51b3 5363 if (off >= 0)
3dab1dad 5364 return NULL;
a0ed51b3
LW
5365 }
5366 else {
5367 while (off++) {
1aa99e6b 5368 if (s > lim) {
a0ed51b3 5369 s--;
ffc61ed2 5370 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5371 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5372 s--;
ffc61ed2
JH
5373 }
5374 /* XXX could check well-formedness here */
a0ed51b3
LW
5375 }
5376 else
5377 break;
5378 }
5379 if (off <= 0)
3dab1dad 5380 return NULL;
a0ed51b3
LW
5381 }
5382 return s;
5383}
51371543 5384
51371543 5385static void
acfe0abc 5386restore_pos(pTHX_ void *arg)
51371543 5387{
97aff369 5388 dVAR;
097eb12c 5389 regexp * const rex = (regexp *)arg;
51371543
GS
5390 if (PL_reg_eval_set) {
5391 if (PL_reg_oldsaved) {
4f639d21
DM
5392 rex->subbeg = PL_reg_oldsaved;
5393 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5394#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5395 rex->saved_copy = PL_nrs;
ed252734 5396#endif
4f639d21 5397 RX_MATCH_COPIED_on(rex);
51371543
GS
5398 }
5399 PL_reg_magic->mg_len = PL_reg_oldpos;
5400 PL_reg_eval_set = 0;
5401 PL_curpm = PL_reg_oldcurpm;
5402 }
5403}
33b8afdf
JH
5404
5405STATIC void
5406S_to_utf8_substr(pTHX_ register regexp *prog)
5407{
33b8afdf 5408 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5409 SV* const sv = newSVsv(prog->float_substr);
5410 prog->float_utf8 = sv;
33b8afdf
JH
5411 sv_utf8_upgrade(sv);
5412 if (SvTAIL(prog->float_substr))
5413 SvTAIL_on(sv);
5414 if (prog->float_substr == prog->check_substr)
5415 prog->check_utf8 = sv;
5416 }
5417 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5418 SV* const sv = newSVsv(prog->anchored_substr);
5419 prog->anchored_utf8 = sv;
33b8afdf
JH
5420 sv_utf8_upgrade(sv);
5421 if (SvTAIL(prog->anchored_substr))
5422 SvTAIL_on(sv);
5423 if (prog->anchored_substr == prog->check_substr)
5424 prog->check_utf8 = sv;
5425 }
5426}
5427
5428STATIC void
5429S_to_byte_substr(pTHX_ register regexp *prog)
5430{
97aff369 5431 dVAR;
33b8afdf 5432 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5433 SV* sv = newSVsv(prog->float_utf8);
5434 prog->float_substr = sv;
33b8afdf
JH
5435 if (sv_utf8_downgrade(sv, TRUE)) {
5436 if (SvTAIL(prog->float_utf8))
5437 SvTAIL_on(sv);
5438 } else {
5439 SvREFCNT_dec(sv);
5440 prog->float_substr = sv = &PL_sv_undef;
5441 }
5442 if (prog->float_utf8 == prog->check_utf8)
5443 prog->check_substr = sv;
5444 }
5445 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5446 SV* sv = newSVsv(prog->anchored_utf8);
5447 prog->anchored_substr = sv;
33b8afdf
JH
5448 if (sv_utf8_downgrade(sv, TRUE)) {
5449 if (SvTAIL(prog->anchored_utf8))
5450 SvTAIL_on(sv);
5451 } else {
5452 SvREFCNT_dec(sv);
5453 prog->anchored_substr = sv = &PL_sv_undef;
5454 }
5455 if (prog->anchored_utf8 == prog->check_utf8)
5456 prog->check_substr = sv;
5457 }
5458}
66610fdd
RGS
5459
5460/*
5461 * Local variables:
5462 * c-indentation-style: bsd
5463 * c-basic-offset: 4
5464 * indent-tabs-mode: t
5465 * End:
5466 *
37442d52
RGS
5467 * ex: set ts=8 sts=4 sw=4 noet:
5468 */