This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
cflags.SH: rework the gcc warnings selection
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
163 int p;
164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
175 SSPUSHINT(p);
176 }
b1ce53c5 177/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
178 SSPUSHINT(PL_regsize);
179 SSPUSHINT(*PL_reglastparen);
a01268b5 180 SSPUSHINT(*PL_reglastcloseparen);
3280af22 181 SSPUSHPTR(PL_reginput);
41123dfd
JH
182#define REGCP_FRAME_ELEMS 2
183/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
184 * are needed for the regexp context stack bookkeeping. */
185 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 186 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 187
a0d0e21e
LW
188 return retval;
189}
190
c277df42 191/* These are needed since we do not localize EVAL nodes: */
a3621e74 192# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 193 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 194 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 195
a3621e74 196# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 197 PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 199 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 200
76e3520e 201STATIC char *
097eb12c 202S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 203{
97aff369 204 dVAR;
b1ce53c5 205 I32 i;
a0d0e21e 206 char *input;
b1ce53c5 207
a3621e74
YO
208 GET_RE_DEBUG_FLAGS_DECL;
209
b1ce53c5 210 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 211 i = SSPOPINT;
b1ce53c5
JH
212 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
213 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 214 input = (char *) SSPOPPTR;
a01268b5 215 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
216 *PL_reglastparen = SSPOPINT;
217 PL_regsize = SSPOPINT;
b1ce53c5
JH
218
219 /* Now restore the parentheses context. */
41123dfd
JH
220 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
221 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 222 I32 tmps;
097eb12c 223 U32 paren = (U32)SSPOPINT;
3280af22 224 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
225 PL_regstartp[paren] = SSPOPINT;
226 tmps = SSPOPINT;
3280af22
NIS
227 if (paren <= *PL_reglastparen)
228 PL_regendp[paren] = tmps;
a3621e74 229 DEBUG_EXECUTE_r(
c3464db5 230 PerlIO_printf(Perl_debug_log,
b900a521 231 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 232 (UV)paren, (IV)PL_regstartp[paren],
b900a521 233 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 234 (IV)PL_regendp[paren],
3280af22 235 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 236 );
a0d0e21e 237 }
a3621e74 238 DEBUG_EXECUTE_r(
bb7a0f54 239 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 240 PerlIO_printf(Perl_debug_log,
faccc32b 241 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 242 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
243 }
244 );
daf18116 245#if 1
dafc8851
JH
246 /* It would seem that the similar code in regtry()
247 * already takes care of this, and in fact it is in
248 * a better location to since this code can #if 0-ed out
249 * but the code in regtry() is needed or otherwise tests
250 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
251 * (as of patchlevel 7877) will fail. Then again,
252 * this code seems to be necessary or otherwise
253 * building DynaLoader will fail:
254 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
255 * --jhi */
bb7a0f54 256 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
257 if (i > PL_regsize)
258 PL_regstartp[i] = -1;
259 PL_regendp[i] = -1;
a0d0e21e 260 }
dafc8851 261#endif
a0d0e21e
LW
262 return input;
263}
264
02db2b7b 265#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 266
95b24440 267#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
268 if (paren) { \
269 if (n) { \
270 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
271 PL_regendp[paren] = input - PL_bostr; \
272 } \
273 else \
274 PL_regendp[paren] = -1; \
275 } \
95b24440
DM
276 REGMATCH(next, where); \
277 if (result) \
29d1e993
HS
278 sayYES; \
279 if (paren && n) \
280 PL_regendp[paren] = -1; \
281}
282
283
a687059c 284/*
e50aee73 285 * pregexec and friends
a687059c
LW
286 */
287
76234dfb 288#ifndef PERL_IN_XSUB_RE
a687059c 289/*
c277df42 290 - pregexec - match a regexp against a string
a687059c 291 */
c277df42 292I32
864dbfa3 293Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 294 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
295/* strend: pointer to null at end of string */
296/* strbeg: real beginning of string */
297/* minend: end of match must be >=minend after stringarg. */
298/* nosave: For optimizations. */
299{
300 return
9041c2e3 301 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
302 nosave ? 0 : REXEC_COPY_STR);
303}
76234dfb 304#endif
22e551b9 305
9041c2e3 306/*
cad2e5aa
JH
307 * Need to implement the following flags for reg_anch:
308 *
309 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
310 * USE_INTUIT_ML
311 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
312 * INTUIT_AUTORITATIVE_ML
313 * INTUIT_ONCE_NOML - Intuit can match in one location only.
314 * INTUIT_ONCE_ML
315 *
316 * Another flag for this function: SECOND_TIME (so that float substrs
317 * with giant delta may be not rechecked).
318 */
319
320/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
321
3f7c398e 322/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
323 Otherwise, only SvCUR(sv) is used to get strbeg. */
324
325/* XXXX We assume that strpos is strbeg unless sv. */
326
6eb5f6b9
JH
327/* XXXX Some places assume that there is a fixed substring.
328 An update may be needed if optimizer marks as "INTUITable"
329 RExen without fixed substrings. Similarly, it is assumed that
330 lengths of all the strings are no more than minlen, thus they
331 cannot come from lookahead.
332 (Or minlen should take into account lookahead.) */
333
2c2d71f5
JH
334/* A failure to find a constant substring means that there is no need to make
335 an expensive call to REx engine, thus we celebrate a failure. Similarly,
336 finding a substring too deep into the string means that less calls to
30944b6d
IZ
337 regtry() should be needed.
338
339 REx compiler's optimizer found 4 possible hints:
340 a) Anchored substring;
341 b) Fixed substring;
342 c) Whether we are anchored (beginning-of-line or \G);
343 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 344 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
345 string which does not contradict any of them.
346 */
2c2d71f5 347
6eb5f6b9
JH
348/* Most of decisions we do here should have been done at compile time.
349 The nodes of the REx which we used for the search should have been
350 deleted from the finite automaton. */
351
cad2e5aa
JH
352char *
353Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
354 char *strend, U32 flags, re_scream_pos_data *data)
355{
97aff369 356 dVAR;
b7953727 357 register I32 start_shift = 0;
cad2e5aa 358 /* Should be nonnegative! */
b7953727 359 register I32 end_shift = 0;
2c2d71f5
JH
360 register char *s;
361 register SV *check;
a1933d95 362 char *strbeg;
cad2e5aa 363 char *t;
a3b680e6 364 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 365 I32 ml_anch;
bd61b366
SS
366 register char *other_last = NULL; /* other substr checked before this */
367 char *check_at = NULL; /* check substr found at this pos */
1df70142 368 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 369#ifdef DEBUGGING
890ce7af
AL
370 const char * const i_strpos = strpos;
371 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 372#endif
a3621e74
YO
373
374 GET_RE_DEBUG_FLAGS_DECL;
375
a30b2f1f 376 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 377
b8d68ded 378 if (prog->reganch & ROPT_UTF8) {
a3621e74 379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
380 "UTF-8 regex...\n"));
381 PL_reg_flags |= RF_utf8;
382 }
383
a3621e74 384 DEBUG_EXECUTE_r({
1df70142 385 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
386 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
387 strpos;
1df70142 388 const int len = PL_reg_match_utf8 ?
bb7a0f54 389 (int)strlen(s) : strend - strpos;
2a782b5b
JH
390 if (!PL_colorset)
391 reginitcolors();
b8d68ded 392 if (PL_reg_match_utf8)
a3621e74 393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 394 "UTF-8 target...\n"));
2a782b5b 395 PerlIO_printf(Perl_debug_log,
a0288114 396 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 397 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
398 prog->precomp,
399 PL_colors[1],
400 (strlen(prog->precomp) > 60 ? "..." : ""),
401 PL_colors[0],
402 (int)(len > 60 ? 60 : len),
403 s, PL_colors[1],
404 (len > 60 ? "..." : "")
405 );
406 });
cad2e5aa 407
c344f387
JH
408 /* CHR_DIST() would be more correct here but it makes things slow. */
409 if (prog->minlen > strend - strpos) {
a3621e74 410 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 411 "String too short... [re_intuit_start]\n"));
cad2e5aa 412 goto fail;
2c2d71f5 413 }
a1933d95 414 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 415 PL_regeol = strend;
33b8afdf
JH
416 if (do_utf8) {
417 if (!prog->check_utf8 && prog->check_substr)
418 to_utf8_substr(prog);
419 check = prog->check_utf8;
420 } else {
421 if (!prog->check_substr && prog->check_utf8)
422 to_byte_substr(prog);
423 check = prog->check_substr;
424 }
425 if (check == &PL_sv_undef) {
a3621e74 426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
427 "Non-utf string cannot match utf check string\n"));
428 goto fail;
429 }
2c2d71f5 430 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
431 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
432 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 433 && !multiline ) ); /* Check after \n? */
cad2e5aa 434
7e25d62c
JH
435 if (!ml_anch) {
436 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
437 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 438 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
439 && sv && !SvROK(sv)
440 && (strpos != strbeg)) {
a3621e74 441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
442 goto fail;
443 }
444 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 445 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 446 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
447 I32 slen;
448
1aa99e6b 449 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
450 if (SvTAIL(check)) {
451 slen = SvCUR(check); /* >= 1 */
cad2e5aa 452
9041c2e3 453 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 454 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 456 goto fail_finish;
cad2e5aa
JH
457 }
458 /* Now should match s[0..slen-2] */
459 slen--;
3f7c398e 460 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 461 || (slen > 1
3f7c398e 462 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 463 report_neq:
a3621e74 464 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
465 goto fail_finish;
466 }
cad2e5aa 467 }
3f7c398e 468 else if (*SvPVX_const(check) != *s
653099ff 469 || ((slen = SvCUR(check)) > 1
3f7c398e 470 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 471 goto report_neq;
c315bfe8 472 check_at = s;
2c2d71f5 473 goto success_at_start;
7e25d62c 474 }
cad2e5aa 475 }
2c2d71f5 476 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 477 s = strpos;
2c2d71f5 478 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 479 end_shift = prog->minlen - start_shift -
653099ff 480 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 481 if (!ml_anch) {
a3b680e6 482 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 483 - (SvTAIL(check) != 0);
a3b680e6 484 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
485
486 if (end_shift < eshift)
487 end_shift = eshift;
488 }
cad2e5aa 489 }
2c2d71f5 490 else { /* Can match at random position */
cad2e5aa
JH
491 ml_anch = 0;
492 s = strpos;
2c2d71f5
JH
493 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
494 /* Should be nonnegative! */
495 end_shift = prog->minlen - start_shift -
653099ff 496 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
497 }
498
2c2d71f5 499#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 500 if (end_shift < 0)
6bbae5e6 501 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
502#endif
503
2c2d71f5
JH
504 restart:
505 /* Find a possible match in the region s..strend by looking for
506 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 507 if (flags & REXEC_SCREAM) {
cad2e5aa 508 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 509 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 510
2c2d71f5
JH
511 if (PL_screamfirst[BmRARE(check)] >= 0
512 || ( BmRARE(check) == '\n'
513 && (BmPREVIOUS(check) == SvCUR(check) - 1)
514 && SvTAIL(check) ))
9041c2e3 515 s = screaminstr(sv, check,
2c2d71f5 516 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 517 else
2c2d71f5 518 goto fail_finish;
4addbd3b
HS
519 /* we may be pointing at the wrong string */
520 if (s && RX_MATCH_COPIED(prog))
3f7c398e 521 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
522 if (data)
523 *data->scream_olds = s;
524 }
f33976b4 525 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
526 s = fbm_instr((U8*)(s + start_shift),
527 (U8*)(strend - end_shift),
7fba1cd6 528 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 529 else
1aa99e6b
IH
530 s = fbm_instr(HOP3(s, start_shift, strend),
531 HOP3(strend, -end_shift, strbeg),
7fba1cd6 532 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
533
534 /* Update the count-of-usability, remove useless subpatterns,
535 unshift s. */
2c2d71f5 536
a0288114 537 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 538 (s ? "Found" : "Did not find"),
33b8afdf 539 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 540 PL_colors[0],
7b0972df 541 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 542 SvPVX_const(check),
2c2d71f5
JH
543 PL_colors[1], (SvTAIL(check) ? "$" : ""),
544 (s ? " at offset " : "...\n") ) );
545
546 if (!s)
547 goto fail_finish;
548
6eb5f6b9
JH
549 check_at = s;
550
2c2d71f5 551 /* Finish the diagnostic message */
a3621e74 552 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
553
554 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
555 Start with the other substr.
556 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 557 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
558 *always* match. Probably should be marked during compile...
559 Probably it is right to do no SCREAM here...
560 */
561
33b8afdf 562 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 563 /* Take into account the "other" substring. */
2c2d71f5
JH
564 /* XXXX May be hopelessly wrong for UTF... */
565 if (!other_last)
6eb5f6b9 566 other_last = strpos;
33b8afdf 567 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
568 do_other_anchored:
569 {
890ce7af
AL
570 char * const last = HOP3c(s, -start_shift, strbeg);
571 char *last1, *last2;
be8e71aa 572 char * const saved_s = s;
33b8afdf 573 SV* must;
2c2d71f5 574
2c2d71f5
JH
575 t = s - prog->check_offset_max;
576 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 577 && (!do_utf8
0ce71af7 578 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 579 && t > strpos)))
6f207bd3 580 NOOP;
2c2d71f5
JH
581 else
582 t = strpos;
1aa99e6b 583 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
584 if (t < other_last) /* These positions already checked */
585 t = other_last;
1aa99e6b 586 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
587 if (last < last1)
588 last1 = last;
589 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
590 /* On end-of-str: see comment below. */
33b8afdf
JH
591 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
592 if (must == &PL_sv_undef) {
593 s = (char*)NULL;
a3621e74 594 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
595 }
596 else
597 s = fbm_instr(
598 (unsigned char*)t,
599 HOP3(HOP3(last1, prog->anchored_offset, strend)
600 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
601 must,
7fba1cd6 602 multiline ? FBMrf_MULTILINE : 0
33b8afdf 603 );
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 605 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
606 (s ? "Found" : "Contradicts"),
607 PL_colors[0],
33b8afdf
JH
608 (int)(SvCUR(must)
609 - (SvTAIL(must)!=0)),
3f7c398e 610 SvPVX_const(must),
33b8afdf 611 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
612 if (!s) {
613 if (last1 >= last2) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
615 ", giving up...\n"));
616 goto fail_finish;
617 }
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 619 ", trying floating at offset %ld...\n",
be8e71aa 620 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
621 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
622 s = HOP3c(last, 1, strend);
2c2d71f5
JH
623 goto restart;
624 }
625 else {
a3621e74 626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 627 (long)(s - i_strpos)));
1aa99e6b
IH
628 t = HOP3c(s, -prog->anchored_offset, strbeg);
629 other_last = HOP3c(s, 1, strend);
be8e71aa 630 s = saved_s;
2c2d71f5
JH
631 if (t == strpos)
632 goto try_at_start;
2c2d71f5
JH
633 goto try_at_offset;
634 }
30944b6d 635 }
2c2d71f5
JH
636 }
637 else { /* Take into account the floating substring. */
33b8afdf 638 char *last, *last1;
be8e71aa 639 char * const saved_s = s;
33b8afdf
JH
640 SV* must;
641
642 t = HOP3c(s, -start_shift, strbeg);
643 last1 = last =
644 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
645 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
646 last = HOP3c(t, prog->float_max_offset, strend);
647 s = HOP3c(t, prog->float_min_offset, strend);
648 if (s < other_last)
649 s = other_last;
2c2d71f5 650 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
651 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
652 /* fbm_instr() takes into account exact value of end-of-str
653 if the check is SvTAIL(ed). Since false positives are OK,
654 and end-of-str is not later than strend we are OK. */
655 if (must == &PL_sv_undef) {
656 s = (char*)NULL;
a3621e74 657 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
658 }
659 else
2c2d71f5 660 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
661 (unsigned char*)last + SvCUR(must)
662 - (SvTAIL(must)!=0),
7fba1cd6 663 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
665 (s ? "Found" : "Contradicts"),
666 PL_colors[0],
667 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 668 SvPVX_const(must),
33b8afdf
JH
669 PL_colors[1], (SvTAIL(must) ? "$" : "")));
670 if (!s) {
671 if (last1 == last) {
a3621e74 672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
673 ", giving up...\n"));
674 goto fail_finish;
2c2d71f5 675 }
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 677 ", trying anchored starting at offset %ld...\n",
be8e71aa 678 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
679 other_last = last;
680 s = HOP3c(t, 1, strend);
681 goto restart;
682 }
683 else {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
685 (long)(s - i_strpos)));
686 other_last = s; /* Fix this later. --Hugo */
be8e71aa 687 s = saved_s;
33b8afdf
JH
688 if (t == strpos)
689 goto try_at_start;
690 goto try_at_offset;
691 }
2c2d71f5 692 }
cad2e5aa 693 }
2c2d71f5
JH
694
695 t = s - prog->check_offset_max;
2c2d71f5 696 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 697 && (!do_utf8
0ce71af7 698 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 699 && t > strpos))) {
2c2d71f5
JH
700 /* Fixed substring is found far enough so that the match
701 cannot start at strpos. */
702 try_at_offset:
cad2e5aa 703 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
704 /* Eventually fbm_*() should handle this, but often
705 anchored_offset is not 0, so this check will not be wasted. */
706 /* XXXX In the code below we prefer to look for "^" even in
707 presence of anchored substrings. And we search even
708 beyond the found float position. These pessimizations
709 are historical artefacts only. */
710 find_anchor:
2c2d71f5 711 while (t < strend - prog->minlen) {
cad2e5aa 712 if (*t == '\n') {
4ee3650e 713 if (t < check_at - prog->check_offset_min) {
33b8afdf 714 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
715 /* Since we moved from the found position,
716 we definitely contradict the found anchored
30944b6d
IZ
717 substr. Due to the above check we do not
718 contradict "check" substr.
719 Thus we can arrive here only if check substr
720 is float. Redo checking for "other"=="fixed".
721 */
9041c2e3 722 strpos = t + 1;
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 724 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
725 goto do_other_anchored;
726 }
4ee3650e
GS
727 /* We don't contradict the found floating substring. */
728 /* XXXX Why not check for STCLASS? */
cad2e5aa 729 s = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
732 goto set_useful;
733 }
4ee3650e
GS
734 /* Position contradicts check-string */
735 /* XXXX probably better to look for check-string
736 than for "\n", so one should lower the limit for t? */
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 739 other_last = strpos = s = t + 1;
cad2e5aa
JH
740 goto restart;
741 }
742 t++;
743 }
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 745 PL_colors[0], PL_colors[1]));
2c2d71f5 746 goto fail_finish;
cad2e5aa 747 }
f5952150 748 else {
a3621e74 749 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 750 PL_colors[0], PL_colors[1]));
f5952150 751 }
cad2e5aa
JH
752 s = t;
753 set_useful:
33b8afdf 754 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
755 }
756 else {
f5952150 757 /* The found string does not prohibit matching at strpos,
2c2d71f5 758 - no optimization of calling REx engine can be performed,
f5952150
GS
759 unless it was an MBOL and we are not after MBOL,
760 or a future STCLASS check will fail this. */
2c2d71f5
JH
761 try_at_start:
762 /* Even in this situation we may use MBOL flag if strpos is offset
763 wrt the start of the string. */
05b4157f 764 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 765 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
766 /* May be due to an implicit anchor of m{.*foo} */
767 && !(prog->reganch & ROPT_IMPLICIT))
768 {
cad2e5aa
JH
769 t = strpos;
770 goto find_anchor;
771 }
a3621e74 772 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 773 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 774 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 775 );
2c2d71f5 776 success_at_start:
30944b6d 777 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
778 && (do_utf8 ? (
779 prog->check_utf8 /* Could be deleted already */
780 && --BmUSEFUL(prog->check_utf8) < 0
781 && (prog->check_utf8 == prog->float_utf8)
782 ) : (
783 prog->check_substr /* Could be deleted already */
784 && --BmUSEFUL(prog->check_substr) < 0
785 && (prog->check_substr == prog->float_substr)
786 )))
66e933ab 787 {
cad2e5aa 788 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
790 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
791 if (do_utf8 ? prog->check_substr : prog->check_utf8)
792 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
793 prog->check_substr = prog->check_utf8 = NULL; /* disable */
794 prog->float_substr = prog->float_utf8 = NULL; /* clear */
795 check = NULL; /* abort */
cad2e5aa 796 s = strpos;
3cf5c195
IZ
797 /* XXXX This is a remnant of the old implementation. It
798 looks wasteful, since now INTUIT can use many
6eb5f6b9 799 other heuristics. */
cad2e5aa
JH
800 prog->reganch &= ~RE_USE_INTUIT;
801 }
802 else
803 s = strpos;
804 }
805
6eb5f6b9
JH
806 /* Last resort... */
807 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 808 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
809 /* minlen == 0 is possible if regstclass is \b or \B,
810 and the fixed substr is ''$.
811 Since minlen is already taken into account, s+1 is before strend;
812 accidentally, minlen >= 1 guaranties no false positives at s + 1
813 even for \b or \B. But (minlen? 1 : 0) below assumes that
814 regstclass does not come from lookahead... */
815 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
816 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 817 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 818 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 819 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 820 : 1);
07be1b83 821 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 822 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 823 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
824 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
825 cl_l, strend)
826 : strend);
07be1b83
YO
827 /*if (OP(prog->regstclass) == TRIE)
828 endpos++;*/
6eb5f6b9 829 t = s;
3b0527fe 830 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
831 if (!s) {
832#ifdef DEBUGGING
cbbf8932 833 const char *what = NULL;
6eb5f6b9
JH
834#endif
835 if (endpos == strend) {
a3621e74 836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
837 "Could not match STCLASS...\n") );
838 goto fail;
839 }
a3621e74 840 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 841 "This position contradicts STCLASS...\n") );
653099ff
GS
842 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
843 goto fail;
6eb5f6b9 844 /* Contradict one of substrings */
33b8afdf
JH
845 if (prog->anchored_substr || prog->anchored_utf8) {
846 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 847 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 848 hop_and_restart:
1aa99e6b 849 s = HOP3c(t, 1, strend);
66e933ab
GS
850 if (s + start_shift + end_shift > strend) {
851 /* XXXX Should be taken into account earlier? */
a3621e74 852 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
853 "Could not match STCLASS...\n") );
854 goto fail;
855 }
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
860 what, (long)(s + start_shift - i_strpos)) );
861 goto restart;
862 }
66e933ab 863 /* Have both, check_string is floating */
6eb5f6b9
JH
864 if (t + start_shift >= check_at) /* Contradicts floating=check */
865 goto retry_floating_check;
866 /* Recheck anchored substring, but not floating... */
9041c2e3 867 s = check_at;
5e39e1e5
HS
868 if (!check)
869 goto giveup;
a3621e74 870 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 871 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
872 (long)(other_last - i_strpos)) );
873 goto do_other_anchored;
874 }
60e71179
GS
875 /* Another way we could have checked stclass at the
876 current position only: */
877 if (ml_anch) {
878 s = t = t + 1;
5e39e1e5
HS
879 if (!check)
880 goto giveup;
a3621e74 881 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 882 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 883 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 884 goto try_at_offset;
66e933ab 885 }
33b8afdf 886 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 887 goto fail;
6eb5f6b9
JH
888 /* Check is floating subtring. */
889 retry_floating_check:
890 t = check_at - start_shift;
a3621e74 891 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
892 goto hop_and_restart;
893 }
b7953727 894 if (t != s) {
a3621e74 895 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 896 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
897 (long)(t - i_strpos), (long)(s - i_strpos))
898 );
899 }
900 else {
a3621e74 901 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
902 "Does not contradict STCLASS...\n");
903 );
904 }
6eb5f6b9 905 }
5e39e1e5 906 giveup:
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
908 PL_colors[4], (check ? "Guessed" : "Giving up"),
909 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 910 return s;
2c2d71f5
JH
911
912 fail_finish: /* Substring not found */
33b8afdf
JH
913 if (prog->check_substr || prog->check_utf8) /* could be removed already */
914 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 915 fail:
a3621e74 916 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 917 PL_colors[4], PL_colors[5]));
bd61b366 918 return NULL;
cad2e5aa 919}
9661b544 920
6eb5f6b9 921/* We know what class REx starts with. Try to find this position... */
3b0527fe 922/* if reginfo is NULL, its a dryrun */
07be1b83
YO
923/* annoyingly all the vars in this routine have different names from their counterparts
924 in regmatch. /grrr */
3b0527fe 925
3c3eec57 926STATIC char *
07be1b83
YO
927S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
928 const char *strend, const regmatch_info *reginfo)
a687059c 929{
27da23d5 930 dVAR;
1df70142 931 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 932 char *m;
d8093b23 933 STRLEN ln;
5dab1207 934 STRLEN lnc;
078c425b 935 register STRLEN uskip;
d8093b23
G
936 unsigned int c1;
937 unsigned int c2;
6eb5f6b9
JH
938 char *e;
939 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 940 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 941
6eb5f6b9
JH
942 /* We know what class it must start with. */
943 switch (OP(c)) {
6eb5f6b9 944 case ANYOF:
388cc4de 945 if (do_utf8) {
078c425b 946 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
947 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
948 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
949 reginclass(prog, c, (U8*)s, 0, do_utf8) :
950 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 951 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
952 goto got_it;
953 else
954 tmp = doevery;
955 }
956 else
957 tmp = 1;
078c425b 958 s += uskip;
388cc4de
HS
959 }
960 }
961 else {
962 while (s < strend) {
963 STRLEN skip = 1;
964
32fc9b6a 965 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
966 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
967 /* The assignment of 2 is intentional:
968 * for the folded sharp s, the skip is 2. */
969 (skip = SHARP_S_SKIP))) {
3b0527fe 970 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
971 goto got_it;
972 else
973 tmp = doevery;
974 }
975 else
976 tmp = 1;
977 s += skip;
978 }
a0d0e21e 979 }
6eb5f6b9 980 break;
f33976b4
DB
981 case CANY:
982 while (s < strend) {
3b0527fe 983 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
984 goto got_it;
985 else
986 tmp = doevery;
987 s++;
988 }
989 break;
6eb5f6b9 990 case EXACTF:
5dab1207
NIS
991 m = STRING(c);
992 ln = STR_LEN(c); /* length to match in octets/bytes */
993 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 994 if (UTF) {
a2a2844f 995 STRLEN ulen1, ulen2;
5dab1207 996 U8 *sm = (U8 *) m;
89ebb4a3
JH
997 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
998 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 999 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1000
1001 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1002 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1003
89ebb4a3 1004 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1005 0, uniflags);
89ebb4a3 1006 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1007 0, uniflags);
5dab1207
NIS
1008 lnc = 0;
1009 while (sm < ((U8 *) m + ln)) {
1010 lnc++;
1011 sm += UTF8SKIP(sm);
1012 }
1aa99e6b
IH
1013 }
1014 else {
1015 c1 = *(U8*)m;
1016 c2 = PL_fold[c1];
1017 }
6eb5f6b9
JH
1018 goto do_exactf;
1019 case EXACTFL:
5dab1207
NIS
1020 m = STRING(c);
1021 ln = STR_LEN(c);
1022 lnc = (I32) ln;
d8093b23 1023 c1 = *(U8*)m;
6eb5f6b9
JH
1024 c2 = PL_fold_locale[c1];
1025 do_exactf:
db12adc6 1026 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1027
3b0527fe 1028 if (!reginfo && e < s)
6eb5f6b9 1029 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1030
60a8b682
JH
1031 /* The idea in the EXACTF* cases is to first find the
1032 * first character of the EXACTF* node and then, if
1033 * necessary, case-insensitively compare the full
1034 * text of the node. The c1 and c2 are the first
1035 * characters (though in Unicode it gets a bit
1036 * more complicated because there are more cases
7f16dd3d
JH
1037 * than just upper and lower: one needs to use
1038 * the so-called folding case for case-insensitive
1039 * matching (called "loose matching" in Unicode).
1040 * ibcmp_utf8() will do just that. */
60a8b682 1041
1aa99e6b 1042 if (do_utf8) {
575cac57 1043 UV c, f;
89ebb4a3 1044 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1045 STRLEN len, foldlen;
4ad0818d 1046 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1047 if (c1 == c2) {
5dab1207
NIS
1048 /* Upper and lower of 1st char are equal -
1049 * probably not a "letter". */
1aa99e6b 1050 while (s <= e) {
89ebb4a3 1051 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1052 uniflags);
80aecb99
JH
1053 if ( c == c1
1054 && (ln == len ||
3dab1dad
YO
1055 ibcmp_utf8(s, NULL, 0, do_utf8,
1056 m, NULL, ln, (bool)UTF))
3b0527fe 1057 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1058 goto got_it;
80aecb99 1059 else {
1df70142 1060 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1061 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1062 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1063 if ( f != c
1064 && (f == c1 || f == c2)
1065 && (ln == foldlen ||
66423254 1066 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1067 NULL, foldlen, do_utf8,
d07ddd77 1068 m,
3dab1dad 1069 NULL, ln, (bool)UTF))
3b0527fe 1070 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1071 goto got_it;
1072 }
1aa99e6b
IH
1073 s += len;
1074 }
09091399
JH
1075 }
1076 else {
1aa99e6b 1077 while (s <= e) {
89ebb4a3 1078 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1079 uniflags);
80aecb99 1080
60a8b682 1081 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1082 * Note that not all the possible combinations
1083 * are handled here: some of them are handled
1084 * by the standard folding rules, and some of
1085 * them (the character class or ANYOF cases)
1086 * are handled during compiletime in
1087 * regexec.c:S_regclass(). */
880bd946
JH
1088 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1089 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1090 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1091
1092 if ( (c == c1 || c == c2)
1093 && (ln == len ||
3dab1dad
YO
1094 ibcmp_utf8(s, NULL, 0, do_utf8,
1095 m, NULL, ln, (bool)UTF))
3b0527fe 1096 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1097 goto got_it;
80aecb99 1098 else {
1df70142 1099 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1100 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1101 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1102 if ( f != c
1103 && (f == c1 || f == c2)
1104 && (ln == foldlen ||
a6872d42 1105 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1106 NULL, foldlen, do_utf8,
d07ddd77 1107 m,
3dab1dad 1108 NULL, ln, (bool)UTF))
3b0527fe 1109 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1110 goto got_it;
1111 }
1aa99e6b
IH
1112 s += len;
1113 }
09091399 1114 }
1aa99e6b
IH
1115 }
1116 else {
1117 if (c1 == c2)
1118 while (s <= e) {
1119 if ( *(U8*)s == c1
1120 && (ln == 1 || !(OP(c) == EXACTF
1121 ? ibcmp(s, m, ln)
1122 : ibcmp_locale(s, m, ln)))
3b0527fe 1123 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1124 goto got_it;
1125 s++;
1126 }
1127 else
1128 while (s <= e) {
1129 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1130 && (ln == 1 || !(OP(c) == EXACTF
1131 ? ibcmp(s, m, ln)
1132 : ibcmp_locale(s, m, ln)))
3b0527fe 1133 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1134 goto got_it;
1135 s++;
1136 }
b3c9acc1
IZ
1137 }
1138 break;
bbce6d69 1139 case BOUNDL:
3280af22 1140 PL_reg_flags |= RF_tainted;
bbce6d69 1141 /* FALL THROUGH */
a0d0e21e 1142 case BOUND:
ffc61ed2 1143 if (do_utf8) {
12d33761 1144 if (s == PL_bostr)
ffc61ed2
JH
1145 tmp = '\n';
1146 else {
6136c704 1147 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1148 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1149 }
1150 tmp = ((OP(c) == BOUND ?
9041c2e3 1151 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1152 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1153 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1154 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1155 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1156 isALNUM_LC_utf8((U8*)s)))
1157 {
1158 tmp = !tmp;
3b0527fe 1159 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1160 goto got_it;
1161 }
078c425b 1162 s += uskip;
a687059c 1163 }
a0d0e21e 1164 }
667bb95a 1165 else {
12d33761 1166 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1167 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1168 while (s < strend) {
1169 if (tmp ==
1170 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1171 tmp = !tmp;
3b0527fe 1172 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1173 goto got_it;
1174 }
1175 s++;
a0ed51b3 1176 }
a0ed51b3 1177 }
3b0527fe 1178 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1179 goto got_it;
1180 break;
bbce6d69 1181 case NBOUNDL:
3280af22 1182 PL_reg_flags |= RF_tainted;
bbce6d69 1183 /* FALL THROUGH */
a0d0e21e 1184 case NBOUND:
ffc61ed2 1185 if (do_utf8) {
12d33761 1186 if (s == PL_bostr)
ffc61ed2
JH
1187 tmp = '\n';
1188 else {
6136c704 1189 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1190 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1191 }
1192 tmp = ((OP(c) == NBOUND ?
9041c2e3 1193 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1194 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1195 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1196 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1197 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1198 isALNUM_LC_utf8((U8*)s)))
1199 tmp = !tmp;
3b0527fe 1200 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1201 goto got_it;
078c425b 1202 s += uskip;
ffc61ed2 1203 }
a0d0e21e 1204 }
667bb95a 1205 else {
12d33761 1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1207 tmp = ((OP(c) == NBOUND ?
1208 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1209 while (s < strend) {
1210 if (tmp ==
1211 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1212 tmp = !tmp;
3b0527fe 1213 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1214 goto got_it;
1215 s++;
1216 }
a0ed51b3 1217 }
3b0527fe 1218 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1219 goto got_it;
1220 break;
a0d0e21e 1221 case ALNUM:
ffc61ed2 1222 if (do_utf8) {
1a4fad37 1223 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1224 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1225 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1226 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1227 goto got_it;
1228 else
1229 tmp = doevery;
1230 }
bbce6d69 1231 else
ffc61ed2 1232 tmp = 1;
078c425b 1233 s += uskip;
bbce6d69 1234 }
bbce6d69 1235 }
ffc61ed2
JH
1236 else {
1237 while (s < strend) {
1238 if (isALNUM(*s)) {
3b0527fe 1239 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1240 goto got_it;
1241 else
1242 tmp = doevery;
1243 }
a0ed51b3 1244 else
ffc61ed2
JH
1245 tmp = 1;
1246 s++;
a0ed51b3 1247 }
a0ed51b3
LW
1248 }
1249 break;
bbce6d69 1250 case ALNUML:
3280af22 1251 PL_reg_flags |= RF_tainted;
ffc61ed2 1252 if (do_utf8) {
078c425b 1253 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1254 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1255 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1256 goto got_it;
1257 else
1258 tmp = doevery;
1259 }
a687059c 1260 else
ffc61ed2 1261 tmp = 1;
078c425b 1262 s += uskip;
a0d0e21e 1263 }
a0d0e21e 1264 }
ffc61ed2
JH
1265 else {
1266 while (s < strend) {
1267 if (isALNUM_LC(*s)) {
3b0527fe 1268 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1269 goto got_it;
1270 else
1271 tmp = doevery;
1272 }
a0ed51b3 1273 else
ffc61ed2
JH
1274 tmp = 1;
1275 s++;
a0ed51b3 1276 }
a0ed51b3
LW
1277 }
1278 break;
a0d0e21e 1279 case NALNUM:
ffc61ed2 1280 if (do_utf8) {
1a4fad37 1281 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1282 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1283 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1284 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
bbce6d69 1289 else
ffc61ed2 1290 tmp = 1;
078c425b 1291 s += uskip;
bbce6d69 1292 }
bbce6d69 1293 }
ffc61ed2
JH
1294 else {
1295 while (s < strend) {
1296 if (!isALNUM(*s)) {
3b0527fe 1297 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1298 goto got_it;
1299 else
1300 tmp = doevery;
1301 }
a0ed51b3 1302 else
ffc61ed2
JH
1303 tmp = 1;
1304 s++;
a0ed51b3 1305 }
a0ed51b3
LW
1306 }
1307 break;
bbce6d69 1308 case NALNUML:
3280af22 1309 PL_reg_flags |= RF_tainted;
ffc61ed2 1310 if (do_utf8) {
078c425b 1311 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1312 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1313 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
a687059c 1318 else
ffc61ed2 1319 tmp = 1;
078c425b 1320 s += uskip;
a687059c 1321 }
a0d0e21e 1322 }
ffc61ed2
JH
1323 else {
1324 while (s < strend) {
1325 if (!isALNUM_LC(*s)) {
3b0527fe 1326 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1327 goto got_it;
1328 else
1329 tmp = doevery;
1330 }
a0ed51b3 1331 else
ffc61ed2
JH
1332 tmp = 1;
1333 s++;
a0ed51b3 1334 }
a0ed51b3
LW
1335 }
1336 break;
a0d0e21e 1337 case SPACE:
ffc61ed2 1338 if (do_utf8) {
1a4fad37 1339 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1340 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1341 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1342 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a0d0e21e 1347 else
ffc61ed2 1348 tmp = 1;
078c425b 1349 s += uskip;
2304df62 1350 }
a0d0e21e 1351 }
ffc61ed2
JH
1352 else {
1353 while (s < strend) {
1354 if (isSPACE(*s)) {
3b0527fe 1355 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1356 goto got_it;
1357 else
1358 tmp = doevery;
1359 }
a0ed51b3 1360 else
ffc61ed2
JH
1361 tmp = 1;
1362 s++;
a0ed51b3 1363 }
a0ed51b3
LW
1364 }
1365 break;
bbce6d69 1366 case SPACEL:
3280af22 1367 PL_reg_flags |= RF_tainted;
ffc61ed2 1368 if (do_utf8) {
078c425b 1369 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1370 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1371 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
bbce6d69 1376 else
ffc61ed2 1377 tmp = 1;
078c425b 1378 s += uskip;
bbce6d69 1379 }
bbce6d69 1380 }
ffc61ed2
JH
1381 else {
1382 while (s < strend) {
1383 if (isSPACE_LC(*s)) {
3b0527fe 1384 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1385 goto got_it;
1386 else
1387 tmp = doevery;
1388 }
a0ed51b3 1389 else
ffc61ed2
JH
1390 tmp = 1;
1391 s++;
a0ed51b3 1392 }
a0ed51b3
LW
1393 }
1394 break;
a0d0e21e 1395 case NSPACE:
ffc61ed2 1396 if (do_utf8) {
1a4fad37 1397 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1398 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1399 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1400 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
a0d0e21e 1405 else
ffc61ed2 1406 tmp = 1;
078c425b 1407 s += uskip;
a687059c 1408 }
a0d0e21e 1409 }
ffc61ed2
JH
1410 else {
1411 while (s < strend) {
1412 if (!isSPACE(*s)) {
3b0527fe 1413 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1414 goto got_it;
1415 else
1416 tmp = doevery;
1417 }
a0ed51b3 1418 else
ffc61ed2
JH
1419 tmp = 1;
1420 s++;
a0ed51b3 1421 }
a0ed51b3
LW
1422 }
1423 break;
bbce6d69 1424 case NSPACEL:
3280af22 1425 PL_reg_flags |= RF_tainted;
ffc61ed2 1426 if (do_utf8) {
078c425b 1427 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1428 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1429 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
bbce6d69 1434 else
ffc61ed2 1435 tmp = 1;
078c425b 1436 s += uskip;
bbce6d69 1437 }
bbce6d69 1438 }
ffc61ed2
JH
1439 else {
1440 while (s < strend) {
1441 if (!isSPACE_LC(*s)) {
3b0527fe 1442 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1443 goto got_it;
1444 else
1445 tmp = doevery;
1446 }
a0ed51b3 1447 else
ffc61ed2
JH
1448 tmp = 1;
1449 s++;
a0ed51b3 1450 }
a0ed51b3
LW
1451 }
1452 break;
a0d0e21e 1453 case DIGIT:
ffc61ed2 1454 if (do_utf8) {
1a4fad37 1455 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1456 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1457 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1458 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
a0d0e21e 1463 else
ffc61ed2 1464 tmp = 1;
078c425b 1465 s += uskip;
2b69d0c2 1466 }
a0d0e21e 1467 }
ffc61ed2
JH
1468 else {
1469 while (s < strend) {
1470 if (isDIGIT(*s)) {
3b0527fe 1471 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1472 goto got_it;
1473 else
1474 tmp = doevery;
1475 }
a0ed51b3 1476 else
ffc61ed2
JH
1477 tmp = 1;
1478 s++;
a0ed51b3 1479 }
a0ed51b3
LW
1480 }
1481 break;
b8c5462f
JH
1482 case DIGITL:
1483 PL_reg_flags |= RF_tainted;
ffc61ed2 1484 if (do_utf8) {
078c425b 1485 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1486 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1487 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
b8c5462f 1492 else
ffc61ed2 1493 tmp = 1;
078c425b 1494 s += uskip;
b8c5462f 1495 }
b8c5462f 1496 }
ffc61ed2
JH
1497 else {
1498 while (s < strend) {
1499 if (isDIGIT_LC(*s)) {
3b0527fe 1500 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1501 goto got_it;
1502 else
1503 tmp = doevery;
1504 }
b8c5462f 1505 else
ffc61ed2
JH
1506 tmp = 1;
1507 s++;
b8c5462f 1508 }
b8c5462f
JH
1509 }
1510 break;
a0d0e21e 1511 case NDIGIT:
ffc61ed2 1512 if (do_utf8) {
1a4fad37 1513 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1514 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1515 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1516 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
a0d0e21e 1521 else
ffc61ed2 1522 tmp = 1;
078c425b 1523 s += uskip;
a687059c 1524 }
a0d0e21e 1525 }
ffc61ed2
JH
1526 else {
1527 while (s < strend) {
1528 if (!isDIGIT(*s)) {
3b0527fe 1529 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1530 goto got_it;
1531 else
1532 tmp = doevery;
1533 }
a0ed51b3 1534 else
ffc61ed2
JH
1535 tmp = 1;
1536 s++;
a0ed51b3 1537 }
a0ed51b3
LW
1538 }
1539 break;
b8c5462f
JH
1540 case NDIGITL:
1541 PL_reg_flags |= RF_tainted;
ffc61ed2 1542 if (do_utf8) {
078c425b 1543 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1544 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1545 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
b8c5462f 1550 else
ffc61ed2 1551 tmp = 1;
078c425b 1552 s += uskip;
b8c5462f 1553 }
a0ed51b3 1554 }
ffc61ed2
JH
1555 else {
1556 while (s < strend) {
1557 if (!isDIGIT_LC(*s)) {
3b0527fe 1558 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1559 goto got_it;
1560 else
1561 tmp = doevery;
1562 }
cf93c79d 1563 else
ffc61ed2
JH
1564 tmp = 1;
1565 s++;
b8c5462f 1566 }
b8c5462f
JH
1567 }
1568 break;
07be1b83
YO
1569 case TRIE:
1570 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1571 {
1572 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1573 trie_type = do_utf8 ?
1574 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1575 : trie_plain;
1576 /* what trie are we using right now */
1577 reg_ac_data *aho
1578 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1579 reg_trie_data *trie=aho->trie;
1580
1581 const char *last_start = strend - trie->minlen;
1582 const char *real_start = s;
1583 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1584 SV *sv_points;
1585 U8 **points; /* map of where we were in the input string
1586 when reading a given string. For ASCII this
1587 is unnecessary overhead as the relationship
1588 is always 1:1, but for unicode, especially
1589 case folded unicode this is not true. */
07be1b83
YO
1590
1591 GET_RE_DEBUG_FLAGS_DECL;
1592
be8e71aa
YO
1593 /* We can't just allocate points here. We need to wrap it in
1594 * an SV so it gets freed properly if there is a croak while
1595 * running the match */
1596 ENTER;
1597 SAVETMPS;
1598 sv_points=newSV(maxlen * sizeof(U8 *));
1599 SvCUR_set(sv_points,
1600 maxlen * sizeof(U8 *));
1601 SvPOK_on(sv_points);
1602 sv_2mortal(sv_points);
1603 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1604
1605 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1606 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1607 s++;
1608 }
1609 }
1610
1611 while (s <= last_start) {
1612 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1613 U8 *uc = (U8*)s;
1614 U16 charid = 0;
1615 U32 base = 1;
1616 U32 state = 1;
1617 UV uvc = 0;
1618 STRLEN len = 0;
1619 STRLEN foldlen = 0;
1620 U8 *uscan = (U8*)NULL;
1621 U8 *leftmost = NULL;
1622
1623 U32 pointpos = 0;
1624
1625 while ( state && uc <= (U8*)strend ) {
1626 int failed=0;
1627 if (aho->states[ state ].wordnum) {
1628 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1629 if (!leftmost || lpos < leftmost)
1630 leftmost= lpos;
1631 if (base==0) break;
1632 }
1633 points[pointpos++ % maxlen]= uc;
1634 switch (trie_type) {
1635 case trie_utf8_fold:
1636 if ( foldlen>0 ) {
1637 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1638 foldlen -= len;
1639 uscan += len;
1640 len=0;
1641 } else {
1642 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1643 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1644 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1645 foldlen -= UNISKIP( uvc );
1646 uscan = foldbuf + UNISKIP( uvc );
1647 }
1648 break;
1649 case trie_utf8:
1650 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1651 &len, uniflags );
1652 break;
1653 case trie_plain:
1654 uvc = (UV)*uc;
1655 len = 1;
1656 }
1657
1658 if (uvc < 256) {
1659 charid = trie->charmap[ uvc ];
1660 }
1661 else {
1662 charid = 0;
1663 if (trie->widecharmap) {
1664 SV** const svpp = hv_fetch(trie->widecharmap,
1665 (char*)&uvc, sizeof(UV), 0);
1666 if (svpp)
1667 charid = (U16)SvIV(*svpp);
1668 }
1669 }
1670 DEBUG_TRIE_EXECUTE_r(
1671 PerlIO_printf(Perl_debug_log,
1672 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1673 (int)((const char*)uc - real_start), charid, uvc)
1674 );
1675 uc += len;
1676
1677 do {
1678 U32 word = aho->states[ state ].wordnum;
1679 base = aho->states[ state ].trans.base;
1680
1681 DEBUG_TRIE_EXECUTE_r(
1682 PerlIO_printf( Perl_debug_log,
1683 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1684 failed ? "Fail transition to " : "",
1685 state, base, uvc, word)
1686 );
1687 if ( base ) {
1688 U32 tmp;
1689 if (charid &&
1690 (base + charid > trie->uniquecharcount )
1691 && (base + charid - 1 - trie->uniquecharcount
1692 < trie->lasttrans)
1693 && trie->trans[base + charid - 1 -
1694 trie->uniquecharcount].check == state
1695 && (tmp=trie->trans[base + charid - 1 -
1696 trie->uniquecharcount ].next))
1697 {
1698 state = tmp;
1699 break;
1700 }
1701 else {
1702 failed++;
1703 if ( state == 1 )
1704 break;
1705 else
1706 state = aho->fail[state];
1707 }
1708 }
1709 else {
1710 /* we must be accepting here */
1711 failed++;
1712 break;
1713 }
1714 } while(state);
1715 if (failed) {
1716 if (leftmost)
1717 break;
1718 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1719 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1720 uc++;
1721 }
1722 }
1723 }
1724 }
1725 if ( aho->states[ state ].wordnum ) {
1726 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1727 if (!leftmost || lpos < leftmost)
1728 leftmost = lpos;
1729 }
1730 DEBUG_TRIE_EXECUTE_r(
1731 PerlIO_printf( Perl_debug_log,
1732 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1733 "All done: ",
1734 state, base, uvc)
1735 );
1736 if (leftmost) {
1737 s = (char*)leftmost;
be8e71aa
YO
1738 if (!reginfo || regtry(reginfo, s)) {
1739 FREETMPS;
1740 LEAVE;
07be1b83 1741 goto got_it;
be8e71aa 1742 }
07be1b83
YO
1743 s = HOPc(s,1);
1744 } else {
1745 break;
1746 }
1747 }
be8e71aa
YO
1748 FREETMPS;
1749 LEAVE;
07be1b83
YO
1750 }
1751 break;
b3c9acc1 1752 default:
3c3eec57
GS
1753 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1754 break;
d6a28714 1755 }
6eb5f6b9
JH
1756 return 0;
1757 got_it:
1758 return s;
1759}
1760
1761/*
1762 - regexec_flags - match a regexp against a string
1763 */
1764I32
1765Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1766 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1767/* strend: pointer to null at end of string */
1768/* strbeg: real beginning of string */
1769/* minend: end of match must be >=minend after stringarg. */
1770/* data: May be used for some additional optimizations. */
1771/* nosave: For optimizations. */
1772{
97aff369 1773 dVAR;
6eb5f6b9
JH
1774 register char *s;
1775 register regnode *c;
1776 register char *startpos = stringarg;
6eb5f6b9
JH
1777 I32 minlen; /* must match at least this many chars */
1778 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1779 I32 end_shift = 0; /* Same for the end. */ /* CC */
1780 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1781 char *scream_olds = NULL;
3dab1dad 1782 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1783 const bool do_utf8 = DO_UTF8(sv);
2757e526 1784 I32 multiline;
2a782b5b 1785#ifdef DEBUGGING
2757e526
JH
1786 SV* dsv0;
1787 SV* dsv1;
2a782b5b 1788#endif
3b0527fe 1789 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1790
1791 GET_RE_DEBUG_FLAGS_DECL;
1792
9d4ba2ae 1793 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1794
1795 /* Be paranoid... */
1796 if (prog == NULL || startpos == NULL) {
1797 Perl_croak(aTHX_ "NULL regexp parameter");
1798 return 0;
1799 }
1800
2757e526 1801 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1802 reginfo.prog = prog;
2757e526
JH
1803
1804#ifdef DEBUGGING
1805 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1806 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1807#endif
1808
bac06658
JH
1809 RX_MATCH_UTF8_set(prog, do_utf8);
1810
6eb5f6b9 1811 minlen = prog->minlen;
61a36c01 1812 if (strend - startpos < minlen) {
a3621e74 1813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1814 "String too short [regexec_flags]...\n"));
1815 goto phooey;
1aa99e6b 1816 }
6eb5f6b9 1817
6eb5f6b9
JH
1818 /* Check validity of program. */
1819 if (UCHARAT(prog->program) != REG_MAGIC) {
1820 Perl_croak(aTHX_ "corrupted regexp program");
1821 }
1822
1823 PL_reg_flags = 0;
1824 PL_reg_eval_set = 0;
1825 PL_reg_maxiter = 0;
1826
1827 if (prog->reganch & ROPT_UTF8)
1828 PL_reg_flags |= RF_utf8;
1829
1830 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1831 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1832 PL_bostr = strbeg;
3b0527fe 1833 reginfo.sv = sv;
6eb5f6b9
JH
1834
1835 /* Mark end of line for $ (and such) */
1836 PL_regeol = strend;
1837
1838 /* see how far we have to get to not match where we matched before */
3b0527fe 1839 reginfo.till = startpos+minend;
6eb5f6b9 1840
6eb5f6b9
JH
1841 /* If there is a "must appear" string, look for it. */
1842 s = startpos;
1843
3b0527fe 1844 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1845 MAGIC *mg;
1846
1847 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1848 reginfo.ganch = startpos;
6eb5f6b9
JH
1849 else if (sv && SvTYPE(sv) >= SVt_PVMG
1850 && SvMAGIC(sv)
14befaf4
DM
1851 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1852 && mg->mg_len >= 0) {
3b0527fe 1853 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1854 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1855 if (s > reginfo.ganch)
6eb5f6b9 1856 goto phooey;
3b0527fe 1857 s = reginfo.ganch;
6eb5f6b9
JH
1858 }
1859 }
1860 else /* pos() not defined */
3b0527fe 1861 reginfo.ganch = strbeg;
6eb5f6b9
JH
1862 }
1863
a0714e2c 1864 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1865 re_scream_pos_data d;
1866
1867 d.scream_olds = &scream_olds;
1868 d.scream_pos = &scream_pos;
1869 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1870 if (!s) {
a3621e74 1871 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1872 goto phooey; /* not present */
3fa9c3d7 1873 }
6eb5f6b9
JH
1874 }
1875
a3621e74 1876 DEBUG_EXECUTE_r({
1df70142
AL
1877 const char * const s0 = UTF
1878 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1879 UNI_DISPLAY_REGEX)
1880 : prog->precomp;
bb7a0f54 1881 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1882 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1883 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1884 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1885 if (!PL_colorset)
1886 reginitcolors();
1887 PerlIO_printf(Perl_debug_log,
a0288114 1888 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1889 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1890 len0, len0, s0,
2a782b5b 1891 PL_colors[1],
9e55ce06 1892 len0 > 60 ? "..." : "",
2a782b5b 1893 PL_colors[0],
9e55ce06
JH
1894 (int)(len1 > 60 ? 60 : len1),
1895 s1, PL_colors[1],
1896 (len1 > 60 ? "..." : "")
2a782b5b
JH
1897 );
1898 });
6eb5f6b9
JH
1899
1900 /* Simplest case: anchored match need be tried only once. */
1901 /* [unless only anchor is BOL and multiline is set] */
1902 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1903 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1904 goto got_it;
7fba1cd6 1905 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1906 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1907 {
1908 char *end;
1909
1910 if (minlen)
1911 dontbother = minlen - 1;
1aa99e6b 1912 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1913 /* for multiline we only have to try after newlines */
33b8afdf 1914 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1915 if (s == startpos)
1916 goto after_try;
1917 while (1) {
3b0527fe 1918 if (regtry(&reginfo, s))
6eb5f6b9
JH
1919 goto got_it;
1920 after_try:
1921 if (s >= end)
1922 goto phooey;
1923 if (prog->reganch & RE_USE_INTUIT) {
1924 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1925 if (!s)
1926 goto phooey;
1927 }
1928 else
1929 s++;
1930 }
1931 } else {
1932 if (s > startpos)
1933 s--;
1934 while (s < end) {
1935 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1936 if (regtry(&reginfo, s))
6eb5f6b9
JH
1937 goto got_it;
1938 }
1939 }
1940 }
1941 }
1942 goto phooey;
1943 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1944 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1945 goto got_it;
1946 goto phooey;
1947 }
1948
1949 /* Messy cases: unanchored match. */
33b8afdf 1950 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1951 /* we have /x+whatever/ */
1952 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1953 char ch;
bf93d4cc
GS
1954#ifdef DEBUGGING
1955 int did_match = 0;
1956#endif
33b8afdf
JH
1957 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1958 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1959 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1960
1aa99e6b 1961 if (do_utf8) {
6eb5f6b9
JH
1962 while (s < strend) {
1963 if (*s == ch) {
a3621e74 1964 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1965 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1966 s += UTF8SKIP(s);
1967 while (s < strend && *s == ch)
1968 s += UTF8SKIP(s);
1969 }
1970 s += UTF8SKIP(s);
1971 }
1972 }
1973 else {
1974 while (s < strend) {
1975 if (*s == ch) {
a3621e74 1976 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1977 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1978 s++;
1979 while (s < strend && *s == ch)
1980 s++;
1981 }
1982 s++;
1983 }
1984 }
a3621e74 1985 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1986 PerlIO_printf(Perl_debug_log,
b7953727
JH
1987 "Did not find anchored character...\n")
1988 );
6eb5f6b9 1989 }
a0714e2c
SS
1990 else if (prog->anchored_substr != NULL
1991 || prog->anchored_utf8 != NULL
1992 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1993 && prog->float_max_offset < strend - s)) {
1994 SV *must;
1995 I32 back_max;
1996 I32 back_min;
1997 char *last;
6eb5f6b9 1998 char *last1; /* Last position checked before */
bf93d4cc
GS
1999#ifdef DEBUGGING
2000 int did_match = 0;
2001#endif
33b8afdf
JH
2002 if (prog->anchored_substr || prog->anchored_utf8) {
2003 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2004 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2005 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2006 back_max = back_min = prog->anchored_offset;
2007 } else {
2008 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2009 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2010 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2011 back_max = prog->float_max_offset;
2012 back_min = prog->float_min_offset;
2013 }
2014 if (must == &PL_sv_undef)
2015 /* could not downgrade utf8 check substring, so must fail */
2016 goto phooey;
2017
2018 last = HOP3c(strend, /* Cannot start after this */
2019 -(I32)(CHR_SVLEN(must)
2020 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2021
2022 if (s > PL_bostr)
2023 last1 = HOPc(s, -1);
2024 else
2025 last1 = s - 1; /* bogus */
2026
a0288114 2027 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2028 check_substr==must. */
2029 scream_pos = -1;
2030 dontbother = end_shift;
2031 strend = HOPc(strend, -dontbother);
2032 while ( (s <= last) &&
9041c2e3 2033 ((flags & REXEC_SCREAM)
1aa99e6b 2034 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2035 end_shift, &scream_pos, 0))
1aa99e6b 2036 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2037 (unsigned char*)strend, must,
7fba1cd6 2038 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2039 /* we may be pointing at the wrong string */
2040 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2041 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2042 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2043 if (HOPc(s, -back_max) > last1) {
2044 last1 = HOPc(s, -back_min);
2045 s = HOPc(s, -back_max);
2046 }
2047 else {
52657f30 2048 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2049
2050 last1 = HOPc(s, -back_min);
52657f30 2051 s = t;
6eb5f6b9 2052 }
1aa99e6b 2053 if (do_utf8) {
6eb5f6b9 2054 while (s <= last1) {
3b0527fe 2055 if (regtry(&reginfo, s))
6eb5f6b9
JH
2056 goto got_it;
2057 s += UTF8SKIP(s);
2058 }
2059 }
2060 else {
2061 while (s <= last1) {
3b0527fe 2062 if (regtry(&reginfo, s))
6eb5f6b9
JH
2063 goto got_it;
2064 s++;
2065 }
2066 }
2067 }
a3621e74 2068 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2069 PerlIO_printf(Perl_debug_log,
a0288114 2070 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2071 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2072 ? "anchored" : "floating"),
2073 PL_colors[0],
2074 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2075 SvPVX_const(must),
b7953727
JH
2076 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2077 );
6eb5f6b9
JH
2078 goto phooey;
2079 }
155aba94 2080 else if ((c = prog->regstclass)) {
f14c76ed 2081 if (minlen) {
be8e71aa 2082 const OPCODE op = OP(prog->regstclass);
66e933ab 2083 /* don't bother with what can't match */
07be1b83 2084 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2085 strend = HOPc(strend, -(minlen - 1));
2086 }
a3621e74 2087 DEBUG_EXECUTE_r({
be8e71aa 2088 SV * const prop = sv_newmortal();
cfd0369c
NC
2089 const char *s0;
2090 const char *s1;
9e55ce06
JH
2091 int len0;
2092 int len1;
2093
32fc9b6a 2094 regprop(prog, prop, c);
9e55ce06 2095 s0 = UTF ?
3f7c398e 2096 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2097 UNI_DISPLAY_REGEX) :
cfd0369c 2098 SvPVX_const(prop);
9e55ce06
JH
2099 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2100 s1 = UTF ?
c728cb41 2101 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2102 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2103 PerlIO_printf(Perl_debug_log,
07be1b83 2104 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2105 len0, len0, s0,
07be1b83 2106 len1, len1, s1, (int)(strend - s));
ffc61ed2 2107 });
3b0527fe 2108 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2109 goto got_it;
07be1b83 2110 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2111 }
2112 else {
2113 dontbother = 0;
a0714e2c 2114 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2115 /* Trim the end. */
d6a28714 2116 char *last;
33b8afdf
JH
2117 SV* float_real;
2118
2119 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2120 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2121 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2122
2123 if (flags & REXEC_SCREAM) {
33b8afdf 2124 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2125 end_shift, &scream_pos, 1); /* last one */
2126 if (!last)
ffc61ed2 2127 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2128 /* we may be pointing at the wrong string */
2129 else if (RX_MATCH_COPIED(prog))
3f7c398e 2130 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2131 }
d6a28714
JH
2132 else {
2133 STRLEN len;
cfd0369c 2134 const char * const little = SvPV_const(float_real, len);
d6a28714 2135
33b8afdf 2136 if (SvTAIL(float_real)) {
d6a28714
JH
2137 if (memEQ(strend - len + 1, little, len - 1))
2138 last = strend - len + 1;
7fba1cd6 2139 else if (!multiline)
9041c2e3 2140 last = memEQ(strend - len, little, len)
bd61b366 2141 ? strend - len : NULL;
b8c5462f 2142 else
d6a28714
JH
2143 goto find_last;
2144 } else {
2145 find_last:
9041c2e3 2146 if (len)
d6a28714 2147 last = rninstr(s, strend, little, little + len);
b8c5462f 2148 else
a0288114 2149 last = strend; /* matching "$" */
b8c5462f 2150 }
b8c5462f 2151 }
bf93d4cc 2152 if (last == NULL) {
a3621e74 2153 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2154 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2155 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2156 goto phooey; /* Should not happen! */
2157 }
d6a28714
JH
2158 dontbother = strend - last + prog->float_min_offset;
2159 }
2160 if (minlen && (dontbother < minlen))
2161 dontbother = minlen - 1;
2162 strend -= dontbother; /* this one's always in bytes! */
2163 /* We don't know much -- general case. */
1aa99e6b 2164 if (do_utf8) {
d6a28714 2165 for (;;) {
3b0527fe 2166 if (regtry(&reginfo, s))
d6a28714
JH
2167 goto got_it;
2168 if (s >= strend)
2169 break;
b8c5462f 2170 s += UTF8SKIP(s);
d6a28714
JH
2171 };
2172 }
2173 else {
2174 do {
3b0527fe 2175 if (regtry(&reginfo, s))
d6a28714
JH
2176 goto got_it;
2177 } while (s++ < strend);
2178 }
2179 }
2180
2181 /* Failure. */
2182 goto phooey;
2183
2184got_it:
2185 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2186
2187 if (PL_reg_eval_set) {
2188 /* Preserve the current value of $^R */
2189 if (oreplsv != GvSV(PL_replgv))
2190 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2191 restored, the value remains
2192 the same. */
4f639d21 2193 restore_pos(aTHX_ prog);
d6a28714
JH
2194 }
2195
2196 /* make sure $`, $&, $', and $digit will work later */
2197 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2198 RX_MATCH_COPY_FREE(prog);
d6a28714 2199 if (flags & REXEC_COPY_STR) {
be8e71aa 2200 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2201#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2202 if ((SvIsCOW(sv)
2203 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2204 if (DEBUG_C_TEST) {
2205 PerlIO_printf(Perl_debug_log,
2206 "Copy on write: regexp capture, type %d\n",
2207 (int) SvTYPE(sv));
2208 }
2209 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2210 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2211 assert (SvPOKp(prog->saved_copy));
2212 } else
2213#endif
2214 {
2215 RX_MATCH_COPIED_on(prog);
2216 s = savepvn(strbeg, i);
2217 prog->subbeg = s;
2218 }
d6a28714 2219 prog->sublen = i;
d6a28714
JH
2220 }
2221 else {
2222 prog->subbeg = strbeg;
2223 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2224 }
2225 }
9041c2e3 2226
d6a28714
JH
2227 return 1;
2228
2229phooey:
a3621e74 2230 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2231 PL_colors[4], PL_colors[5]));
d6a28714 2232 if (PL_reg_eval_set)
4f639d21 2233 restore_pos(aTHX_ prog);
d6a28714
JH
2234 return 0;
2235}
2236
2237/*
2238 - regtry - try match at specific point
2239 */
2240STATIC I32 /* 0 failure, 1 success */
3b0527fe 2241S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2242{
97aff369 2243 dVAR;
d6a28714
JH
2244 register I32 *sp;
2245 register I32 *ep;
2246 CHECKPOINT lastcp;
3b0527fe 2247 regexp *prog = reginfo->prog;
a3621e74 2248 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2249
02db2b7b
IZ
2250#ifdef DEBUGGING
2251 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2252#endif
d6a28714
JH
2253 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2254 MAGIC *mg;
2255
2256 PL_reg_eval_set = RS_init;
a3621e74 2257 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2258 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2259 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2260 ));
e8347627 2261 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2262 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2263 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2264 SAVETMPS;
2265 /* Apparently this is not needed, judging by wantarray. */
e8347627 2266 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2267 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2268
3b0527fe 2269 if (reginfo->sv) {
d6a28714 2270 /* Make $_ available to executed code. */
3b0527fe 2271 if (reginfo->sv != DEFSV) {
59f00321 2272 SAVE_DEFSV;
3b0527fe 2273 DEFSV = reginfo->sv;
b8c5462f 2274 }
d6a28714 2275
3b0527fe
DM
2276 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2277 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2278 /* prepare for quick setting of pos */
d300d9fa
NC
2279#ifdef PERL_OLD_COPY_ON_WRITE
2280 if (SvIsCOW(sv))
2281 sv_force_normal_flags(sv, 0);
2282#endif
3dab1dad 2283 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2284 &PL_vtbl_mglob, NULL, 0);
d6a28714 2285 mg->mg_len = -1;
b8c5462f 2286 }
d6a28714
JH
2287 PL_reg_magic = mg;
2288 PL_reg_oldpos = mg->mg_len;
4f639d21 2289 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2290 }
09687e5a 2291 if (!PL_reg_curpm) {
a02a5408 2292 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2293#ifdef USE_ITHREADS
2294 {
be8e71aa 2295 SV* const repointer = newSViv(0);
577e12cc 2296 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2297 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2298 av_push(PL_regex_padav,repointer);
2299 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2300 PL_regex_pad = AvARRAY(PL_regex_padav);
2301 }
2302#endif
2303 }
aaa362c4 2304 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2305 PL_reg_oldcurpm = PL_curpm;
2306 PL_curpm = PL_reg_curpm;
2307 if (RX_MATCH_COPIED(prog)) {
2308 /* Here is a serious problem: we cannot rewrite subbeg,
2309 since it may be needed if this match fails. Thus
2310 $` inside (?{}) could fail... */
2311 PL_reg_oldsaved = prog->subbeg;
2312 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2313#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2314 PL_nrs = prog->saved_copy;
2315#endif
d6a28714
JH
2316 RX_MATCH_COPIED_off(prog);
2317 }
2318 else
bd61b366 2319 PL_reg_oldsaved = NULL;
d6a28714
JH
2320 prog->subbeg = PL_bostr;
2321 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2322 }
973dddac 2323 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2324 PL_reginput = startpos;
2325 PL_regstartp = prog->startp;
2326 PL_regendp = prog->endp;
2327 PL_reglastparen = &prog->lastparen;
a01268b5 2328 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2329 prog->lastparen = 0;
03994de8 2330 prog->lastcloseparen = 0;
d6a28714 2331 PL_regsize = 0;
a3621e74 2332 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2333 if (PL_reg_start_tmpl <= prog->nparens) {
2334 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2335 if(PL_reg_start_tmp)
2336 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2337 else
a02a5408 2338 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2339 }
2340
2341 /* XXXX What this code is doing here?!!! There should be no need
2342 to do this again and again, PL_reglastparen should take care of
3dd2943c 2343 this! --ilya*/
dafc8851
JH
2344
2345 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2346 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2347 * PL_reglastparen), is not needed at all by the test suite
2348 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2349 * enough, for building DynaLoader, or otherwise this
2350 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2351 * will happen. Meanwhile, this code *is* needed for the
2352 * above-mentioned test suite tests to succeed. The common theme
2353 * on those tests seems to be returning null fields from matches.
2354 * --jhi */
dafc8851 2355#if 1
d6a28714
JH
2356 sp = prog->startp;
2357 ep = prog->endp;
2358 if (prog->nparens) {
097eb12c 2359 register I32 i;
eb160463 2360 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2361 *++sp = -1;
2362 *++ep = -1;
2363 }
2364 }
dafc8851 2365#endif
02db2b7b 2366 REGCP_SET(lastcp);
3b0527fe 2367 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2368 prog->endp[0] = PL_reginput - PL_bostr;
2369 return 1;
2370 }
02db2b7b 2371 REGCP_UNWIND(lastcp);
d6a28714
JH
2372 return 0;
2373}
2374
02db2b7b
IZ
2375#define RE_UNWIND_BRANCH 1
2376#define RE_UNWIND_BRANCHJ 2
2377
2378union re_unwind_t;
2379
2380typedef struct { /* XX: makes sense to enlarge it... */
2381 I32 type;
2382 I32 prev;
2383 CHECKPOINT lastcp;
2384} re_unwind_generic_t;
2385
2386typedef struct {
2387 I32 type;
2388 I32 prev;
2389 CHECKPOINT lastcp;
2390 I32 lastparen;
2391 regnode *next;
2392 char *locinput;
2393 I32 nextchr;
3a2830be 2394 int minmod;
02db2b7b
IZ
2395#ifdef DEBUGGING
2396 int regindent;
2397#endif
2398} re_unwind_branch_t;
2399
2400typedef union re_unwind_t {
2401 I32 type;
2402 re_unwind_generic_t generic;
2403 re_unwind_branch_t branch;
2404} re_unwind_t;
2405
8ba1375e
MJD
2406#define sayYES goto yes
2407#define sayNO goto no
e0f9d4a8 2408#define sayNO_ANYOF goto no_anyof
8ba1375e 2409#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2410#define sayNO_FINAL goto no_final
2411#define sayNO_SILENT goto do_no
2412#define saySAME(x) if (x) goto yes; else goto no
2413
3ab3c9b4
HS
2414#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2415#define POSCACHE_SEEN 1 /* we know what we're caching */
2416#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2417
3ab3c9b4 2418#define CACHEsayYES STMT_START { \
d8319b27 2419 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2420 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2421 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2422 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2423 } \
2424 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2425 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2426 } \
2427 else { \
3ab3c9b4
HS
2428 /* cache records failure, but this is success */ \
2429 DEBUG_r( \
2430 PerlIO_printf(Perl_debug_log, \
2431 "%*s (remove success from failure cache)\n", \
2432 REPORT_CODE_OFF+PL_regindent*2, "") \
2433 ); \
d8319b27 2434 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2435 } \
2436 } \
2437 sayYES; \
2438} STMT_END
7409bbd3 2439
3ab3c9b4 2440#define CACHEsayNO STMT_START { \
d8319b27 2441 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2442 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2443 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2444 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2445 } \
2446 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2447 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2448 } \
2449 else { \
3ab3c9b4
HS
2450 /* cache records success, but this is failure */ \
2451 DEBUG_r( \
2452 PerlIO_printf(Perl_debug_log, \
2453 "%*s (remove failure from success cache)\n", \
2454 REPORT_CODE_OFF+PL_regindent*2, "") \
2455 ); \
d8319b27 2456 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2457 } \
2458 } \
2459 sayNO; \
2460} STMT_END
2461
a3621e74
YO
2462/* this is used to determine how far from the left messages like
2463 'failed...' are printed. Currently 29 makes these messages line
2464 up with the opcode they refer to. Earlier perls used 25 which
2465 left these messages outdented making reviewing a debug output
2466 quite difficult.
2467*/
2468#define REPORT_CODE_OFF 29
2469
2470
2471/* Make sure there is a test for this +1 options in re_tests */
2472#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2473
9e137952
DM
2474/* this value indiciates that the c1/c2 "next char" test should be skipped */
2475#define CHRTEST_VOID -1000
2476
86545054
DM
2477#define SLAB_FIRST(s) (&(s)->states[0])
2478#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2479
5d9a96ca
DM
2480/* grab a new slab and return the first slot in it */
2481
2482STATIC regmatch_state *
2483S_push_slab(pTHX)
2484{
54df2634
NC
2485#if PERL_VERSION < 9
2486 dMY_CXT;
2487#endif
5d9a96ca
DM
2488 regmatch_slab *s = PL_regmatch_slab->next;
2489 if (!s) {
2490 Newx(s, 1, regmatch_slab);
2491 s->prev = PL_regmatch_slab;
2492 s->next = NULL;
2493 PL_regmatch_slab->next = s;
2494 }
2495 PL_regmatch_slab = s;
86545054 2496 return SLAB_FIRST(s);
5d9a96ca 2497}
5b47454d 2498
95b24440
DM
2499/* simulate a recursive call to regmatch */
2500
2501#define REGMATCH(ns, where) \
5d9a96ca
DM
2502 st->scan = scan; \
2503 scan = (ns); \
2504 st->resume_state = resume_##where; \
95b24440
DM
2505 goto start_recurse; \
2506 resume_point_##where:
2507
aa283a38
DM
2508
2509/* push a new regex state. Set newst to point to it */
2510
2511#define PUSH_STATE(newst, resume) \
2512 depth++; \
2513 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2514 st->scan = scan; \
2515 st->next = next; \
2516 st->n = n; \
2517 st->locinput = locinput; \
2518 st->resume_state = resume; \
2519 newst = st+1; \
86545054 2520 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2521 newst = S_push_slab(aTHX); \
2522 PL_regmatch_state = newst; \
2523 newst->cc = 0; \
2524 newst->minmod = 0; \
2525 newst->sw = 0; \
2526 newst->logical = 0; \
2527 newst->unwind = 0; \
2528 locinput = PL_reginput; \
2529 nextchr = UCHARAT(locinput);
2530
2531#define POP_STATE \
2532 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2533 depth--; \
2534 st--; \
86545054 2535 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2536 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2537 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2538 } \
2539 PL_regmatch_state = st; \
2540 scan = st->scan; \
2541 next = st->next; \
2542 n = st->n; \
2543 locinput = st->locinput; \
2544 nextchr = UCHARAT(locinput);
2545
d6a28714
JH
2546/*
2547 - regmatch - main matching routine
2548 *
2549 * Conceptually the strategy is simple: check to see whether the current
2550 * node matches, call self recursively to see whether the rest matches,
2551 * and then act accordingly. In practice we make some effort to avoid
2552 * recursion, in particular by going through "ordinary" nodes (that don't
2553 * need to know whether the rest of the match failed) by a loop instead of
2554 * by recursion.
2555 */
2556/* [lwall] I've hoisted the register declarations to the outer block in order to
2557 * maybe save a little bit of pushing and popping on the stack. It also takes
2558 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2559 *
2560 * This function used to be heavily recursive, but since this had the
2561 * effect of blowing the CPU stack on complex regexes, it has been
2562 * restructured to be iterative, and to save state onto the heap rather
2563 * than the stack. Essentially whereever regmatch() used to be called, it
2564 * pushes the current state, notes where to return, then jumps back into
2565 * the main loop.
2566 *
2567 * Originally the structure of this function used to look something like
2568
2569 S_regmatch() {
2570 int a = 1, b = 2;
2571 ...
2572 while (scan != NULL) {
5d9a96ca 2573 a++; // do stuff with a and b
95b24440
DM
2574 ...
2575 switch (OP(scan)) {
2576 case FOO: {
2577 int local = 3;
2578 ...
2579 if (regmatch(...)) // recurse
2580 goto yes;
2581 }
2582 ...
2583 }
2584 }
2585 yes:
2586 return 1;
2587 }
2588
2589 * Now it looks something like this:
2590
5d9a96ca 2591 typedef struct {
95b24440
DM
2592 int a, b, local;
2593 int resume_state;
5d9a96ca 2594 } regmatch_state;
95b24440
DM
2595
2596 S_regmatch() {
5d9a96ca
DM
2597 regmatch_state *st = new();
2598 int depth=0;
2599 st->a++; // do stuff with a and b
95b24440
DM
2600 ...
2601 while (scan != NULL) {
2602 ...
2603 switch (OP(scan)) {
2604 case FOO: {
5d9a96ca 2605 st->local = 3;
95b24440 2606 ...
5d9a96ca
DM
2607 st->scan = scan;
2608 scan = ...;
2609 st->resume_state = resume_FOO;
2610 goto start_recurse; // recurse
95b24440 2611
5d9a96ca
DM
2612 resume_point_FOO:
2613 if (result)
95b24440
DM
2614 goto yes;
2615 }
2616 ...
2617 }
5d9a96ca
DM
2618 start_recurse:
2619 st = new(); push a new state
2620 st->a = 1; st->b = 2;
2621 depth++;
95b24440 2622 }
5d9a96ca 2623 yes:
95b24440 2624 result = 1;
5d9a96ca
DM
2625 if (depth--) {
2626 st = pop();
95b24440
DM
2627 switch (resume_state) {
2628 case resume_FOO:
2629 goto resume_point_FOO;
2630 ...
2631 }
2632 }
2633 return result
2634 }
2635
2636 * WARNING: this means that any line in this function that contains a
2637 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2638 * regmatch() using gotos instead. Thus the values of any local variables
2639 * not saved in the regmatch_state structure will have been lost when
2640 * execution resumes on the next line .
5d9a96ca
DM
2641 *
2642 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2643 * PL_regmatch_state always points to the currently active state, and
2644 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2645 * The first time regmatch is called, the first slab is allocated, and is
2646 * never freed until interpreter desctruction. When the slab is full,
2647 * a new one is allocated chained to the end. At exit from regmatch, slabs
2648 * allocated since entry are freed.
d6a28714 2649 */
95b24440 2650
3dab1dad 2651#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2652
07be1b83
YO
2653#ifdef DEBUGGING
2654STATIC void
2655S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2656{
2657 const int docolor = *PL_colors[0];
2658 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2659 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2660 /* The part of the string before starttry has one color
2661 (pref0_len chars), between starttry and current
2662 position another one (pref_len - pref0_len chars),
2663 after the current position the third one.
2664 We assume that pref0_len <= pref_len, otherwise we
2665 decrease pref0_len. */
2666 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2667 ? (5 + taill) - l : locinput - PL_bostr;
2668 int pref0_len;
2669
2670 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2671 pref_len++;
2672 pref0_len = pref_len - (locinput - PL_reg_starttry);
2673 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2674 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2675 ? (5 + taill) - pref_len : PL_regeol - locinput);
2676 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2677 l--;
2678 if (pref0_len < 0)
2679 pref0_len = 0;
2680 if (pref0_len > pref_len)
2681 pref0_len = pref_len;
2682 {
2683 const char * const s0 =
2684 do_utf8 && OP(scan) != CANY ?
2685 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2686 pref0_len, 60, UNI_DISPLAY_REGEX) :
2687 locinput - pref_len;
2688 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2689 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2690 pv_uni_display(PERL_DEBUG_PAD(1),
2691 (U8*)(locinput - pref_len + pref0_len),
2692 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2693 locinput - pref_len + pref0_len;
2694 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2695 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2696 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2697 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2698 locinput;
2699 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2700 PerlIO_printf(Perl_debug_log,
2701 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2702 (IV)(locinput - PL_bostr),
2703 PL_colors[4],
2704 len0, s0,
2705 PL_colors[5],
2706 PL_colors[2],
2707 len1, s1,
2708 PL_colors[3],
2709 (docolor ? "" : "> <"),
2710 PL_colors[0],
2711 len2, s2,
2712 PL_colors[1],
2713 15 - l - pref_len + 1,
2714 "");
2715 }
2716}
2717#endif
2718
d6a28714 2719STATIC I32 /* 0 failure, 1 success */
3b0527fe 2720S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2721{
54df2634
NC
2722#if PERL_VERSION < 9
2723 dMY_CXT;
2724#endif
27da23d5 2725 dVAR;
95b24440 2726 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2727 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2728
3b0527fe
DM
2729 regexp *rex = reginfo->prog;
2730
5d9a96ca
DM
2731 regmatch_slab *orig_slab;
2732 regmatch_state *orig_state;
a3621e74 2733
5d9a96ca
DM
2734 /* the current state. This is a cached copy of PL_regmatch_state */
2735 register regmatch_state *st;
95b24440 2736
5d9a96ca
DM
2737 /* cache heavy used fields of st in registers */
2738 register regnode *scan;
2739 register regnode *next;
2740 register I32 n = 0; /* initialize to shut up compiler warning */
2741 register char *locinput = PL_reginput;
95b24440 2742
5d9a96ca
DM
2743 /* these variables are NOT saved during a recusive RFEGMATCH: */
2744 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2745 bool result; /* return value of S_regmatch */
2746 regnode *inner; /* Next node in internal branch. */
2747 int depth = 0; /* depth of recursion */
aa283a38 2748 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2749 regmatch_state *yes_state = NULL; /* state to pop to on success of
2750 subpattern */
95b24440
DM
2751
2752#ifdef DEBUGGING
e68ec53f 2753 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2754 PL_regindent++;
2755#endif
2756
5d9a96ca
DM
2757 /* on first ever call to regmatch, allocate first slab */
2758 if (!PL_regmatch_slab) {
2759 Newx(PL_regmatch_slab, 1, regmatch_slab);
2760 PL_regmatch_slab->prev = NULL;
2761 PL_regmatch_slab->next = NULL;
86545054 2762 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2763 }
2764
2765 /* remember current high-water mark for exit */
2766 /* XXX this should be done with SAVE* instead */
2767 orig_slab = PL_regmatch_slab;
2768 orig_state = PL_regmatch_state;
2769
2770 /* grab next free state slot */
2771 st = ++PL_regmatch_state;
86545054 2772 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2773 st = PL_regmatch_state = S_push_slab(aTHX);
2774
2775 st->minmod = 0;
2776 st->sw = 0;
2777 st->logical = 0;
2778 st->unwind = 0;
2779 st->cc = NULL;
d6a28714
JH
2780 /* Note that nextchr is a byte even in UTF */
2781 nextchr = UCHARAT(locinput);
2782 scan = prog;
2783 while (scan != NULL) {
8ba1375e 2784
a3621e74 2785 DEBUG_EXECUTE_r( {
6136c704 2786 SV * const prop = sv_newmortal();
07be1b83 2787 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2788 regprop(rex, prop, scan);
07be1b83
YO
2789
2790 PerlIO_printf(Perl_debug_log,
2791 "%3"IVdf":%*s%s(%"IVdf")\n",
2792 (IV)(scan - rex->program), PL_regindent*2, "",
2793 SvPVX_const(prop),
2794 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2795 });
d6a28714
JH
2796
2797 next = scan + NEXT_OFF(scan);
2798 if (next == scan)
2799 next = NULL;
2800
2801 switch (OP(scan)) {
2802 case BOL:
7fba1cd6 2803 if (locinput == PL_bostr)
d6a28714 2804 {
3b0527fe 2805 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2806 break;
2807 }
d6a28714
JH
2808 sayNO;
2809 case MBOL:
12d33761
HS
2810 if (locinput == PL_bostr ||
2811 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2812 {
b8c5462f
JH
2813 break;
2814 }
d6a28714
JH
2815 sayNO;
2816 case SBOL:
c2a73568 2817 if (locinput == PL_bostr)
b8c5462f 2818 break;
d6a28714
JH
2819 sayNO;
2820 case GPOS:
3b0527fe 2821 if (locinput == reginfo->ganch)
d6a28714
JH
2822 break;
2823 sayNO;
2824 case EOL:
d6a28714
JH
2825 goto seol;
2826 case MEOL:
d6a28714 2827 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2828 sayNO;
b8c5462f 2829 break;
d6a28714
JH
2830 case SEOL:
2831 seol:
2832 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2833 sayNO;
d6a28714 2834 if (PL_regeol - locinput > 1)
b8c5462f 2835 sayNO;
b8c5462f 2836 break;
d6a28714
JH
2837 case EOS:
2838 if (PL_regeol != locinput)
b8c5462f 2839 sayNO;
d6a28714 2840 break;
ffc61ed2 2841 case SANY:
d6a28714 2842 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2843 sayNO;
f33976b4
DB
2844 if (do_utf8) {
2845 locinput += PL_utf8skip[nextchr];
2846 if (locinput > PL_regeol)
2847 sayNO;
2848 nextchr = UCHARAT(locinput);
2849 }
2850 else
2851 nextchr = UCHARAT(++locinput);
2852 break;
2853 case CANY:
2854 if (!nextchr && locinput >= PL_regeol)
2855 sayNO;
b8c5462f 2856 nextchr = UCHARAT(++locinput);
a0d0e21e 2857 break;
ffc61ed2 2858 case REG_ANY:
1aa99e6b
IH
2859 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2860 sayNO;
2861 if (do_utf8) {
b8c5462f 2862 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2863 if (locinput > PL_regeol)
2864 sayNO;
a0ed51b3 2865 nextchr = UCHARAT(locinput);
a0ed51b3 2866 }
1aa99e6b
IH
2867 else
2868 nextchr = UCHARAT(++locinput);
a0ed51b3 2869 break;
5b47454d 2870 case TRIE:
3dab1dad 2871 {
07be1b83 2872 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2873 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2874 trie_type = do_utf8 ?
2875 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2876 : trie_plain;
2877
2878 /* what trie are we using right now */
be8e71aa 2879 reg_trie_data * const trie
3dab1dad
YO
2880 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2881 U32 state = trie->startstate;
2882
2883 if (trie->bitmap && trie_type != trie_utf8_fold &&
2884 !TRIE_BITMAP_TEST(trie,*locinput)
2885 ) {
2886 if (trie->states[ state ].wordnum) {
2887 DEBUG_EXECUTE_r(
2888 PerlIO_printf(Perl_debug_log,
2889 "%*s %smatched empty string...%s\n",
2890 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2891 );
2892 break;
2893 } else {
2894 DEBUG_EXECUTE_r(
2895 PerlIO_printf(Perl_debug_log,
2896 "%*s %sfailed to match start class...%s\n",
2897 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2898 );
2899 sayNO_SILENT;
2900 }
2901 }
a3621e74 2902 {
07be1b83
YO
2903 /*
2904 traverse the TRIE keeping track of all accepting states
2905 we transition through until we get to a failing node.
2906 */
2907
a3621e74 2908 U8 *uc = ( U8* )locinput;
a3621e74
YO
2909 U16 charid = 0;
2910 U32 base = 0;
2911 UV uvc = 0;
2912 STRLEN len = 0;
2913 STRLEN foldlen = 0;
a3621e74
YO
2914 U8 *uscan = (U8*)NULL;
2915 STRLEN bufflen=0;
95b24440 2916 SV *sv_accept_buff = NULL;
5b47454d 2917
d8319b27 2918 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2919 result = 0;
a3621e74
YO
2920
2921 while ( state && uc <= (U8*)PL_regeol ) {
2922
5b47454d 2923 if (trie->states[ state ].wordnum) {
d8319b27 2924 if (!st->u.trie.accepted ) {
5b47454d
DM
2925 ENTER;
2926 SAVETMPS;
2927 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2928 sv_accept_buff=newSV(bufflen *
2929 sizeof(reg_trie_accepted) - 1);
2930 SvCUR_set(sv_accept_buff,
2931 sizeof(reg_trie_accepted));
2932 SvPOK_on(sv_accept_buff);
2933 sv_2mortal(sv_accept_buff);
d8319b27 2934 st->u.trie.accept_buff =
5b47454d
DM
2935 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2936 }
2937 else {
d8319b27 2938 if (st->u.trie.accepted >= bufflen) {
5b47454d 2939 bufflen *= 2;
d8319b27 2940 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2941 SvGROW(sv_accept_buff,
2942 bufflen * sizeof(reg_trie_accepted));
2943 }
2944 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2945 + sizeof(reg_trie_accepted));
2946 }
d8319b27
DM
2947 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2948 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2949 ++st->u.trie.accepted;
5b47454d 2950 }
a3621e74
YO
2951
2952 base = trie->states[ state ].trans.base;
2953
07be1b83
YO
2954 DEBUG_TRIE_EXECUTE_r({
2955 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2956 PerlIO_printf( Perl_debug_log,
e4584336 2957 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2958 2+PL_regindent * 2, "", PL_colors[4],
d8319b27 2959 (UV)state, (UV)base, (UV)st->u.trie.accepted );
07be1b83 2960 });
a3621e74
YO
2961
2962 if ( base ) {
5b47454d 2963 switch (trie_type) {
3dab1dad 2964 case trie_utf8_fold:
a3621e74
YO
2965 if ( foldlen>0 ) {
2966 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2967 foldlen -= len;
2968 uscan += len;
2969 len=0;
2970 } else {
1df70142 2971 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2972 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2973 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2974 foldlen -= UNISKIP( uvc );
2975 uscan = foldbuf + UNISKIP( uvc );
2976 }
5b47454d
DM
2977 break;
2978 case trie_utf8:
2979 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2980 &len, uniflags );
2981 break;
2982 case trie_plain:
e4584336 2983 uvc = (UV)*uc;
a3621e74
YO
2984 len = 1;
2985 }
2986
5b47454d
DM
2987 if (uvc < 256) {
2988 charid = trie->charmap[ uvc ];
2989 }
2990 else {
2991 charid = 0;
2992 if (trie->widecharmap) {
3dab1dad 2993 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2994 (char*)&uvc, sizeof(UV), 0);
2995 if (svpp)
2996 charid = (U16)SvIV(*svpp);
2997 }
2998 }
a3621e74 2999
5b47454d
DM
3000 if (charid &&
3001 (base + charid > trie->uniquecharcount )
3002 && (base + charid - 1 - trie->uniquecharcount
3003 < trie->lasttrans)
3004 && trie->trans[base + charid - 1 -
3005 trie->uniquecharcount].check == state)
3006 {
3007 state = trie->trans[base + charid - 1 -
3008 trie->uniquecharcount ].next;
3009 }
3010 else {
3011 state = 0;
3012 }
3013 uc += len;
3014
3015 }
3016 else {
a3621e74
YO
3017 state = 0;
3018 }
3019 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3020 PerlIO_printf( Perl_debug_log,
3021 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3022 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3023 );
3024 }
d8319b27 3025 if (!st->u.trie.accepted )
a3621e74 3026 sayNO;
a3621e74
YO
3027
3028 /*
3029 There was at least one accepting state that we
3030 transitioned through. Presumably the number of accepting
3031 states is going to be low, typically one or two. So we
3032 simply scan through to find the one with lowest wordnum.
3033 Once we find it, we swap the last state into its place
3034 and decrement the size. We then try to match the rest of
3035 the pattern at the point where the word ends, if we
3036 succeed then we end the loop, otherwise the loop
3037 eventually terminates once all of the accepting states
3038 have been tried.
3039 */
a3621e74 3040
d8319b27 3041 if ( st->u.trie.accepted == 1 ) {
f2278c82 3042 DEBUG_EXECUTE_r({
be8e71aa 3043 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3044 ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
3045 : NULL;
3046 PerlIO_printf( Perl_debug_log,
3047 "%*s %sonly one match : #%d <%s>%s\n",
3048 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3049 st->u.trie.accept_buff[ 0 ].wordnum,
3050 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3051 PL_colors[5] );
3052 });
d8319b27 3053 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
3054 /* in this case we free tmps/leave before we call regmatch
3055 as we wont be using accept_buff again. */
3056 FREETMPS;
3057 LEAVE;
07be1b83 3058 /* do we need this? why dont we just do a break? */
95b24440
DM
3059 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
3060 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3061 } else {
3062 DEBUG_EXECUTE_r(
e4584336 3063 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 3064 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
3065 PL_colors[5] );
3066 );
d8319b27 3067 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
3068 U32 best = 0;
3069 U32 cur;
d8319b27 3070 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
3071 DEBUG_TRIE_EXECUTE_r(
3072 PerlIO_printf( Perl_debug_log,
3073 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3074 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
3075 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
3076 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 3077 );
a3621e74 3078
d8319b27
DM
3079 if (st->u.trie.accept_buff[cur].wordnum <
3080 st->u.trie.accept_buff[best].wordnum)
e822a8b4 3081 best = cur;
a3621e74 3082 }
f2278c82
YO
3083 DEBUG_EXECUTE_r({
3084 reg_trie_data * const trie = (reg_trie_data*)
3085 rex->data->data[ARG(scan)];
be8e71aa 3086 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3087 ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
3088 : NULL;
3089 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3090 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3091 st->u.trie.accept_buff[best].wordnum,
3092 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3093 PL_colors[5] );
3094 });
d8319b27
DM
3095 if ( best<st->u.trie.accepted ) {
3096 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
3097 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
3098 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
3099 best = st->u.trie.accepted;
a3621e74 3100 }
d8319b27 3101 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
3102
3103 /*
3104 as far as I can tell we only need the SAVETMPS/FREETMPS
3105 for re's with EVAL in them but I'm leaving them in for
3106 all until I can be sure.
3107 */
3108 SAVETMPS;
95b24440
DM
3109 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
3110 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3111 FREETMPS;
3112 }
3113 FREETMPS;
3114 LEAVE;
3115 }
3116
95b24440 3117 if (result) {
a3621e74
YO
3118 sayYES;
3119 } else {
3120 sayNO;
3121 }
3dab1dad 3122 }}
a3621e74 3123 /* unreached codepoint */
95b24440
DM
3124 case EXACT: {
3125 char *s = STRING(scan);
5d9a96ca 3126 st->ln = STR_LEN(scan);
eb160463 3127 if (do_utf8 != UTF) {
bc517b45 3128 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3129 char *l = locinput;
be8e71aa 3130 const char * const e = s + st->ln;
a72c7584 3131
5ff6fc6d
JH
3132 if (do_utf8) {
3133 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3134 while (s < e) {
a3b680e6 3135 STRLEN ulen;
1aa99e6b 3136 if (l >= PL_regeol)
5ff6fc6d
JH
3137 sayNO;
3138 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3139 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3140 uniflags))
5ff6fc6d 3141 sayNO;
bc517b45 3142 l += ulen;
5ff6fc6d 3143 s ++;
1aa99e6b 3144 }
5ff6fc6d
JH
3145 }
3146 else {
3147 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3148 while (s < e) {
a3b680e6 3149 STRLEN ulen;
1aa99e6b
IH
3150 if (l >= PL_regeol)
3151 sayNO;
5ff6fc6d 3152 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3153 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3154 uniflags))
1aa99e6b 3155 sayNO;
bc517b45 3156 s += ulen;
a72c7584 3157 l ++;
1aa99e6b 3158 }
5ff6fc6d 3159 }
1aa99e6b
IH
3160 locinput = l;
3161 nextchr = UCHARAT(locinput);
3162 break;
3163 }
bc517b45 3164 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3165 /* Inline the first character, for speed. */
3166 if (UCHARAT(s) != nextchr)
3167 sayNO;
5d9a96ca 3168 if (PL_regeol - locinput < st->ln)
d6a28714 3169 sayNO;
5d9a96ca 3170 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3171 sayNO;
5d9a96ca 3172 locinput += st->ln;
d6a28714
JH
3173 nextchr = UCHARAT(locinput);
3174 break;
95b24440 3175 }
d6a28714 3176 case EXACTFL:
b8c5462f
JH
3177 PL_reg_flags |= RF_tainted;
3178 /* FALL THROUGH */
95b24440 3179 case EXACTF: {
be8e71aa 3180 char * const s = STRING(scan);
5d9a96ca 3181 st->ln = STR_LEN(scan);
d6a28714 3182
d07ddd77
JH
3183 if (do_utf8 || UTF) {
3184 /* Either target or the pattern are utf8. */
be8e71aa 3185 const char * const l = locinput;
d07ddd77 3186 char *e = PL_regeol;
bc517b45 3187
5d9a96ca 3188 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3189 l, &e, 0, do_utf8)) {
5486206c
JH
3190 /* One more case for the sharp s:
3191 * pack("U0U*", 0xDF) =~ /ss/i,
3192 * the 0xC3 0x9F are the UTF-8
3193 * byte sequence for the U+00DF. */
3194 if (!(do_utf8 &&
3195 toLOWER(s[0]) == 's' &&
5d9a96ca 3196 st->ln >= 2 &&
5486206c
JH
3197 toLOWER(s[1]) == 's' &&
3198 (U8)l[0] == 0xC3 &&
3199 e - l >= 2 &&
3200 (U8)l[1] == 0x9F))
3201 sayNO;
3202 }
d07ddd77
JH
3203 locinput = e;
3204 nextchr = UCHARAT(locinput);
3205 break;
a0ed51b3 3206 }
d6a28714 3207
bc517b45
JH
3208 /* Neither the target and the pattern are utf8. */
3209
d6a28714
JH
3210 /* Inline the first character, for speed. */
3211 if (UCHARAT(s) != nextchr &&
3212 UCHARAT(s) != ((OP(scan) == EXACTF)
3213 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3214 sayNO;
5d9a96ca 3215 if (PL_regeol - locinput < st->ln)
b8c5462f 3216 sayNO;
5d9a96ca
DM
3217 if (st->ln > 1 && (OP(scan) == EXACTF
3218 ? ibcmp(s, locinput, st->ln)
3219 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3220 sayNO;
5d9a96ca 3221 locinput += st->ln;
d6a28714 3222 nextchr = UCHARAT(locinput);
a0d0e21e 3223 break;
95b24440 3224 }
d6a28714 3225 case ANYOF:
ffc61ed2 3226 if (do_utf8) {
9e55ce06
JH
3227 STRLEN inclasslen = PL_regeol - locinput;
3228
32fc9b6a 3229 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3230 sayNO_ANYOF;
ffc61ed2
JH
3231 if (locinput >= PL_regeol)
3232 sayNO;
0f0076b4 3233 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3234 nextchr = UCHARAT(locinput);
e0f9d4a8 3235 break;
ffc61ed2
JH
3236 }
3237 else {
3238 if (nextchr < 0)
3239 nextchr = UCHARAT(locinput);
32fc9b6a 3240 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3241 sayNO_ANYOF;
ffc61ed2
JH
3242 if (!nextchr && locinput >= PL_regeol)
3243 sayNO;
3244 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3245 break;
3246 }
3247 no_anyof:
3248 /* If we might have the case of the German sharp s
3249 * in a casefolding Unicode character class. */
3250
ebc501f0
JH
3251 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3252 locinput += SHARP_S_SKIP;
e0f9d4a8 3253 nextchr = UCHARAT(locinput);
ffc61ed2 3254 }
e0f9d4a8
JH
3255 else
3256 sayNO;
b8c5462f 3257 break;
d6a28714 3258 case ALNUML:
b8c5462f
JH
3259 PL_reg_flags |= RF_tainted;
3260 /* FALL THROUGH */
d6a28714 3261 case ALNUM:
b8c5462f 3262 if (!nextchr)
4633a7c4 3263 sayNO;
ffc61ed2 3264 if (do_utf8) {
1a4fad37 3265 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3266 if (!(OP(scan) == ALNUM
bb7a0f54 3267 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3268 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3269 {
3270 sayNO;
a0ed51b3 3271 }
b8c5462f 3272 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3273 nextchr = UCHARAT(locinput);
3274 break;
3275 }
ffc61ed2 3276 if (!(OP(scan) == ALNUM
d6a28714 3277 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3278 sayNO;
b8c5462f 3279 nextchr = UCHARAT(++locinput);
a0d0e21e 3280 break;
d6a28714 3281 case NALNUML:
b8c5462f
JH
3282 PL_reg_flags |= RF_tainted;
3283 /* FALL THROUGH */
d6a28714
JH
3284 case NALNUM:
3285 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3286 sayNO;
ffc61ed2 3287 if (do_utf8) {
1a4fad37 3288 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3289 if (OP(scan) == NALNUM
bb7a0f54 3290 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3291 : isALNUM_LC_utf8((U8*)locinput))
3292 {
b8c5462f 3293 sayNO;
d6a28714 3294 }
b8c5462f
JH
3295 locinput += PL_utf8skip[nextchr];
3296 nextchr = UCHARAT(locinput);
3297 break;
3298 }
ffc61ed2 3299 if (OP(scan) == NALNUM
d6a28714 3300 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3301 sayNO;
76e3520e 3302 nextchr = UCHARAT(++locinput);
a0d0e21e 3303 break;
d6a28714
JH
3304 case BOUNDL:
3305 case NBOUNDL:
3280af22 3306 PL_reg_flags |= RF_tainted;
bbce6d69 3307 /* FALL THROUGH */
d6a28714
JH
3308 case BOUND:
3309 case NBOUND:
3310 /* was last char in word? */
ffc61ed2 3311 if (do_utf8) {
12d33761 3312 if (locinput == PL_bostr)
5d9a96ca 3313 st->ln = '\n';
ffc61ed2 3314 else {
a3b680e6 3315 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3316
4ad0818d 3317 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3318 }
3319 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3320 st->ln = isALNUM_uni(st->ln);
1a4fad37 3321 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3322 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3323 }
3324 else {
5d9a96ca 3325 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3326 n = isALNUM_LC_utf8((U8*)locinput);
3327 }
a0ed51b3 3328 }
d6a28714 3329 else {
5d9a96ca 3330 st->ln = (locinput != PL_bostr) ?
12d33761 3331 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3332 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3333 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3334 n = isALNUM(nextchr);
3335 }
3336 else {
5d9a96ca 3337 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3338 n = isALNUM_LC(nextchr);
3339 }
d6a28714 3340 }
5d9a96ca 3341 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3342 OP(scan) == BOUNDL))
3343 sayNO;
a0ed51b3 3344 break;
d6a28714 3345 case SPACEL:
3280af22 3346 PL_reg_flags |= RF_tainted;
bbce6d69 3347 /* FALL THROUGH */
d6a28714 3348 case SPACE:
9442cb0e 3349 if (!nextchr)
4633a7c4 3350 sayNO;
1aa99e6b 3351 if (do_utf8) {
fd400ab9 3352 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3353 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3354 if (!(OP(scan) == SPACE
bb7a0f54 3355 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3356 : isSPACE_LC_utf8((U8*)locinput)))
3357 {
3358 sayNO;
3359 }
3360 locinput += PL_utf8skip[nextchr];
3361 nextchr = UCHARAT(locinput);
3362 break;
d6a28714 3363 }
ffc61ed2
JH
3364 if (!(OP(scan) == SPACE
3365 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3366 sayNO;
3367 nextchr = UCHARAT(++locinput);
3368 }
3369 else {
3370 if (!(OP(scan) == SPACE
3371 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3372 sayNO;
3373 nextchr = UCHARAT(++locinput);
a0ed51b3 3374 }
a0ed51b3 3375 break;
d6a28714 3376 case NSPACEL:
3280af22 3377 PL_reg_flags |= RF_tainted;
bbce6d69 3378 /* FALL THROUGH */
d6a28714 3379 case NSPACE:
9442cb0e 3380 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3381 sayNO;
1aa99e6b 3382 if (do_utf8) {
1a4fad37 3383 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3384 if (OP(scan) == NSPACE
bb7a0f54 3385 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3386 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3387 {
3388 sayNO;
3389 }
3390 locinput += PL_utf8skip[nextchr];
3391 nextchr = UCHARAT(locinput);
3392 break;
a0ed51b3 3393 }
ffc61ed2 3394 if (OP(scan) == NSPACE
d6a28714 3395 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3396 sayNO;
76e3520e 3397 nextchr = UCHARAT(++locinput);
a0d0e21e 3398 break;
d6a28714 3399 case DIGITL:
a0ed51b3
LW
3400 PL_reg_flags |= RF_tainted;
3401 /* FALL THROUGH */
d6a28714 3402 case DIGIT:
9442cb0e 3403 if (!nextchr)
a0ed51b3 3404 sayNO;
1aa99e6b 3405 if (do_utf8) {
1a4fad37 3406 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3407 if (!(OP(scan) == DIGIT
bb7a0f54 3408 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3409 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3410 {
a0ed51b3 3411 sayNO;
dfe13c55 3412 }
6f06b55f 3413 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3414 nextchr = UCHARAT(locinput);
3415 break;
3416 }
ffc61ed2 3417 if (!(OP(scan) == DIGIT
9442cb0e 3418 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3419 sayNO;
3420 nextchr = UCHARAT(++locinput);
3421 break;
d6a28714 3422 case NDIGITL:
b8c5462f
JH
3423 PL_reg_flags |= RF_tainted;
3424 /* FALL THROUGH */
d6a28714 3425 case NDIGIT:
9442cb0e 3426 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3427 sayNO;
1aa99e6b 3428 if (do_utf8) {
1a4fad37 3429 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3430 if (OP(scan) == NDIGIT
bb7a0f54 3431 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3432 : isDIGIT_LC_utf8((U8*)locinput))
3433 {
a0ed51b3 3434 sayNO;
9442cb0e 3435 }
6f06b55f 3436 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3437 nextchr = UCHARAT(locinput);
3438 break;
3439 }
ffc61ed2 3440 if (OP(scan) == NDIGIT
9442cb0e 3441 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3442 sayNO;
3443 nextchr = UCHARAT(++locinput);
3444 break;
3445 case CLUMP:
b7c83a7e 3446 if (locinput >= PL_regeol)
a0ed51b3 3447 sayNO;
b7c83a7e 3448 if (do_utf8) {
1a4fad37 3449 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3450 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3451 sayNO;
3452 locinput += PL_utf8skip[nextchr];
3453 while (locinput < PL_regeol &&
3454 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3455 locinput += UTF8SKIP(locinput);
3456 if (locinput > PL_regeol)
3457 sayNO;
eb08e2da
JH
3458 }
3459 else
3460 locinput++;
a0ed51b3
LW
3461 nextchr = UCHARAT(locinput);
3462 break;
c8756f30 3463 case REFFL:
3280af22 3464 PL_reg_flags |= RF_tainted;
c8756f30 3465 /* FALL THROUGH */
c277df42 3466 case REF:
95b24440
DM
3467 case REFF: {
3468 char *s;
c277df42 3469 n = ARG(scan); /* which paren pair */
5d9a96ca 3470 st->ln = PL_regstartp[n];
2c2d71f5 3471 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3472 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3473 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3474 if (st->ln == PL_regendp[n])
a0d0e21e 3475 break;
a0ed51b3 3476
5d9a96ca 3477 s = PL_bostr + st->ln;
1aa99e6b 3478 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3479 char *l = locinput;
a3b680e6 3480 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3481 /*
3482 * Note that we can't do the "other character" lookup trick as
3483 * in the 8-bit case (no pun intended) because in Unicode we
3484 * have to map both upper and title case to lower case.
3485 */
3486 if (OP(scan) == REFF) {
3487 while (s < e) {
a3b680e6
AL
3488 STRLEN ulen1, ulen2;
3489 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3490 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3491
a0ed51b3
LW
3492 if (l >= PL_regeol)
3493 sayNO;
a2a2844f
JH
3494 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3495 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3496 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3497 sayNO;
a2a2844f
JH
3498 s += ulen1;
3499 l += ulen2;
a0ed51b3
LW
3500 }
3501 }
3502 locinput = l;
3503 nextchr = UCHARAT(locinput);
3504 break;
3505 }
3506
a0d0e21e 3507 /* Inline the first character, for speed. */
76e3520e 3508 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3509 (OP(scan) == REF ||
3510 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3511 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3512 sayNO;
5d9a96ca
DM
3513 st->ln = PL_regendp[n] - st->ln;
3514 if (locinput + st->ln > PL_regeol)
4633a7c4 3515 sayNO;
5d9a96ca
DM
3516 if (st->ln > 1 && (OP(scan) == REF
3517 ? memNE(s, locinput, st->ln)
c8756f30 3518 : (OP(scan) == REFF
5d9a96ca
DM
3519 ? ibcmp(s, locinput, st->ln)
3520 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3521 sayNO;
5d9a96ca 3522 locinput += st->ln;
76e3520e 3523 nextchr = UCHARAT(locinput);
a0d0e21e 3524 break;
95b24440 3525 }
a0d0e21e
LW
3526
3527 case NOTHING:
c277df42 3528 case TAIL:
a0d0e21e
LW
3529 break;
3530 case BACK:
3531 break;
c277df42
IZ
3532 case EVAL:
3533 {
c277df42 3534 SV *ret;
8e5e9ebe 3535 {
4aabdb9b
DM
3536 /* execute the code in the {...} */
3537 dSP;
6136c704 3538 SV ** const before = SP;
4aabdb9b
DM
3539 OP_4tree * const oop = PL_op;
3540 COP * const ocurcop = PL_curcop;
3541 PAD *old_comppad;
4aabdb9b
DM
3542
3543 n = ARG(scan);
32fc9b6a 3544 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3545 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3546 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3547 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3548
8e5e9ebe
RGS
3549 CALLRUNOPS(aTHX); /* Scalar context. */
3550 SPAGAIN;
3551 if (SP == before)
075aa684 3552 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3553 else {
3554 ret = POPs;
3555 PUTBACK;
3556 }
4aabdb9b
DM
3557
3558 PL_op = oop;
3559 PAD_RESTORE_LOCAL(old_comppad);
3560 PL_curcop = ocurcop;
3561 if (!st->logical) {
3562 /* /(?{...})/ */
3563 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3564 break;
3565 }
8e5e9ebe 3566 }
4aabdb9b
DM
3567 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3568 regexp *re;
4aabdb9b 3569 {
4f639d21
DM
3570 /* extract RE object from returned value; compiling if
3571 * necessary */
3572
6136c704 3573 MAGIC *mg = NULL;
be8e71aa 3574 const SV *sv;
faf82a0b
AE
3575 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3576 mg = mg_find(sv, PERL_MAGIC_qr);
3577 else if (SvSMAGICAL(ret)) {
3578 if (SvGMAGICAL(ret))
3579 sv_unmagic(ret, PERL_MAGIC_qr);
3580 else
3581 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3582 }
faf82a0b 3583
0f5d15d6
IZ
3584 if (mg) {
3585 re = (regexp *)mg->mg_obj;
df0003d4 3586 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3587 }
3588 else {
3589 STRLEN len;
6136c704 3590 const char * const t = SvPV_const(ret, len);
0f5d15d6 3591 PMOP pm;
a3b680e6 3592 const I32 osize = PL_regsize;
0f5d15d6 3593
5fcd1c1b 3594 Zero(&pm, 1, PMOP);
4aabdb9b 3595 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3596 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3597 if (!(SvFLAGS(ret)
faf82a0b
AE
3598 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3599 | SVs_GMG)))
14befaf4
DM
3600 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3601 PERL_MAGIC_qr,0,0);
0f5d15d6 3602 PL_regsize = osize;
0f5d15d6 3603 }
4aabdb9b 3604 }
aa283a38
DM
3605
3606 /* run the pattern returned from (??{...}) */
3607
4aabdb9b
DM
3608 DEBUG_EXECUTE_r(
3609 PerlIO_printf(Perl_debug_log,
3610 "Entering embedded \"%s%.60s%s%s\"\n",
3611 PL_colors[0],
3612 re->precomp,
3613 PL_colors[1],
3614 (strlen(re->precomp) > 60 ? "..." : ""))
3615 );
2c2d71f5 3616
4aabdb9b
DM
3617 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3618 REGCP_SET(st->u.eval.lastcp);
4aabdb9b
DM
3619 *PL_reglastparen = 0;
3620 *PL_reglastcloseparen = 0;
4aabdb9b 3621 PL_reginput = locinput;
4aabdb9b
DM
3622
3623 /* XXXX This is too dramatic a measure... */
3624 PL_reg_maxiter = 0;
3625
5d9a96ca 3626 st->logical = 0;
aa283a38
DM
3627 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3628 ((re->reganch & ROPT_UTF8) != 0);
3629 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3630 st->u.eval.prev_rex = rex;
aa283a38 3631 rex = re;
aa283a38 3632
77cb431f
DM
3633 /* resume to current state on success */
3634 st->u.yes.prev_yes_state = yes_state;
3635 yes_state = st;
aa283a38
DM
3636 PUSH_STATE(newst, resume_EVAL);
3637 st = newst;
3638
3639 /* now continue from first node in postoned RE */
3640 next = re->program + 1;
3641 break;
4aabdb9b 3642 /* NOTREACHED */
a0ed51b3 3643 }
4aabdb9b
DM
3644 /* /(?(?{...})X|Y)/ */
3645 st->sw = SvTRUE(ret);
3646 st->logical = 0;
c277df42
IZ
3647 break;
3648 }
a0d0e21e 3649 case OPEN:
c277df42 3650 n = ARG(scan); /* which paren pair */
3280af22
NIS
3651 PL_reg_start_tmp[n] = locinput;
3652 if (n > PL_regsize)
3653 PL_regsize = n;
a0d0e21e
LW
3654 break;
3655 case CLOSE:
c277df42 3656 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3657 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3658 PL_regendp[n] = locinput - PL_bostr;
eb160463 3659 if (n > (I32)*PL_reglastparen)
3280af22 3660 *PL_reglastparen = n;
a01268b5 3661 *PL_reglastcloseparen = n;
a0d0e21e 3662 break;
c277df42
IZ
3663 case GROUPP:
3664 n = ARG(scan); /* which paren pair */
5d9a96ca 3665 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3666 break;
3667 case IFTHEN:
2c2d71f5 3668 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3669 if (st->sw)
c277df42
IZ
3670 next = NEXTOPER(NEXTOPER(scan));
3671 else {
3672 next = scan + ARG(scan);
3673 if (OP(next) == IFTHEN) /* Fake one. */
3674 next = NEXTOPER(NEXTOPER(next));
3675 }
3676 break;
3677 case LOGICAL:
5d9a96ca 3678 st->logical = scan->flags;
c277df42 3679 break;
2ab05381 3680/*******************************************************************
a0374537
DM
3681 cc points to the regmatch_state associated with the most recent CURLYX.
3682 This struct contains info about the innermost (...)* loop (an
3683 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3684
3685 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3686
95b24440 3687 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3688
a0374537 3689 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3690 with the starting point at WHILEM node;
2ab05381
IZ
3691
3692 3) Each hit of WHILEM node tries to match A and Z (in the order
3693 depending on the current iteration, min/max of {min,max} and
3694 greediness). The information about where are nodes for "A"
a0374537 3695 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3696 was already matched, and greediness.
3697
3698 4) After A matches, the same WHILEM node is hit again.
3699
95b24440 3700 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3701 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3702 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3703 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3704 of the external loop.
3705
a0374537 3706 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3707 and whatever it mentions via ->next, and additional attached trees
3708 corresponding to temporarily unset infoblocks as in "5" above.
3709
95b24440 3710 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3711 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3712 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3713 infoblocks are drawn below the "reset" infoblock.
3714
3715 In fact in the picture below we do not show failed matches for Z and T
3716 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3717 more obvious *why* one needs to *temporary* unset infoblocks.]
3718
3719 Matched REx position InfoBlocks Comment
3720 (Y(A)*?Z)*?T x
3721 Y(A)*?Z)*?T x <- O
3722 Y (A)*?Z)*?T x <- O
3723 Y A)*?Z)*?T x <- O <- I
3724 YA )*?Z)*?T x <- O <- I
3725 YA A)*?Z)*?T x <- O <- I
3726 YAA )*?Z)*?T x <- O <- I
3727 YAA Z)*?T x <- O # Temporary unset I
3728 I
3729
3730 YAAZ Y(A)*?Z)*?T x <- O
3731 I
3732
3733 YAAZY (A)*?Z)*?T x <- O
3734 I
3735
3736 YAAZY A)*?Z)*?T x <- O <- I
3737 I
3738
3739 YAAZYA )*?Z)*?T x <- O <- I
3740 I
3741
3742 YAAZYA Z)*?T x <- O # Temporary unset I
3743 I,I
3744
3745 YAAZYAZ )*?T x <- O
3746 I,I
3747
3748 YAAZYAZ T x # Temporary unset O
3749 O
3750 I,I
3751
3752 YAAZYAZT x
3753 O
3754 I,I
3755 *******************************************************************/
95b24440 3756
a0d0e21e 3757 case CURLYX: {
cb434fcc
IZ
3758 /* No need to save/restore up to this paren */
3759 I32 parenfloor = scan->flags;
c277df42 3760
c2b7afd3
NC
3761 /* Dave says:
3762
3763 CURLYX and WHILEM are always paired: they're the moral
3764 equivalent of pp_enteriter anbd pp_iter.
3765
3766 The only time next could be null is if the node tree is
3767 corrupt. This was mentioned on p5p a few days ago.
3768
3769 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3770 So we'll assert that this is true:
3771 */
3772 assert(next);
30b2893d 3773 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3774 next += ARG(next);
cb434fcc
IZ
3775 /* XXXX Probably it is better to teach regpush to support
3776 parenfloor > PL_regsize... */
eb160463 3777 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3778 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3779
d8319b27
DM
3780 st->u.curlyx.cp = PL_savestack_ix;
3781 st->u.curlyx.outercc = st->cc;
a0374537
DM
3782 st->cc = st;
3783 /* these fields contain the state of the current curly.
3784 * they are accessed by subsequent WHILEMs;
3785 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3786 st->u.curlyx.parenfloor = parenfloor;
3787 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3788 st->u.curlyx.min = ARG1(scan);
3789 st->u.curlyx.max = ARG2(scan);
3790 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3791 st->u.curlyx.lastloc = 0;
a0374537
DM
3792 /* st->next and st->minmod are also read by WHILEM */
3793
3280af22 3794 PL_reginput = locinput;
95b24440
DM
3795 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3796 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3797 regcpblow(st->u.curlyx.cp);
3798 st->cc = st->u.curlyx.outercc;
95b24440 3799 saySAME(result);
a0d0e21e 3800 }
5f66b61c 3801 /* NOTREACHED */
a0d0e21e
LW
3802 case WHILEM: {
3803 /*
3804 * This is really hard to understand, because after we match
3805 * what we're trying to match, we must make sure the rest of
2c2d71f5 3806 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3807 * to go back UP the parse tree by recursing ever deeper. And
3808 * if it fails, we have to reset our parent's current state
3809 * that we can try again after backing off.
3810 */
3811
c2b7afd3
NC
3812 /* Dave says:
3813
3814 st->cc gets initialised by CURLYX ready for use by WHILEM.
3815 So again, unless somethings been corrupted, st->cc cannot
3816 be null at that point in WHILEM.
3817
3818 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3819 So we'll assert that this is true:
3820 */
3821 assert(st->cc);
d8319b27
DM
3822 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3823 st->u.whilem.cache_offset = 0;
3824 st->u.whilem.cache_bit = 0;
c277df42 3825
d8319b27 3826 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3827 PL_reginput = locinput;
a0d0e21e 3828
a3621e74 3829 DEBUG_EXECUTE_r(
9041c2e3 3830 PerlIO_printf(Perl_debug_log,
91f3b821 3831 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3832 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3833 (long)n, (long)st->cc->u.curlyx.min,
3834 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3835 );
4633a7c4 3836
a0d0e21e
LW
3837 /* If degenerate scan matches "", assume scan done. */
3838
d8319b27
DM
3839 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3840 st->u.whilem.savecc = st->cc;
3841 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3842 if (st->cc)
d8319b27 3843 st->ln = st->cc->u.curlyx.cur;
a3621e74 3844 DEBUG_EXECUTE_r(
c3464db5
DD
3845 PerlIO_printf(Perl_debug_log,
3846 "%*s empty match detected, try continuation...\n",
3280af22 3847 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3848 );
d8319b27 3849 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3850 /*** all unsaved local vars undefined at this point */
d8319b27 3851 st->cc = st->u.whilem.savecc;
95b24440 3852 if (result)
4633a7c4 3853 sayYES;
d8319b27
DM
3854 if (st->cc->u.curlyx.outercc)
3855 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3856 sayNO;
a0d0e21e
LW
3857 }
3858
3859 /* First just match a string of min scans. */
3860
d8319b27
DM
3861 if (n < st->cc->u.curlyx.min) {
3862 st->cc->u.curlyx.cur = n;
3863 st->cc->u.curlyx.lastloc = locinput;
3864 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3865 /*** all unsaved local vars undefined at this point */
3866 if (result)
4633a7c4 3867 sayYES;
d8319b27
DM
3868 st->cc->u.curlyx.cur = n - 1;
3869 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3870 sayNO;
a0d0e21e
LW
3871 }
3872
2c2d71f5
JH
3873 if (scan->flags) {
3874 /* Check whether we already were at this position.
3875 Postpone detection until we know the match is not
3876 *that* much linear. */
3877 if (!PL_reg_maxiter) {
3878 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3879 /* possible overflow for long strings and many CURLYX's */
3880 if (PL_reg_maxiter < 0)
3881 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3882 PL_reg_leftiter = PL_reg_maxiter;
3883 }
3884 if (PL_reg_leftiter-- == 0) {
a3b680e6 3885 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3886 if (PL_reg_poscache) {
eb160463 3887 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3888 Renew(PL_reg_poscache, size, char);
3889 PL_reg_poscache_size = size;
3890 }
3891 Zero(PL_reg_poscache, size, char);
3892 }
3893 else {
3894 PL_reg_poscache_size = size;
a02a5408 3895 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3896 }
a3621e74 3897 DEBUG_EXECUTE_r(
2c2d71f5
JH
3898 PerlIO_printf(Perl_debug_log,
3899 "%sDetected a super-linear match, switching on caching%s...\n",
3900 PL_colors[4], PL_colors[5])
3901 );
3902 }
3903 if (PL_reg_leftiter < 0) {
d8319b27 3904 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3905
d8319b27
DM
3906 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3907 + st->u.whilem.cache_offset * (scan->flags>>4);
3908 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3909 st->u.whilem.cache_offset /= 8;
3910 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3911 DEBUG_EXECUTE_r(
2c2d71f5
JH
3912 PerlIO_printf(Perl_debug_log,
3913 "%*s already tried at this position...\n",
3914 REPORT_CODE_OFF+PL_regindent*2, "")
3915 );
3ab3c9b4
HS
3916 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3917 /* cache records success */
c2b0868c
HS
3918 sayYES;
3919 else
3ab3c9b4 3920 /* cache records failure */
c2b0868c 3921 sayNO_SILENT;
2c2d71f5 3922 }
2c2d71f5
JH
3923 }
3924 }
3925
a0d0e21e
LW
3926 /* Prefer next over scan for minimal matching. */
3927
5d9a96ca 3928 if (st->cc->minmod) {
d8319b27
DM
3929 st->u.whilem.savecc = st->cc;
3930 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3931 if (st->cc)
d8319b27
DM
3932 st->ln = st->cc->u.curlyx.cur;
3933 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3934 REGCP_SET(st->u.whilem.lastcp);
3935 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3936 /*** all unsaved local vars undefined at this point */
d8319b27 3937 st->cc = st->u.whilem.savecc;
95b24440 3938 if (result) {
d8319b27 3939 regcpblow(st->u.whilem.cp);
3ab3c9b4 3940 CACHEsayYES; /* All done. */
5f05dabc 3941 }
d8319b27 3942 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3943 regcppop(rex);
d8319b27
DM
3944 if (st->cc->u.curlyx.outercc)
3945 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3946
d8319b27 3947 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3948 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3949 && !(PL_reg_flags & RF_warned)) {
3950 PL_reg_flags |= RF_warned;
9014280d 3951 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3952 "Complex regular subexpression recursion",
3953 REG_INFTY - 1);
c277df42 3954 }
3ab3c9b4 3955 CACHEsayNO;
c277df42 3956 }
a687059c 3957
a3621e74 3958 DEBUG_EXECUTE_r(
c3464db5
DD
3959 PerlIO_printf(Perl_debug_log,
3960 "%*s trying longer...\n",
3280af22 3961 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3962 );
a0d0e21e 3963 /* Try scanning more and see if it helps. */
3280af22 3964 PL_reginput = locinput;
d8319b27
DM
3965 st->cc->u.curlyx.cur = n;
3966 st->cc->u.curlyx.lastloc = locinput;
3967 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3968 REGCP_SET(st->u.whilem.lastcp);
3969 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3970 /*** all unsaved local vars undefined at this point */
3971 if (result) {
d8319b27 3972 regcpblow(st->u.whilem.cp);
3ab3c9b4 3973 CACHEsayYES;
5f05dabc 3974 }
d8319b27 3975 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3976 regcppop(rex);
d8319b27
DM
3977 st->cc->u.curlyx.cur = n - 1;
3978 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3979 CACHEsayNO;
a0d0e21e
LW
3980 }
3981
3982 /* Prefer scan over next for maximal matching. */
3983
d8319b27
DM
3984 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3985 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3986 st->cc->u.curlyx.cur = n;
3987 st->cc->u.curlyx.lastloc = locinput;
3988 REGCP_SET(st->u.whilem.lastcp);
3989 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3990 /*** all unsaved local vars undefined at this point */
3991 if (result) {
d8319b27 3992 regcpblow(st->u.whilem.cp);
3ab3c9b4 3993 CACHEsayYES;
5f05dabc 3994 }
d8319b27 3995 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3996 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3997 PL_reginput = locinput;
a3621e74 3998 DEBUG_EXECUTE_r(
c3464db5
DD
3999 PerlIO_printf(Perl_debug_log,
4000 "%*s failed, try continuation...\n",
3280af22 4001 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
4002 );
4003 }
9041c2e3 4004 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 4005 && !(PL_reg_flags & RF_warned)) {
3280af22 4006 PL_reg_flags |= RF_warned;
9014280d 4007 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
4008 "Complex regular subexpression recursion",
4009 REG_INFTY - 1);
a0d0e21e
LW
4010 }
4011
4012 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
4013 st->u.whilem.savecc = st->cc;
4014 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 4015 if (st->cc)
d8319b27
DM
4016 st->ln = st->cc->u.curlyx.cur;
4017 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 4018 /*** all unsaved local vars undefined at this point */
d8319b27 4019 st->cc = st->u.whilem.savecc;
95b24440 4020 if (result)
3ab3c9b4 4021 CACHEsayYES;
d8319b27
DM
4022 if (st->cc->u.curlyx.outercc)
4023 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4024 st->cc->u.curlyx.cur = n - 1;
4025 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4026 CACHEsayNO;
a0d0e21e 4027 }
5f66b61c 4028 /* NOTREACHED */
9041c2e3 4029 case BRANCHJ:
c277df42
IZ
4030 next = scan + ARG(scan);
4031 if (next == scan)
4032 next = NULL;
4033 inner = NEXTOPER(NEXTOPER(scan));
4034 goto do_branch;
9041c2e3 4035 case BRANCH:
c277df42
IZ
4036 inner = NEXTOPER(scan);
4037 do_branch:
4038 {
be8e71aa 4039 const I32 type = OP(scan);
ae5031b3 4040 if (!next || OP(next) != type) /* No choice. */
c277df42 4041 next = inner; /* Avoid recursion. */
a0d0e21e 4042 else {
a3b680e6 4043 const I32 lastparen = *PL_reglastparen;
02db2b7b 4044 /* Put unwinding data on stack */
6136c704
AL
4045 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
4046 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
4047
5d9a96ca
DM
4048 uw->prev = st->unwind;
4049 st->unwind = unwind1;
e822a8b4 4050 uw->type = ((type == BRANCH)
02db2b7b
IZ
4051 ? RE_UNWIND_BRANCH
4052 : RE_UNWIND_BRANCHJ);
4053 uw->lastparen = lastparen;
4054 uw->next = next;
4055 uw->locinput = locinput;
4056 uw->nextchr = nextchr;
3a2830be 4057 uw->minmod = st->minmod;
02db2b7b
IZ
4058#ifdef DEBUGGING
4059 uw->regindent = ++PL_regindent;
4060#endif
c277df42 4061
02db2b7b
IZ
4062 REGCP_SET(uw->lastcp);
4063
4064 /* Now go into the first branch */
4065 next = inner;
a687059c 4066 }
a0d0e21e
LW
4067 }
4068 break;
4069 case MINMOD:
5d9a96ca 4070 st->minmod = 1;
a0d0e21e 4071 break;
c277df42
IZ
4072 case CURLYM:
4073 {
d8319b27 4074 st->u.curlym.l = st->u.curlym.matches = 0;
9041c2e3 4075
c277df42 4076 /* We suppose that the next guy does not need
0e788c72 4077 backtracking: in particular, it is of constant non-zero length,
c277df42 4078 and has no parenths to influence future backrefs. */
5d9a96ca 4079 st->ln = ARG1(scan); /* min to match */
c277df42 4080 n = ARG2(scan); /* max to match */
d8319b27
DM
4081 st->u.curlym.paren = scan->flags;
4082 if (st->u.curlym.paren) {
4083 if (st->u.curlym.paren > PL_regsize)
4084 PL_regsize = st->u.curlym.paren;
4085 if (st->u.curlym.paren > (I32)*PL_reglastparen)
4086 *PL_reglastparen = st->u.curlym.paren;
c277df42 4087 }
dc45a647 4088 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
d8319b27 4089 if (st->u.curlym.paren)
c277df42 4090 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 4091 PL_reginput = locinput;
d8319b27 4092 st->u.curlym.maxwanted = st->minmod ? st->ln : n;
0cadcf80
DM
4093 while (PL_reginput < PL_regeol && st->u.curlym.matches < st->u.curlym.maxwanted) {
4094 /* resume to current state on success */
4095 st->u.yes.prev_yes_state = yes_state;
4096 yes_state = st;
4097 REGMATCH(scan, CURLYM1);
4098 yes_state = st->u.yes.prev_yes_state;
4099 /*** all unsaved local vars undefined at this point */
4100 if (!result)
4101 break;
4102 /* on first match, determine length, u.curlym.l */
4103 if (!st->u.curlym.matches++) {
4104 if (PL_reg_match_utf8) {
4105 char *s = locinput;
4106 while (s < PL_reginput) {
4107 st->u.curlym.l++;
4108 s += UTF8SKIP(s);
6407bf3b
DM
4109 }
4110 }
0cadcf80
DM
4111 else {
4112 st->u.curlym.l = PL_reginput - locinput;
4113 }
4114 if (st->u.curlym.l == 0) {
4115 st->u.curlym.matches = st->u.curlym.maxwanted;
4116 break;
4117 }
6407bf3b 4118 }
0cadcf80 4119 locinput = PL_reginput;
6407bf3b
DM
4120 }
4121
4122 PL_reginput = locinput;
0cadcf80 4123 if (st->u.curlym.matches < st->ln) {
5d9a96ca 4124 st->minmod = 0;
0cadcf80
DM
4125 sayNO;
4126 }
5f80c4cf 4127
0cadcf80
DM
4128 DEBUG_EXECUTE_r(
4129 PerlIO_printf(Perl_debug_log,
4130 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
4131 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
4132 (IV) st->u.curlym.matches, (IV)st->u.curlym.l)
4133 );
4134
4135 /* calculate c1 and c1 for possible match of 1st char
4136 * following curly */
9e137952 4137 st->u.curlym.c1 = st->u.curlym.c2 = CHRTEST_VOID;
0cadcf80
DM
4138 if (HAS_TEXT(next) || JUMPABLE(next)) {
4139 regnode *text_node = next;
3dab1dad
YO
4140 if (! HAS_TEXT(text_node))
4141 FIND_NEXT_IMPT(text_node);
0cadcf80 4142 if (HAS_TEXT(text_node)
3dab1dad 4143 && PL_regkind[OP(text_node)] != REF)
0cadcf80
DM
4144 {
4145 st->u.curlym.c1 = (U8)*STRING(text_node);
4146 st->u.curlym.c2 =
4147 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4148 ? PL_fold[st->u.curlym.c1]
4149 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4150 ? PL_fold_locale[st->u.curlym.c1]
4151 : st->u.curlym.c1;
4152 }
4153 }
5f80c4cf 4154
0cadcf80
DM
4155 REGCP_SET(st->u.curlym.lastcp);
4156
4157 st->u.curlym.minmod = st->minmod;
4158 st->minmod = 0;
4159 while (st->u.curlym.matches >= st->ln
4160 && (st->u.curlym.matches <= n
4161 /* for REG_INFTY, ln could overflow to negative */
4162 || (n == REG_INFTY && st->u.curlym.matches >= 0)))
4163 {
4164 /* If it could work, try it. */
9e137952 4165 if (st->u.curlym.c1 == CHRTEST_VOID ||
0cadcf80
DM
4166 UCHARAT(PL_reginput) == st->u.curlym.c1 ||
4167 UCHARAT(PL_reginput) == st->u.curlym.c2)
4168 {
4169 DEBUG_EXECUTE_r(
4170 PerlIO_printf(Perl_debug_log,
4171 "%*s trying tail with matches=%"IVdf"...\n",
4172 (int)(REPORT_CODE_OFF+PL_regindent*2),
4173 "", (IV)st->u.curlym.matches)
4174 );
4175 if (st->u.curlym.paren) {
4176 if (st->u.curlym.matches) {
4177 PL_regstartp[st->u.curlym.paren]
4178 = HOPc(PL_reginput, -st->u.curlym.l) - PL_bostr;
4179 PL_regendp[st->u.curlym.paren] = PL_reginput - PL_bostr;
cca55fe3 4180 }
5f80c4cf 4181 else
0cadcf80 4182 PL_regendp[st->u.curlym.paren] = -1;
5f80c4cf 4183 }
0cadcf80
DM
4184 /* resume to current state on success */
4185 st->u.yes.prev_yes_state = yes_state;
4186 yes_state = st;
4187 REGMATCH(next, CURLYM2);
4188 yes_state = st->u.yes.prev_yes_state;
4189 /*** all unsaved local vars undefined at this point */
4190 if (result)
4191 /* XXX tmp sayYES; */
4192 sayYES_FINAL;
4193 REGCP_UNWIND(st->u.curlym.lastcp);
a0ed51b3 4194 }
0cadcf80
DM
4195 /* Couldn't or didn't -- move forward/backward. */
4196 if (st->u.curlym.minmod) {
3280af22 4197 PL_reginput = locinput;
dad79028
DM
4198 /* resume to current state on success */
4199 st->u.yes.prev_yes_state = yes_state;
4200 yes_state = st;
95b24440 4201 REGMATCH(scan, CURLYM3);
dad79028 4202 yes_state = st->u.yes.prev_yes_state;
95b24440
DM
4203 /*** all unsaved local vars undefined at this point */
4204 if (result) {
0cadcf80 4205 st->u.curlym.matches++;
3280af22 4206 locinput = PL_reginput;
c277df42
IZ
4207 }
4208 else
4209 sayNO;
4210 }
0cadcf80 4211 else {
d8319b27
DM
4212 st->u.curlym.matches--;
4213 locinput = HOPc(locinput, -st->u.curlym.l);
3280af22 4214 PL_reginput = locinput;
c277df42
IZ
4215 }
4216 }
4217 sayNO;
5f66b61c 4218 /* NOTREACHED */
c277df42
IZ
4219 break;
4220 }
4221 case CURLYN:
d8319b27
DM
4222 st->u.plus.paren = scan->flags; /* Which paren to set */
4223 if (st->u.plus.paren > PL_regsize)
4224 PL_regsize = st->u.plus.paren;
4225 if (st->u.plus.paren > (I32)*PL_reglastparen)
4226 *PL_reglastparen = st->u.plus.paren;
5d9a96ca 4227 st->ln = ARG1(scan); /* min to match */
c277df42 4228 n = ARG2(scan); /* max to match */
dc45a647 4229 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 4230 goto repeat;
a0d0e21e 4231 case CURLY:
d8319b27 4232 st->u.plus.paren = 0;
5d9a96ca 4233 st->ln = ARG1(scan); /* min to match */
a0d0e21e 4234 n = ARG2(scan); /* max to match */
dc45a647 4235 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
4236 goto repeat;
4237 case STAR:
5d9a96ca 4238 st->ln = 0;
c277df42 4239 n = REG_INFTY;
a0d0e21e 4240 scan = NEXTOPER(scan);
d8319b27 4241 st->u.plus.paren = 0;
a0d0e21e
LW
4242 goto repeat;
4243 case PLUS:
5d9a96ca 4244 st->ln = 1;
c277df42
IZ
4245 n = REG_INFTY;
4246 scan = NEXTOPER(scan);
d8319b27 4247 st->u.plus.paren = 0;
c277df42 4248 repeat:
a0d0e21e
LW
4249 /*
4250 * Lookahead to avoid useless match attempts
4251 * when we know what character comes next.
4252 */
5f80c4cf
JP
4253
4254 /*
4255 * Used to only do .*x and .*?x, but now it allows
4256 * for )'s, ('s and (?{ ... })'s to be in the way
4257 * of the quantifier and the EXACT-like node. -- japhy
4258 */
4259
cca55fe3 4260 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4261 U8 *s;
4262 regnode *text_node = next;
4263
3dab1dad
YO
4264 if (! HAS_TEXT(text_node))
4265 FIND_NEXT_IMPT(text_node);
5f80c4cf 4266
9e137952
DM
4267 if (! HAS_TEXT(text_node))
4268 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
5f80c4cf 4269 else {
3dab1dad 4270 if (PL_regkind[OP(text_node)] == REF) {
9e137952 4271 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
44a68960 4272 goto assume_ok_easy;
cca55fe3 4273 }
be8e71aa
YO
4274 else
4275 s = (U8*)STRING(text_node);
5f80c4cf
JP
4276
4277 if (!UTF) {
d8319b27 4278 st->u.plus.c2 = st->u.plus.c1 = *s;
f65d3ee7 4279 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 4280 st->u.plus.c2 = PL_fold[st->u.plus.c1];
f65d3ee7 4281 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 4282 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
1aa99e6b 4283 }
5f80c4cf 4284 else { /* UTF */
f65d3ee7 4285 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4286 STRLEN ulen1, ulen2;
89ebb4a3
JH
4287 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4288 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4289
4290 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4291 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4292
d8319b27 4293 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4294 uniflags);
d8319b27 4295 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4296 uniflags);
5f80c4cf
JP
4297 }
4298 else {
d8319b27 4299 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4300 uniflags);
5f80c4cf 4301 }
1aa99e6b
IH
4302 }
4303 }
bbce6d69 4304 }
a0d0e21e 4305 else
9e137952 4306 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
cca55fe3 4307 assume_ok_easy:
3280af22 4308 PL_reginput = locinput;
5d9a96ca
DM
4309 if (st->minmod) {
4310 st->minmod = 0;
32fc9b6a 4311 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4633a7c4 4312 sayNO;
a0ed51b3 4313 locinput = PL_reginput;
d8319b27 4314 REGCP_SET(st->u.plus.lastcp);
9e137952 4315 if (st->u.plus.c1 != CHRTEST_VOID) {
d8319b27
DM
4316 st->u.plus.old = locinput;
4317 st->u.plus.count = 0;
0fe9bf95 4318
1aa99e6b 4319 if (n == REG_INFTY) {
d8319b27 4320 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4321 if (do_utf8)
d8319b27
DM
4322 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4323 st->u.plus.e--;
1aa99e6b
IH
4324 }
4325 else if (do_utf8) {
5d9a96ca 4326 int m = n - st->ln;
d8319b27
DM
4327 for (st->u.plus.e = locinput;
4328 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4329 st->u.plus.e += UTF8SKIP(st->u.plus.e);
1aa99e6b
IH
4330 }
4331 else {
d8319b27
DM
4332 st->u.plus.e = locinput + n - st->ln;
4333 if (st->u.plus.e >= PL_regeol)
4334 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4335 }
0fe9bf95
IZ
4336 while (1) {
4337 /* Find place 'next' could work */
1aa99e6b 4338 if (!do_utf8) {
d8319b27
DM
4339 if (st->u.plus.c1 == st->u.plus.c2) {
4340 while (locinput <= st->u.plus.e &&
4341 UCHARAT(locinput) != st->u.plus.c1)
1aa99e6b
IH
4342 locinput++;
4343 } else {
d8319b27
DM
4344 while (locinput <= st->u.plus.e
4345 && UCHARAT(locinput) != st->u.plus.c1
4346 && UCHARAT(locinput) != st->u.plus.c2)
1aa99e6b
IH
4347 locinput++;
4348 }
d8319b27 4349 st->u.plus.count = locinput - st->u.plus.old;
1aa99e6b
IH
4350 }
4351 else {
d8319b27 4352 if (st->u.plus.c1 == st->u.plus.c2) {
a3b680e6 4353 STRLEN len;
872c91ae
JH
4354 /* count initialised to
4355 * utf8_distance(old, locinput) */
d8319b27 4356 while (locinput <= st->u.plus.e &&
872c91ae 4357 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4358 UTF8_MAXBYTES, &len,
d8319b27 4359 uniflags) != (UV)st->u.plus.c1) {
1aa99e6b 4360 locinput += len;
d8319b27 4361 st->u.plus.count++;
b2f2f093 4362 }
1aa99e6b 4363 } else {
872c91ae
JH
4364 /* count initialised to
4365 * utf8_distance(old, locinput) */
d8319b27 4366 while (locinput <= st->u.plus.e) {
c4fd8992
AL
4367 STRLEN len;
4368 const UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4369 UTF8_MAXBYTES, &len,
041457d9 4370 uniflags);
d8319b27 4371 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
1aa99e6b 4372 break;
b2f2f093 4373 locinput += len;
d8319b27 4374 st->u.plus.count++;
1aa99e6b
IH
4375 }
4376 }
0fe9bf95 4377 }
d8319b27 4378 if (locinput > st->u.plus.e)
0fe9bf95
IZ
4379 sayNO;
4380 /* PL_reginput == old now */
d8319b27 4381 if (locinput != st->u.plus.old) {
5d9a96ca 4382 st->ln = 1; /* Did some */
32fc9b6a 4383 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
0fe9bf95
IZ
4384 sayNO;
4385 }
4386 /* PL_reginput == locinput now */
d8319b27 4387 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
95b24440 4388 /*** all unsaved local vars undefined at this point */
0fe9bf95 4389 PL_reginput = locinput; /* Could be reset... */
d8319b27 4390 REGCP_UNWIND(st->u.plus.lastcp);
0fe9bf95 4391 /* Couldn't or didn't -- move forward. */
d8319b27 4392 st->u.plus.old = locinput;
1aa99e6b
IH
4393 if (do_utf8)
4394 locinput += UTF8SKIP(locinput);
4395 else
4396 locinput++;
d8319b27 4397 st->u.plus.count = 1;
0fe9bf95
IZ
4398 }
4399 }
4400 else
5d9a96ca 4401 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
1aa99e6b 4402 UV c;
9e137952 4403 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4404 if (do_utf8)
872c91ae 4405 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4406 UTF8_MAXBYTES, 0,
041457d9 4407 uniflags);
1aa99e6b 4408 else
9041c2e3 4409 c = UCHARAT(PL_reginput);
2390ecbc 4410 /* If it could work, try it. */
be8e71aa 4411 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
d8319b27 4412 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
95b24440 4413 /*** all unsaved local vars undefined at this point */
d8319b27 4414 REGCP_UNWIND(st->u.plus.lastcp);
2390ecbc 4415 }
1aa99e6b 4416 }
a0d0e21e 4417 /* If it could work, try it. */
be8e71aa 4418 else if (st->u.plus.c1 == CHRTEST_VOID) {
d8319b27 4419 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
95b24440 4420 /*** all unsaved local vars undefined at this point */
d8319b27 4421 REGCP_UNWIND(st->u.plus.lastcp);
bbce6d69 4422 }
c277df42 4423 /* Couldn't or didn't -- move forward. */
a0ed51b3 4424 PL_reginput = locinput;
32fc9b6a 4425 if (regrepeat(rex, scan, 1)) {
5d9a96ca 4426 st->ln++;
a0ed51b3
LW
4427 locinput = PL_reginput;
4428 }
4429 else
4633a7c4 4430 sayNO;
a0d0e21e
LW
4431 }
4432 }
4433 else {
32fc9b6a 4434 n = regrepeat(rex, scan, n);
a0ed51b3 4435 locinput = PL_reginput;
3dab1dad
YO
4436 if ((st->ln < n) && (PL_regkind[OP(next)] == EOL) &&
4437 (OP(next) != MEOL || OP(next) == SEOL || OP(next) == EOS))
15272685 4438 {
5d9a96ca 4439 st->ln = n; /* why back off? */
1aeab75a
GS
4440 /* ...because $ and \Z can match before *and* after
4441 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4442 We should back off by one in this case. */
4443 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
5d9a96ca 4444 st->ln--;
1aeab75a 4445 }
d8319b27 4446 REGCP_SET(st->u.plus.lastcp);
1d5c262f 4447 {
5d9a96ca 4448 while (n >= st->ln) {
be8e71aa 4449 UV c = 0;
9e137952 4450 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4451 if (do_utf8)
872c91ae 4452 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4453 UTF8_MAXBYTES, 0,
041457d9 4454 uniflags);
1aa99e6b 4455 else
9041c2e3 4456 c = UCHARAT(PL_reginput);
1aa99e6b 4457 }
c277df42 4458 /* If it could work, try it. */
be8e71aa
YO
4459 if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
4460 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
4461 /*** all unsaved local vars undefined at this point */
4462 REGCP_UNWIND(st->u.plus.lastcp);
4463 }
c277df42
IZ
4464 /* Couldn't or didn't -- back up. */
4465 n--;
dfe13c55 4466 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4467 }
a0d0e21e
LW
4468 }
4469 }
4633a7c4 4470 sayNO;
c277df42 4471 break;
a0d0e21e 4472 case END:
3b0527fe 4473 if (locinput < reginfo->till) {
a3621e74 4474 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4475 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4476 PL_colors[4],
4477 (long)(locinput - PL_reg_starttry),
3b0527fe 4478 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4479 PL_colors[5]));
4480 sayNO_FINAL; /* Cannot match: too short. */
4481 }
4482 PL_reginput = locinput; /* put where regtry can find it */
4483 sayYES_FINAL; /* Success! */
dad79028
DM
4484
4485 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4486 DEBUG_EXECUTE_r(
4487 PerlIO_printf(Perl_debug_log,
4488 "%*s %ssubpattern success...%s\n",
4489 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4490 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4491 sayYES_FINAL; /* Success! */
4492
4493 case SUSPEND: /* (?>FOO) */
4494 st->u.ifmatch.wanted = 1;
9fe1d20c 4495 PL_reginput = locinput;
9041c2e3 4496 goto do_ifmatch;
dad79028
DM
4497
4498 case UNLESSM: /* -ve lookaround: (?!FOO), or with flags, (?<!foo) */
4499 st->u.ifmatch.wanted = 0;
4500 goto ifmatch_trivial_fail_test;
4501
4502 case IFMATCH: /* +ve lookaround: (?=FOO), or with flags, (?<=foo) */
4503 st->u.ifmatch.wanted = 1;
4504 ifmatch_trivial_fail_test:
a0ed51b3 4505 if (scan->flags) {
52657f30 4506 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4507 if (!s) {
4508 /* trivial fail */
4509 if (st->logical) {
4510 st->logical = 0;
3dab1dad 4511 st->sw = 1 - (bool)st->u.ifmatch.wanted;
dad79028
DM
4512 }
4513 else if (st->u.ifmatch.wanted)
4514 sayNO;
4515 next = scan + ARG(scan);
4516 if (next == scan)
4517 next = NULL;
4518 break;
4519 }
efb30f32 4520 PL_reginput = s;
a0ed51b3
LW
4521 }
4522 else
4523 PL_reginput = locinput;
4524
c277df42 4525 do_ifmatch:
dad79028
DM
4526 /* resume to current state on success */
4527 st->u.yes.prev_yes_state = yes_state;
4528 yes_state = st;
4529 PUSH_STATE(newst, resume_IFMATCH);
4530 st = newst;
4531 next = NEXTOPER(NEXTOPER(scan));
4532 break;
4533
c277df42 4534 case LONGJMP:
c277df42
IZ
4535 next = scan + ARG(scan);
4536 if (next == scan)
4537 next = NULL;
a0d0e21e
LW
4538 break;
4539 default:
b900a521 4540 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4541 PTR2UV(scan), OP(scan));
cea2e8a9 4542 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4543 }
95b24440 4544
02db2b7b 4545 reenter:
a0d0e21e 4546 scan = next;
95b24440
DM
4547 continue;
4548 /* NOTREACHED */
4549
4550 /* simulate recursively calling regmatch(), but without actually
4551 * recursing - ie save the current state on the heap rather than on
4552 * the stack, then re-enter the loop. This avoids complex regexes
4553 * blowing the processor stack */
4554
4555 start_recurse:
4556 {
5d9a96ca
DM
4557 /* push new state */
4558 regmatch_state *oldst = st;
4559
4560 depth++;
4561
4562 /* grab the next free state slot */
4563 st++;
86545054 4564 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4565 st = S_push_slab(aTHX);
4566 PL_regmatch_state = st;
4567
4568 oldst->next = next;
4569 oldst->n = n;
4570 oldst->locinput = locinput;
5d9a96ca
DM
4571
4572 st->cc = oldst->cc;
95b24440
DM
4573 locinput = PL_reginput;
4574 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4575 st->minmod = 0;
4576 st->sw = 0;
4577 st->logical = 0;
4578 st->unwind = 0;
95b24440
DM
4579#ifdef DEBUGGING
4580 PL_regindent++;
4581#endif
4582 }
a0d0e21e 4583 }
a687059c 4584
aa283a38
DM
4585
4586
a0d0e21e
LW
4587 /*
4588 * We get here only if there's trouble -- normally "case END" is
4589 * the terminating point.
4590 */
cea2e8a9 4591 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4592 /*NOTREACHED*/
4633a7c4
LW
4593 sayNO;
4594
7821416a 4595yes_final:
77cb431f
DM
4596
4597 if (yes_state) {
4598 /* we have successfully completed a subexpression, but we must now
4599 * pop to the state marked by yes_state and continue from there */
4600
dad79028 4601 /*XXX tmp for CURLYM*/
c4fd8992
AL
4602 regmatch_slab * const oslab = PL_regmatch_slab;
4603 regmatch_state * const ost = st;
4604 regmatch_state * const oys = yes_state;
dad79028
DM
4605 int odepth = depth;
4606
77cb431f
DM
4607 assert(st != yes_state);
4608 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4609 || yes_state > SLAB_LAST(PL_regmatch_slab))
4610 {
4611 /* not in this slab, pop slab */
4612 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4613 PL_regmatch_slab = PL_regmatch_slab->prev;
4614 st = SLAB_LAST(PL_regmatch_slab);
4615 }
4616 depth -= (st - yes_state);
dad79028 4617 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE TO (%d)\n", depth));
77cb431f
DM
4618 st = yes_state;
4619 yes_state = st->u.yes.prev_yes_state;
4620 PL_regmatch_state = st;
4621
4622 switch (st->resume_state) {
4623 case resume_EVAL:
4624 if (st->u.eval.toggleutf)
4625 PL_reg_flags ^= RF_utf8;
4626 ReREFCNT_dec(rex);
4627 rex = st->u.eval.prev_rex;
4628 /* XXXX This is too dramatic a measure... */
4629 PL_reg_maxiter = 0;
4630 /* Restore parens of the caller without popping the
4631 * savestack */
4632 {
c4fd8992 4633 const I32 tmp = PL_savestack_ix;
77cb431f
DM
4634 PL_savestack_ix = st->u.eval.lastcp;
4635 regcppop(rex);
4636 PL_savestack_ix = tmp;
4637 }
4638 PL_reginput = locinput;
4639 /* continue at the node following the (??{...}) */
4640 next = st->next;
4641 goto reenter;
4642
dad79028
DM
4643 case resume_IFMATCH:
4644 if (st->logical) {
4645 st->logical = 0;
3dab1dad 4646 st->sw = (bool)st->u.ifmatch.wanted;
dad79028
DM
4647 }
4648 else if (!st->u.ifmatch.wanted)
4649 sayNO;
4650
4651 if (OP(st->scan) == SUSPEND)
4652 locinput = PL_reginput;
4653 else {
4654 locinput = PL_reginput = st->locinput;
4655 nextchr = UCHARAT(locinput);
4656 }
4657 next = st->scan + ARG(st->scan);
4658 if (next == st->scan)
4659 next = NULL;
4660 goto reenter;
4661
4662 /* XXX tmp don't handle yes_state yet */
4663 case resume_CURLYM1:
4664 case resume_CURLYM2:
4665 case resume_CURLYM3:
dad79028
DM
4666 PL_regmatch_slab =oslab;
4667 st = ost;
4668 PL_regmatch_state = st;
4669 depth = odepth;
4670 yes_state = oys;
4671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "XXX revering a CURLYM\n"));
4672 goto yes;
4673
77cb431f
DM
4674 default:
4675 Perl_croak(aTHX_ "unexpected yes reume state");
4676 }
4677 }
4678
a3621e74 4679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4680 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4681yes:
4682#ifdef DEBUGGING
3280af22 4683 PL_regindent--;
4633a7c4 4684#endif
02db2b7b 4685
95b24440 4686 result = 1;
aa283a38 4687 /* XXX this is duplicate(ish) code to that in the do_no section.
77cb431f
DM
4688 * eventually a yes should just pop the stack back to the current
4689 * yes_state */
aa283a38
DM
4690 if (depth) {
4691 /* restore previous state and re-enter */
4692 POP_STATE;
4693
4694 switch (st->resume_state) {
4695 case resume_TRIE1:
4696 goto resume_point_TRIE1;
4697 case resume_TRIE2:
4698 goto resume_point_TRIE2;
aa283a38
DM
4699 case resume_CURLYX:
4700 goto resume_point_CURLYX;
4701 case resume_WHILEM1:
4702 goto resume_point_WHILEM1;
4703 case resume_WHILEM2:
4704 goto resume_point_WHILEM2;
4705 case resume_WHILEM3:
4706 goto resume_point_WHILEM3;
4707 case resume_WHILEM4:
4708 goto resume_point_WHILEM4;
4709 case resume_WHILEM5:
4710 goto resume_point_WHILEM5;
4711 case resume_WHILEM6:
4712 goto resume_point_WHILEM6;
4713 case resume_CURLYM1:
4714 goto resume_point_CURLYM1;
4715 case resume_CURLYM2:
4716 goto resume_point_CURLYM2;
4717 case resume_CURLYM3:
4718 goto resume_point_CURLYM3;
aa283a38
DM
4719 case resume_PLUS1:
4720 goto resume_point_PLUS1;
4721 case resume_PLUS2:
4722 goto resume_point_PLUS2;
4723 case resume_PLUS3:
4724 goto resume_point_PLUS3;
4725 case resume_PLUS4:
4726 goto resume_point_PLUS4;
77cb431f 4727
dad79028 4728 case resume_IFMATCH:
77cb431f 4729 case resume_EVAL:
aa283a38
DM
4730 default:
4731 Perl_croak(aTHX_ "regexp resume memory corruption");
4732 }
4733 }
4734 goto final_exit;
4633a7c4
LW
4735
4736no:
a3621e74 4737 DEBUG_EXECUTE_r(
7821416a
IZ
4738 PerlIO_printf(Perl_debug_log,
4739 "%*s %sfailed...%s\n",
e4584336 4740 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4741 );
4742 goto do_no;
4743no_final:
4744do_no:
5d9a96ca
DM
4745 if (st->unwind) {
4746 re_unwind_t * const uw = SSPTRt(st->unwind,re_unwind_t);
02db2b7b
IZ
4747
4748 switch (uw->type) {
4749 case RE_UNWIND_BRANCH:
4750 case RE_UNWIND_BRANCHJ:
4751 {
6136c704 4752 re_unwind_branch_t * const uwb = &(uw->branch);
a3b680e6 4753 const I32 lastparen = uwb->lastparen;
9041c2e3 4754
02db2b7b
IZ
4755 REGCP_UNWIND(uwb->lastcp);
4756 for (n = *PL_reglastparen; n > lastparen; n--)
4757 PL_regendp[n] = -1;
4758 *PL_reglastparen = n;
4759 scan = next = uwb->next;
3a2830be 4760 st->minmod = uwb->minmod;
9041c2e3
NIS
4761 if ( !scan ||
4762 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b 4763 ? BRANCH : BRANCHJ) ) { /* Failure */
5d9a96ca 4764 st->unwind = uwb->prev;
02db2b7b
IZ
4765#ifdef DEBUGGING
4766 PL_regindent--;
4767#endif
4768 goto do_no;
4769 }
4770 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4771 if ((n = (uwb->type == RE_UNWIND_BRANCH
4772 ? NEXT_OFF(next) : ARG(next))))
4773 next += n;
4774 else
4775 next = NULL; /* XXXX Needn't unwinding in this case... */
4776 uwb->next = next;
4777 next = NEXTOPER(scan);
4778 if (uwb->type == RE_UNWIND_BRANCHJ)
4779 next = NEXTOPER(next);
4780 locinput = uwb->locinput;
4781 nextchr = uwb->nextchr;
4782#ifdef DEBUGGING
4783 PL_regindent = uwb->regindent;
4784#endif
4785
4786 goto reenter;
4787 }
5f66b61c 4788 /* NOTREACHED */
02db2b7b
IZ
4789 default:
4790 Perl_croak(aTHX_ "regexp unwind memory corruption");
4791 }
5f66b61c 4792 /* NOTREACHED */
02db2b7b 4793 }
aa283a38 4794
4633a7c4 4795#ifdef DEBUGGING
3280af22 4796 PL_regindent--;
4633a7c4 4797#endif
95b24440 4798 result = 0;
5d9a96ca 4799
aa283a38
DM
4800 if (depth) {
4801 /* there's a previous state to backtrack to */
4802 POP_STATE;
5d9a96ca 4803 switch (st->resume_state) {
95b24440
DM
4804 case resume_TRIE1:
4805 goto resume_point_TRIE1;
4806 case resume_TRIE2:
4807 goto resume_point_TRIE2;
aa283a38
DM
4808 case resume_EVAL:
4809 /* we have failed an (??{...}). Restore state to the outer re
4810 * then re-throw the failure */
4811 if (st->u.eval.toggleutf)
4812 PL_reg_flags ^= RF_utf8;
4813 ReREFCNT_dec(rex);
4814 rex = st->u.eval.prev_rex;
77cb431f 4815 yes_state = st->u.yes.prev_yes_state;
aa283a38
DM
4816
4817 /* XXXX This is too dramatic a measure... */
4818 PL_reg_maxiter = 0;
4819
4820 PL_reginput = locinput;
4821 REGCP_UNWIND(st->u.eval.lastcp);
4822 regcppop(rex);
4823 goto do_no;
4824
95b24440
DM
4825 case resume_CURLYX:
4826 goto resume_point_CURLYX;
4827 case resume_WHILEM1:
4828 goto resume_point_WHILEM1;
4829 case resume_WHILEM2:
4830 goto resume_point_WHILEM2;
4831 case resume_WHILEM3:
4832 goto resume_point_WHILEM3;
4833 case resume_WHILEM4:
4834 goto resume_point_WHILEM4;
4835 case resume_WHILEM5:
4836 goto resume_point_WHILEM5;
4837 case resume_WHILEM6:
4838 goto resume_point_WHILEM6;
4839 case resume_CURLYM1:
4840 goto resume_point_CURLYM1;
4841 case resume_CURLYM2:
4842 goto resume_point_CURLYM2;
4843 case resume_CURLYM3:
4844 goto resume_point_CURLYM3;
95b24440 4845 case resume_IFMATCH:
dad79028
DM
4846 yes_state = st->u.yes.prev_yes_state;
4847 if (st->logical) {
4848 st->logical = 0;
4849 st->sw = !st->u.ifmatch.wanted;
4850 }
4851 else if (st->u.ifmatch.wanted)
4852 sayNO;
4853
4854 assert(OP(scan) != SUSPEND); /* XXX DAPM tmp */
4855 locinput = PL_reginput = st->locinput;
4856 nextchr = UCHARAT(locinput);
4857 next = scan + ARG(scan);
4858 if (next == scan)
4859 next = NULL;
4860 goto reenter;
4861
95b24440
DM
4862 case resume_PLUS1:
4863 goto resume_point_PLUS1;
4864 case resume_PLUS2:
4865 goto resume_point_PLUS2;
4866 case resume_PLUS3:
4867 goto resume_point_PLUS3;
4868 case resume_PLUS4:
4869 goto resume_point_PLUS4;
95b24440
DM
4870 default:
4871 Perl_croak(aTHX_ "regexp resume memory corruption");
4872 }
95b24440 4873 }
aa283a38
DM
4874
4875final_exit:
4876
5d9a96ca
DM
4877 /* restore original high-water mark */
4878 PL_regmatch_slab = orig_slab;
4879 PL_regmatch_state = orig_state;
4880
4881 /* free all slabs above current one */
4882 if (orig_slab->next) {
c4fd8992 4883 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4884 orig_slab->next = NULL;
4885 while (sl) {
c4fd8992 4886 regmatch_slab * const osl = sl;
5d9a96ca 4887 sl = sl->next;
ad65c075 4888 Safefree(osl);
5d9a96ca
DM
4889 }
4890 }
4891
95b24440
DM
4892 return result;
4893
a687059c
LW
4894}
4895
4896/*
4897 - regrepeat - repeatedly match something simple, report how many
4898 */
4899/*
4900 * [This routine now assumes that it will only match on things of length 1.
4901 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4902 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4903 */
76e3520e 4904STATIC I32
32fc9b6a 4905S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4906{
27da23d5 4907 dVAR;
a0d0e21e 4908 register char *scan;
a0d0e21e 4909 register I32 c;
3280af22 4910 register char *loceol = PL_regeol;
a0ed51b3 4911 register I32 hardcount = 0;
53c4c00c 4912 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4913
3280af22 4914 scan = PL_reginput;
faf11cac
HS
4915 if (max == REG_INFTY)
4916 max = I32_MAX;
4917 else if (max < loceol - scan)
7f596f4c 4918 loceol = scan + max;
a0d0e21e 4919 switch (OP(p)) {
22c35a8c 4920 case REG_ANY:
1aa99e6b 4921 if (do_utf8) {
ffc61ed2 4922 loceol = PL_regeol;
1aa99e6b 4923 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4924 scan += UTF8SKIP(scan);
4925 hardcount++;
4926 }
4927 } else {
4928 while (scan < loceol && *scan != '\n')
4929 scan++;
a0ed51b3
LW
4930 }
4931 break;
ffc61ed2 4932 case SANY:
def8e4ea
JH
4933 if (do_utf8) {
4934 loceol = PL_regeol;
a0804c9e 4935 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4936 scan += UTF8SKIP(scan);
4937 hardcount++;
4938 }
4939 }
4940 else
4941 scan = loceol;
a0ed51b3 4942 break;
f33976b4
DB
4943 case CANY:
4944 scan = loceol;
4945 break;
090f7165
JH
4946 case EXACT: /* length of string is 1 */
4947 c = (U8)*STRING(p);
4948 while (scan < loceol && UCHARAT(scan) == c)
4949 scan++;
bbce6d69 4950 break;
4951 case EXACTF: /* length of string is 1 */
cd439c50 4952 c = (U8)*STRING(p);
bbce6d69 4953 while (scan < loceol &&
22c35a8c 4954 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4955 scan++;
4956 break;
4957 case EXACTFL: /* length of string is 1 */
3280af22 4958 PL_reg_flags |= RF_tainted;
cd439c50 4959 c = (U8)*STRING(p);
bbce6d69 4960 while (scan < loceol &&
22c35a8c 4961 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4962 scan++;
4963 break;
4964 case ANYOF:
ffc61ed2
JH
4965 if (do_utf8) {
4966 loceol = PL_regeol;
cfc92286 4967 while (hardcount < max && scan < loceol &&
32fc9b6a 4968 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4969 scan += UTF8SKIP(scan);
4970 hardcount++;
4971 }
4972 } else {
32fc9b6a 4973 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4974 scan++;
4975 }
a0d0e21e
LW
4976 break;
4977 case ALNUM:
1aa99e6b 4978 if (do_utf8) {
ffc61ed2 4979 loceol = PL_regeol;
1a4fad37 4980 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4981 while (hardcount < max && scan < loceol &&
3568d838 4982 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4983 scan += UTF8SKIP(scan);
4984 hardcount++;
4985 }
4986 } else {
4987 while (scan < loceol && isALNUM(*scan))
4988 scan++;
a0ed51b3
LW
4989 }
4990 break;
bbce6d69 4991 case ALNUML:
3280af22 4992 PL_reg_flags |= RF_tainted;
1aa99e6b 4993 if (do_utf8) {
ffc61ed2 4994 loceol = PL_regeol;
1aa99e6b
IH
4995 while (hardcount < max && scan < loceol &&
4996 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4997 scan += UTF8SKIP(scan);
4998 hardcount++;
4999 }
5000 } else {
5001 while (scan < loceol && isALNUM_LC(*scan))
5002 scan++;
a0ed51b3
LW
5003 }
5004 break;
a0d0e21e 5005 case NALNUM:
1aa99e6b 5006 if (do_utf8) {
ffc61ed2 5007 loceol = PL_regeol;
1a4fad37 5008 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5009 while (hardcount < max && scan < loceol &&
3568d838 5010 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5011 scan += UTF8SKIP(scan);
5012 hardcount++;
5013 }
5014 } else {
5015 while (scan < loceol && !isALNUM(*scan))
5016 scan++;
a0ed51b3
LW
5017 }
5018 break;
bbce6d69 5019 case NALNUML:
3280af22 5020 PL_reg_flags |= RF_tainted;
1aa99e6b 5021 if (do_utf8) {
ffc61ed2 5022 loceol = PL_regeol;
1aa99e6b
IH
5023 while (hardcount < max && scan < loceol &&
5024 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5025 scan += UTF8SKIP(scan);
5026 hardcount++;
5027 }
5028 } else {
5029 while (scan < loceol && !isALNUM_LC(*scan))
5030 scan++;
a0ed51b3
LW
5031 }
5032 break;
a0d0e21e 5033 case SPACE:
1aa99e6b 5034 if (do_utf8) {
ffc61ed2 5035 loceol = PL_regeol;
1a4fad37 5036 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5037 while (hardcount < max && scan < loceol &&
3568d838
JH
5038 (*scan == ' ' ||
5039 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5040 scan += UTF8SKIP(scan);
5041 hardcount++;
5042 }
5043 } else {
5044 while (scan < loceol && isSPACE(*scan))
5045 scan++;
a0ed51b3
LW
5046 }
5047 break;
bbce6d69 5048 case SPACEL:
3280af22 5049 PL_reg_flags |= RF_tainted;
1aa99e6b 5050 if (do_utf8) {
ffc61ed2 5051 loceol = PL_regeol;
1aa99e6b 5052 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5053 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5054 scan += UTF8SKIP(scan);
5055 hardcount++;
5056 }
5057 } else {
5058 while (scan < loceol && isSPACE_LC(*scan))
5059 scan++;
a0ed51b3
LW
5060 }
5061 break;
a0d0e21e 5062 case NSPACE:
1aa99e6b 5063 if (do_utf8) {
ffc61ed2 5064 loceol = PL_regeol;
1a4fad37 5065 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5066 while (hardcount < max && scan < loceol &&
3568d838
JH
5067 !(*scan == ' ' ||
5068 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5069 scan += UTF8SKIP(scan);
5070 hardcount++;
5071 }
5072 } else {
5073 while (scan < loceol && !isSPACE(*scan))
5074 scan++;
5075 break;
a0ed51b3 5076 }
bbce6d69 5077 case NSPACEL:
3280af22 5078 PL_reg_flags |= RF_tainted;
1aa99e6b 5079 if (do_utf8) {
ffc61ed2 5080 loceol = PL_regeol;
1aa99e6b 5081 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5082 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5083 scan += UTF8SKIP(scan);
5084 hardcount++;
5085 }
5086 } else {
5087 while (scan < loceol && !isSPACE_LC(*scan))
5088 scan++;
a0ed51b3
LW
5089 }
5090 break;
a0d0e21e 5091 case DIGIT:
1aa99e6b 5092 if (do_utf8) {
ffc61ed2 5093 loceol = PL_regeol;
1a4fad37 5094 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5095 while (hardcount < max && scan < loceol &&
3568d838 5096 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5097 scan += UTF8SKIP(scan);
5098 hardcount++;
5099 }
5100 } else {
5101 while (scan < loceol && isDIGIT(*scan))
5102 scan++;
a0ed51b3
LW
5103 }
5104 break;
a0d0e21e 5105 case NDIGIT:
1aa99e6b 5106 if (do_utf8) {
ffc61ed2 5107 loceol = PL_regeol;
1a4fad37 5108 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5109 while (hardcount < max && scan < loceol &&
3568d838 5110 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5111 scan += UTF8SKIP(scan);
5112 hardcount++;
5113 }
5114 } else {
5115 while (scan < loceol && !isDIGIT(*scan))
5116 scan++;
a0ed51b3
LW
5117 }
5118 break;
a0d0e21e
LW
5119 default: /* Called on something of 0 width. */
5120 break; /* So match right here or not at all. */
5121 }
a687059c 5122
a0ed51b3
LW
5123 if (hardcount)
5124 c = hardcount;
5125 else
5126 c = scan - PL_reginput;
3280af22 5127 PL_reginput = scan;
a687059c 5128
a3621e74 5129 DEBUG_r({
e68ec53f 5130 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5131 DEBUG_EXECUTE_r({
e68ec53f
YO
5132 SV * const prop = sv_newmortal();
5133 regprop(prog, prop, p);
5134 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
5135 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5136 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5137 });
be8e71aa 5138 });
9041c2e3 5139
a0d0e21e 5140 return(c);
a687059c
LW
5141}
5142
c277df42 5143
be8e71aa 5144#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5145/*
ffc61ed2
JH
5146- regclass_swash - prepare the utf8 swash
5147*/
5148
5149SV *
32fc9b6a 5150Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5151{
97aff369 5152 dVAR;
9e55ce06
JH
5153 SV *sw = NULL;
5154 SV *si = NULL;
5155 SV *alt = NULL;
3dab1dad 5156 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5157
4f639d21 5158 if (data && data->count) {
a3b680e6 5159 const U32 n = ARG(node);
ffc61ed2 5160
4f639d21
DM
5161 if (data->what[n] == 's') {
5162 SV * const rv = (SV*)data->data[n];
890ce7af 5163 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5164 SV **const ary = AvARRAY(av);
9e55ce06 5165 SV **a, **b;
9041c2e3 5166
711a919c 5167 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5168 * documentation of these array elements. */
5169
b11f357e 5170 si = *ary;
8f7f7219 5171 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5172 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5173
ffc61ed2
JH
5174 if (a)
5175 sw = *a;
5176 else if (si && doinit) {
5177 sw = swash_init("utf8", "", si, 1, 0);
5178 (void)av_store(av, 1, sw);
5179 }
9e55ce06
JH
5180 if (b)
5181 alt = *b;
ffc61ed2
JH
5182 }
5183 }
5184
9e55ce06
JH
5185 if (listsvp)
5186 *listsvp = si;
5187 if (altsvp)
5188 *altsvp = alt;
ffc61ed2
JH
5189
5190 return sw;
5191}
76234dfb 5192#endif
ffc61ed2
JH
5193
5194/*
ba7b4546 5195 - reginclass - determine if a character falls into a character class
832705d4
JH
5196
5197 The n is the ANYOF regnode, the p is the target string, lenp
5198 is pointer to the maximum length of how far to go in the p
5199 (if the lenp is zero, UTF8SKIP(p) is used),
5200 do_utf8 tells whether the target string is in UTF-8.
5201
bbce6d69 5202 */
5203
76e3520e 5204STATIC bool
32fc9b6a 5205S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5206{
27da23d5 5207 dVAR;
a3b680e6 5208 const char flags = ANYOF_FLAGS(n);
bbce6d69 5209 bool match = FALSE;
cc07378b 5210 UV c = *p;
ae9ddab8 5211 STRLEN len = 0;
9e55ce06 5212 STRLEN plen;
1aa99e6b 5213
19f67299
TS
5214 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5215 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5216 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5217 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5218 if (len == (STRLEN)-1)
5219 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5220 }
bbce6d69 5221
0f0076b4 5222 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5223 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5224 if (lenp)
5225 *lenp = 0;
ffc61ed2 5226 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5227 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5228 match = TRUE;
bbce6d69 5229 }
3568d838 5230 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5231 match = TRUE;
ffc61ed2 5232 if (!match) {
9e55ce06 5233 AV *av;
32fc9b6a 5234 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5235
5236 if (sw) {
3568d838 5237 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5238 match = TRUE;
5239 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5240 if (!match && lenp && av) {
5241 I32 i;
9e55ce06 5242 for (i = 0; i <= av_len(av); i++) {
890ce7af 5243 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5244 STRLEN len;
890ce7af 5245 const char * const s = SvPV_const(sv, len);
9e55ce06 5246
061b10df 5247 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5248 *lenp = len;
5249 match = TRUE;
5250 break;
5251 }
5252 }
5253 }
5254 if (!match) {
89ebb4a3 5255 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5256 STRLEN tmplen;
5257
9e55ce06
JH
5258 to_utf8_fold(p, tmpbuf, &tmplen);
5259 if (swash_fetch(sw, tmpbuf, do_utf8))
5260 match = TRUE;
5261 }
ffc61ed2
JH
5262 }
5263 }
bbce6d69 5264 }
9e55ce06 5265 if (match && lenp && *lenp == 0)
0f0076b4 5266 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5267 }
1aa99e6b 5268 if (!match && c < 256) {
ffc61ed2
JH
5269 if (ANYOF_BITMAP_TEST(n, c))
5270 match = TRUE;
5271 else if (flags & ANYOF_FOLD) {
eb160463 5272 U8 f;
a0ed51b3 5273
ffc61ed2
JH
5274 if (flags & ANYOF_LOCALE) {
5275 PL_reg_flags |= RF_tainted;
5276 f = PL_fold_locale[c];
5277 }
5278 else
5279 f = PL_fold[c];
5280 if (f != c && ANYOF_BITMAP_TEST(n, f))
5281 match = TRUE;
5282 }
5283
5284 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5285 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5286 if (
5287 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5288 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5289 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5290 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5291 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5292 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5293 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5294 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5295 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5296 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5297 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5298 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5299 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5300 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5301 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5302 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5303 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5304 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5305 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5306 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5307 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5308 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5309 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5310 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5311 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5312 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5313 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5314 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5315 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5316 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5317 ) /* How's that for a conditional? */
5318 {
5319 match = TRUE;
5320 }
a0ed51b3 5321 }
a0ed51b3
LW
5322 }
5323
a0ed51b3
LW
5324 return (flags & ANYOF_INVERT) ? !match : match;
5325}
161b471a 5326
dfe13c55 5327STATIC U8 *
0ce71af7 5328S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5329{
97aff369 5330 dVAR;
a0ed51b3 5331 if (off >= 0) {
1aa99e6b 5332 while (off-- && s < lim) {
ffc61ed2 5333 /* XXX could check well-formedness here */
a0ed51b3 5334 s += UTF8SKIP(s);
ffc61ed2 5335 }
a0ed51b3
LW
5336 }
5337 else {
5338 while (off++) {
1aa99e6b 5339 if (s > lim) {
a0ed51b3 5340 s--;
ffc61ed2 5341 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5342 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5343 s--;
ffc61ed2
JH
5344 }
5345 /* XXX could check well-formedness here */
a0ed51b3
LW
5346 }
5347 }
5348 }
5349 return s;
5350}
161b471a 5351
dfe13c55 5352STATIC U8 *
0ce71af7 5353S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5354{
97aff369 5355 dVAR;
a0ed51b3 5356 if (off >= 0) {
1aa99e6b 5357 while (off-- && s < lim) {
ffc61ed2 5358 /* XXX could check well-formedness here */
a0ed51b3 5359 s += UTF8SKIP(s);
ffc61ed2 5360 }
a0ed51b3 5361 if (off >= 0)
3dab1dad 5362 return NULL;
a0ed51b3
LW
5363 }
5364 else {
5365 while (off++) {
1aa99e6b 5366 if (s > lim) {
a0ed51b3 5367 s--;
ffc61ed2 5368 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5369 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5370 s--;
ffc61ed2
JH
5371 }
5372 /* XXX could check well-formedness here */
a0ed51b3
LW
5373 }
5374 else
5375 break;
5376 }
5377 if (off <= 0)
3dab1dad 5378 return NULL;
a0ed51b3
LW
5379 }
5380 return s;
5381}
51371543 5382
51371543 5383static void
acfe0abc 5384restore_pos(pTHX_ void *arg)
51371543 5385{
97aff369 5386 dVAR;
097eb12c 5387 regexp * const rex = (regexp *)arg;
51371543
GS
5388 if (PL_reg_eval_set) {
5389 if (PL_reg_oldsaved) {
4f639d21
DM
5390 rex->subbeg = PL_reg_oldsaved;
5391 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5392#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5393 rex->saved_copy = PL_nrs;
ed252734 5394#endif
4f639d21 5395 RX_MATCH_COPIED_on(rex);
51371543
GS
5396 }
5397 PL_reg_magic->mg_len = PL_reg_oldpos;
5398 PL_reg_eval_set = 0;
5399 PL_curpm = PL_reg_oldcurpm;
5400 }
5401}
33b8afdf
JH
5402
5403STATIC void
5404S_to_utf8_substr(pTHX_ register regexp *prog)
5405{
33b8afdf 5406 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5407 SV* const sv = newSVsv(prog->float_substr);
5408 prog->float_utf8 = sv;
33b8afdf
JH
5409 sv_utf8_upgrade(sv);
5410 if (SvTAIL(prog->float_substr))
5411 SvTAIL_on(sv);
5412 if (prog->float_substr == prog->check_substr)
5413 prog->check_utf8 = sv;
5414 }
5415 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5416 SV* const sv = newSVsv(prog->anchored_substr);
5417 prog->anchored_utf8 = sv;
33b8afdf
JH
5418 sv_utf8_upgrade(sv);
5419 if (SvTAIL(prog->anchored_substr))
5420 SvTAIL_on(sv);
5421 if (prog->anchored_substr == prog->check_substr)
5422 prog->check_utf8 = sv;
5423 }
5424}
5425
5426STATIC void
5427S_to_byte_substr(pTHX_ register regexp *prog)
5428{
97aff369 5429 dVAR;
33b8afdf 5430 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5431 SV* sv = newSVsv(prog->float_utf8);
5432 prog->float_substr = sv;
33b8afdf
JH
5433 if (sv_utf8_downgrade(sv, TRUE)) {
5434 if (SvTAIL(prog->float_utf8))
5435 SvTAIL_on(sv);
5436 } else {
5437 SvREFCNT_dec(sv);
5438 prog->float_substr = sv = &PL_sv_undef;
5439 }
5440 if (prog->float_utf8 == prog->check_utf8)
5441 prog->check_substr = sv;
5442 }
5443 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5444 SV* sv = newSVsv(prog->anchored_utf8);
5445 prog->anchored_substr = sv;
33b8afdf
JH
5446 if (sv_utf8_downgrade(sv, TRUE)) {
5447 if (SvTAIL(prog->anchored_utf8))
5448 SvTAIL_on(sv);
5449 } else {
5450 SvREFCNT_dec(sv);
5451 prog->anchored_substr = sv = &PL_sv_undef;
5452 }
5453 if (prog->anchored_utf8 == prog->check_utf8)
5454 prog->check_substr = sv;
5455 }
5456}
66610fdd
RGS
5457
5458/*
5459 * Local variables:
5460 * c-indentation-style: bsd
5461 * c-basic-offset: 4
5462 * indent-tabs-mode: t
5463 * End:
5464 *
37442d52
RGS
5465 * ex: set ts=8 sts=4 sw=4 noet:
5466 */