This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Updated handling of signal names and signals for VMS
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
a01268b5 169#define REGCP_OTHER_ELEMS 6
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
a3621e74 199# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
a3621e74 203# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
097eb12c 209S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 210{
97aff369 211 dVAR;
b1ce53c5 212 I32 i;
a0d0e21e 213 char *input;
b1ce53c5 214
a3621e74
YO
215 GET_RE_DEBUG_FLAGS_DECL;
216
b1ce53c5 217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 218 i = SSPOPINT;
b1ce53c5
JH
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 221 input = (char *) SSPOPPTR;
a01268b5 222 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
b1ce53c5
JH
225
226 /* Now restore the parentheses context. */
41123dfd
JH
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 229 I32 tmps;
097eb12c 230 U32 paren = (U32)SSPOPINT;
3280af22 231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
232 PL_regstartp[paren] = SSPOPINT;
233 tmps = SSPOPINT;
3280af22
NIS
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
a3621e74 236 DEBUG_EXECUTE_r(
c3464db5 237 PerlIO_printf(Perl_debug_log,
b900a521 238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 239 (UV)paren, (IV)PL_regstartp[paren],
b900a521 240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 241 (IV)PL_regendp[paren],
3280af22 242 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 243 );
a0d0e21e 244 }
a3621e74 245 DEBUG_EXECUTE_r(
bb7a0f54 246 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 247 PerlIO_printf(Perl_debug_log,
faccc32b 248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
250 }
251 );
daf18116 252#if 1
dafc8851
JH
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 * --jhi */
bb7a0f54 263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
264 if (i > PL_regsize)
265 PL_regstartp[i] = -1;
266 PL_regendp[i] = -1;
a0d0e21e 267 }
dafc8851 268#endif
a0d0e21e
LW
269 return input;
270}
271
02db2b7b 272#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 273
95b24440 274#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
275 if (paren) { \
276 if (n) { \
277 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
278 PL_regendp[paren] = input - PL_bostr; \
279 } \
280 else \
281 PL_regendp[paren] = -1; \
282 } \
95b24440
DM
283 REGMATCH(next, where); \
284 if (result) \
29d1e993
HS
285 sayYES; \
286 if (paren && n) \
287 PL_regendp[paren] = -1; \
288}
289
290
a687059c 291/*
e50aee73 292 * pregexec and friends
a687059c
LW
293 */
294
76234dfb 295#ifndef PERL_IN_XSUB_RE
a687059c 296/*
c277df42 297 - pregexec - match a regexp against a string
a687059c 298 */
c277df42 299I32
864dbfa3 300Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 301 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
302/* strend: pointer to null at end of string */
303/* strbeg: real beginning of string */
304/* minend: end of match must be >=minend after stringarg. */
305/* nosave: For optimizations. */
306{
307 return
9041c2e3 308 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
309 nosave ? 0 : REXEC_COPY_STR);
310}
76234dfb 311#endif
22e551b9 312
9041c2e3 313/*
cad2e5aa
JH
314 * Need to implement the following flags for reg_anch:
315 *
316 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
317 * USE_INTUIT_ML
318 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
319 * INTUIT_AUTORITATIVE_ML
320 * INTUIT_ONCE_NOML - Intuit can match in one location only.
321 * INTUIT_ONCE_ML
322 *
323 * Another flag for this function: SECOND_TIME (so that float substrs
324 * with giant delta may be not rechecked).
325 */
326
327/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
328
3f7c398e 329/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
330 Otherwise, only SvCUR(sv) is used to get strbeg. */
331
332/* XXXX We assume that strpos is strbeg unless sv. */
333
6eb5f6b9
JH
334/* XXXX Some places assume that there is a fixed substring.
335 An update may be needed if optimizer marks as "INTUITable"
336 RExen without fixed substrings. Similarly, it is assumed that
337 lengths of all the strings are no more than minlen, thus they
338 cannot come from lookahead.
339 (Or minlen should take into account lookahead.) */
340
2c2d71f5
JH
341/* A failure to find a constant substring means that there is no need to make
342 an expensive call to REx engine, thus we celebrate a failure. Similarly,
343 finding a substring too deep into the string means that less calls to
30944b6d
IZ
344 regtry() should be needed.
345
346 REx compiler's optimizer found 4 possible hints:
347 a) Anchored substring;
348 b) Fixed substring;
349 c) Whether we are anchored (beginning-of-line or \G);
350 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 351 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
352 string which does not contradict any of them.
353 */
2c2d71f5 354
6eb5f6b9
JH
355/* Most of decisions we do here should have been done at compile time.
356 The nodes of the REx which we used for the search should have been
357 deleted from the finite automaton. */
358
cad2e5aa
JH
359char *
360Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
361 char *strend, U32 flags, re_scream_pos_data *data)
362{
97aff369 363 dVAR;
b7953727 364 register I32 start_shift = 0;
cad2e5aa 365 /* Should be nonnegative! */
b7953727 366 register I32 end_shift = 0;
2c2d71f5
JH
367 register char *s;
368 register SV *check;
a1933d95 369 char *strbeg;
cad2e5aa 370 char *t;
a3b680e6 371 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 372 I32 ml_anch;
bd61b366
SS
373 register char *other_last = NULL; /* other substr checked before this */
374 char *check_at = NULL; /* check substr found at this pos */
1df70142 375 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 376#ifdef DEBUGGING
890ce7af
AL
377 const char * const i_strpos = strpos;
378 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 379#endif
a3621e74
YO
380
381 GET_RE_DEBUG_FLAGS_DECL;
382
a30b2f1f 383 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 384
b8d68ded 385 if (prog->reganch & ROPT_UTF8) {
a3621e74 386 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
387 "UTF-8 regex...\n"));
388 PL_reg_flags |= RF_utf8;
389 }
390
a3621e74 391 DEBUG_EXECUTE_r({
1df70142 392 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
393 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
394 strpos;
1df70142 395 const int len = PL_reg_match_utf8 ?
bb7a0f54 396 (int)strlen(s) : strend - strpos;
2a782b5b
JH
397 if (!PL_colorset)
398 reginitcolors();
b8d68ded 399 if (PL_reg_match_utf8)
a3621e74 400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 401 "UTF-8 target...\n"));
2a782b5b 402 PerlIO_printf(Perl_debug_log,
a0288114 403 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 404 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
405 prog->precomp,
406 PL_colors[1],
407 (strlen(prog->precomp) > 60 ? "..." : ""),
408 PL_colors[0],
409 (int)(len > 60 ? 60 : len),
410 s, PL_colors[1],
411 (len > 60 ? "..." : "")
412 );
413 });
cad2e5aa 414
c344f387
JH
415 /* CHR_DIST() would be more correct here but it makes things slow. */
416 if (prog->minlen > strend - strpos) {
a3621e74 417 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 418 "String too short... [re_intuit_start]\n"));
cad2e5aa 419 goto fail;
2c2d71f5 420 }
a1933d95 421 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 422 PL_regeol = strend;
33b8afdf
JH
423 if (do_utf8) {
424 if (!prog->check_utf8 && prog->check_substr)
425 to_utf8_substr(prog);
426 check = prog->check_utf8;
427 } else {
428 if (!prog->check_substr && prog->check_utf8)
429 to_byte_substr(prog);
430 check = prog->check_substr;
431 }
432 if (check == &PL_sv_undef) {
a3621e74 433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
434 "Non-utf string cannot match utf check string\n"));
435 goto fail;
436 }
2c2d71f5 437 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
438 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
439 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 440 && !multiline ) ); /* Check after \n? */
cad2e5aa 441
7e25d62c
JH
442 if (!ml_anch) {
443 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
444 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 445 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
446 && sv && !SvROK(sv)
447 && (strpos != strbeg)) {
a3621e74 448 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
449 goto fail;
450 }
451 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 452 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 453 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
454 I32 slen;
455
1aa99e6b 456 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
457 if (SvTAIL(check)) {
458 slen = SvCUR(check); /* >= 1 */
cad2e5aa 459
9041c2e3 460 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 461 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 463 goto fail_finish;
cad2e5aa
JH
464 }
465 /* Now should match s[0..slen-2] */
466 slen--;
3f7c398e 467 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 468 || (slen > 1
3f7c398e 469 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 470 report_neq:
a3621e74 471 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
472 goto fail_finish;
473 }
cad2e5aa 474 }
3f7c398e 475 else if (*SvPVX_const(check) != *s
653099ff 476 || ((slen = SvCUR(check)) > 1
3f7c398e 477 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 478 goto report_neq;
c315bfe8 479 check_at = s;
2c2d71f5 480 goto success_at_start;
7e25d62c 481 }
cad2e5aa 482 }
2c2d71f5 483 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 484 s = strpos;
2c2d71f5 485 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 486 end_shift = prog->minlen - start_shift -
653099ff 487 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 488 if (!ml_anch) {
a3b680e6 489 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 490 - (SvTAIL(check) != 0);
a3b680e6 491 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
492
493 if (end_shift < eshift)
494 end_shift = eshift;
495 }
cad2e5aa 496 }
2c2d71f5 497 else { /* Can match at random position */
cad2e5aa
JH
498 ml_anch = 0;
499 s = strpos;
2c2d71f5
JH
500 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
501 /* Should be nonnegative! */
502 end_shift = prog->minlen - start_shift -
653099ff 503 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
504 }
505
2c2d71f5 506#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 507 if (end_shift < 0)
6bbae5e6 508 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
509#endif
510
2c2d71f5
JH
511 restart:
512 /* Find a possible match in the region s..strend by looking for
513 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 514 if (flags & REXEC_SCREAM) {
cad2e5aa 515 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 516 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 517
2c2d71f5
JH
518 if (PL_screamfirst[BmRARE(check)] >= 0
519 || ( BmRARE(check) == '\n'
520 && (BmPREVIOUS(check) == SvCUR(check) - 1)
521 && SvTAIL(check) ))
9041c2e3 522 s = screaminstr(sv, check,
2c2d71f5 523 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 524 else
2c2d71f5 525 goto fail_finish;
4addbd3b
HS
526 /* we may be pointing at the wrong string */
527 if (s && RX_MATCH_COPIED(prog))
3f7c398e 528 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
529 if (data)
530 *data->scream_olds = s;
531 }
f33976b4 532 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
533 s = fbm_instr((U8*)(s + start_shift),
534 (U8*)(strend - end_shift),
7fba1cd6 535 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 536 else
1aa99e6b
IH
537 s = fbm_instr(HOP3(s, start_shift, strend),
538 HOP3(strend, -end_shift, strbeg),
7fba1cd6 539 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
540
541 /* Update the count-of-usability, remove useless subpatterns,
542 unshift s. */
2c2d71f5 543
a0288114 544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 545 (s ? "Found" : "Did not find"),
33b8afdf 546 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 547 PL_colors[0],
7b0972df 548 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 549 SvPVX_const(check),
2c2d71f5
JH
550 PL_colors[1], (SvTAIL(check) ? "$" : ""),
551 (s ? " at offset " : "...\n") ) );
552
553 if (!s)
554 goto fail_finish;
555
6eb5f6b9
JH
556 check_at = s;
557
2c2d71f5 558 /* Finish the diagnostic message */
a3621e74 559 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
560
561 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
562 Start with the other substr.
563 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 564 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
565 *always* match. Probably should be marked during compile...
566 Probably it is right to do no SCREAM here...
567 */
568
33b8afdf 569 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 570 /* Take into account the "other" substring. */
2c2d71f5
JH
571 /* XXXX May be hopelessly wrong for UTF... */
572 if (!other_last)
6eb5f6b9 573 other_last = strpos;
33b8afdf 574 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
575 do_other_anchored:
576 {
890ce7af
AL
577 char * const last = HOP3c(s, -start_shift, strbeg);
578 char *last1, *last2;
be8e71aa 579 char * const saved_s = s;
33b8afdf 580 SV* must;
2c2d71f5 581
2c2d71f5
JH
582 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 584 && (!do_utf8
0ce71af7 585 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 586 && t > strpos)))
6f207bd3 587 NOOP;
2c2d71f5
JH
588 else
589 t = strpos;
1aa99e6b 590 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
591 if (t < other_last) /* These positions already checked */
592 t = other_last;
1aa99e6b 593 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
594 if (last < last1)
595 last1 = last;
596 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
597 /* On end-of-str: see comment below. */
33b8afdf
JH
598 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
599 if (must == &PL_sv_undef) {
600 s = (char*)NULL;
a3621e74 601 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
602 }
603 else
604 s = fbm_instr(
605 (unsigned char*)t,
606 HOP3(HOP3(last1, prog->anchored_offset, strend)
607 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
608 must,
7fba1cd6 609 multiline ? FBMrf_MULTILINE : 0
33b8afdf 610 );
a3621e74 611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 612 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
613 (s ? "Found" : "Contradicts"),
614 PL_colors[0],
33b8afdf
JH
615 (int)(SvCUR(must)
616 - (SvTAIL(must)!=0)),
3f7c398e 617 SvPVX_const(must),
33b8afdf 618 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
619 if (!s) {
620 if (last1 >= last2) {
a3621e74 621 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
622 ", giving up...\n"));
623 goto fail_finish;
624 }
a3621e74 625 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 626 ", trying floating at offset %ld...\n",
be8e71aa 627 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
628 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
629 s = HOP3c(last, 1, strend);
2c2d71f5
JH
630 goto restart;
631 }
632 else {
a3621e74 633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 634 (long)(s - i_strpos)));
1aa99e6b
IH
635 t = HOP3c(s, -prog->anchored_offset, strbeg);
636 other_last = HOP3c(s, 1, strend);
be8e71aa 637 s = saved_s;
2c2d71f5
JH
638 if (t == strpos)
639 goto try_at_start;
2c2d71f5
JH
640 goto try_at_offset;
641 }
30944b6d 642 }
2c2d71f5
JH
643 }
644 else { /* Take into account the floating substring. */
33b8afdf 645 char *last, *last1;
be8e71aa 646 char * const saved_s = s;
33b8afdf
JH
647 SV* must;
648
649 t = HOP3c(s, -start_shift, strbeg);
650 last1 = last =
651 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
652 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
653 last = HOP3c(t, prog->float_max_offset, strend);
654 s = HOP3c(t, prog->float_min_offset, strend);
655 if (s < other_last)
656 s = other_last;
2c2d71f5 657 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
658 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
659 /* fbm_instr() takes into account exact value of end-of-str
660 if the check is SvTAIL(ed). Since false positives are OK,
661 and end-of-str is not later than strend we are OK. */
662 if (must == &PL_sv_undef) {
663 s = (char*)NULL;
a3621e74 664 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
665 }
666 else
2c2d71f5 667 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
668 (unsigned char*)last + SvCUR(must)
669 - (SvTAIL(must)!=0),
7fba1cd6 670 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
672 (s ? "Found" : "Contradicts"),
673 PL_colors[0],
674 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 675 SvPVX_const(must),
33b8afdf
JH
676 PL_colors[1], (SvTAIL(must) ? "$" : "")));
677 if (!s) {
678 if (last1 == last) {
a3621e74 679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
680 ", giving up...\n"));
681 goto fail_finish;
2c2d71f5 682 }
a3621e74 683 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 684 ", trying anchored starting at offset %ld...\n",
be8e71aa 685 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
686 other_last = last;
687 s = HOP3c(t, 1, strend);
688 goto restart;
689 }
690 else {
a3621e74 691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
692 (long)(s - i_strpos)));
693 other_last = s; /* Fix this later. --Hugo */
be8e71aa 694 s = saved_s;
33b8afdf
JH
695 if (t == strpos)
696 goto try_at_start;
697 goto try_at_offset;
698 }
2c2d71f5 699 }
cad2e5aa 700 }
2c2d71f5
JH
701
702 t = s - prog->check_offset_max;
2c2d71f5 703 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 704 && (!do_utf8
0ce71af7 705 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 706 && t > strpos))) {
2c2d71f5
JH
707 /* Fixed substring is found far enough so that the match
708 cannot start at strpos. */
709 try_at_offset:
cad2e5aa 710 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
711 /* Eventually fbm_*() should handle this, but often
712 anchored_offset is not 0, so this check will not be wasted. */
713 /* XXXX In the code below we prefer to look for "^" even in
714 presence of anchored substrings. And we search even
715 beyond the found float position. These pessimizations
716 are historical artefacts only. */
717 find_anchor:
2c2d71f5 718 while (t < strend - prog->minlen) {
cad2e5aa 719 if (*t == '\n') {
4ee3650e 720 if (t < check_at - prog->check_offset_min) {
33b8afdf 721 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
722 /* Since we moved from the found position,
723 we definitely contradict the found anchored
30944b6d
IZ
724 substr. Due to the above check we do not
725 contradict "check" substr.
726 Thus we can arrive here only if check substr
727 is float. Redo checking for "other"=="fixed".
728 */
9041c2e3 729 strpos = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
732 goto do_other_anchored;
733 }
4ee3650e
GS
734 /* We don't contradict the found floating substring. */
735 /* XXXX Why not check for STCLASS? */
cad2e5aa 736 s = t + 1;
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
739 goto set_useful;
740 }
4ee3650e
GS
741 /* Position contradicts check-string */
742 /* XXXX probably better to look for check-string
743 than for "\n", so one should lower the limit for t? */
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 745 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 746 other_last = strpos = s = t + 1;
cad2e5aa
JH
747 goto restart;
748 }
749 t++;
750 }
a3621e74 751 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 752 PL_colors[0], PL_colors[1]));
2c2d71f5 753 goto fail_finish;
cad2e5aa 754 }
f5952150 755 else {
a3621e74 756 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 757 PL_colors[0], PL_colors[1]));
f5952150 758 }
cad2e5aa
JH
759 s = t;
760 set_useful:
33b8afdf 761 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
762 }
763 else {
f5952150 764 /* The found string does not prohibit matching at strpos,
2c2d71f5 765 - no optimization of calling REx engine can be performed,
f5952150
GS
766 unless it was an MBOL and we are not after MBOL,
767 or a future STCLASS check will fail this. */
2c2d71f5
JH
768 try_at_start:
769 /* Even in this situation we may use MBOL flag if strpos is offset
770 wrt the start of the string. */
05b4157f 771 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 772 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
773 /* May be due to an implicit anchor of m{.*foo} */
774 && !(prog->reganch & ROPT_IMPLICIT))
775 {
cad2e5aa
JH
776 t = strpos;
777 goto find_anchor;
778 }
a3621e74 779 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 780 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 781 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 782 );
2c2d71f5 783 success_at_start:
30944b6d 784 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
785 && (do_utf8 ? (
786 prog->check_utf8 /* Could be deleted already */
787 && --BmUSEFUL(prog->check_utf8) < 0
788 && (prog->check_utf8 == prog->float_utf8)
789 ) : (
790 prog->check_substr /* Could be deleted already */
791 && --BmUSEFUL(prog->check_substr) < 0
792 && (prog->check_substr == prog->float_substr)
793 )))
66e933ab 794 {
cad2e5aa 795 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
797 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
798 if (do_utf8 ? prog->check_substr : prog->check_utf8)
799 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
800 prog->check_substr = prog->check_utf8 = NULL; /* disable */
801 prog->float_substr = prog->float_utf8 = NULL; /* clear */
802 check = NULL; /* abort */
cad2e5aa 803 s = strpos;
3cf5c195
IZ
804 /* XXXX This is a remnant of the old implementation. It
805 looks wasteful, since now INTUIT can use many
6eb5f6b9 806 other heuristics. */
cad2e5aa
JH
807 prog->reganch &= ~RE_USE_INTUIT;
808 }
809 else
810 s = strpos;
811 }
812
6eb5f6b9
JH
813 /* Last resort... */
814 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 815 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
816 /* minlen == 0 is possible if regstclass is \b or \B,
817 and the fixed substr is ''$.
818 Since minlen is already taken into account, s+1 is before strend;
819 accidentally, minlen >= 1 guaranties no false positives at s + 1
820 even for \b or \B. But (minlen? 1 : 0) below assumes that
821 regstclass does not come from lookahead... */
822 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
823 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 824 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 825 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 826 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 827 : 1);
07be1b83 828 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 829 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 830 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
831 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
832 cl_l, strend)
833 : strend);
07be1b83
YO
834 /*if (OP(prog->regstclass) == TRIE)
835 endpos++;*/
6eb5f6b9 836 t = s;
3b0527fe 837 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
838 if (!s) {
839#ifdef DEBUGGING
cbbf8932 840 const char *what = NULL;
6eb5f6b9
JH
841#endif
842 if (endpos == strend) {
a3621e74 843 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
844 "Could not match STCLASS...\n") );
845 goto fail;
846 }
a3621e74 847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 848 "This position contradicts STCLASS...\n") );
653099ff
GS
849 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
850 goto fail;
6eb5f6b9 851 /* Contradict one of substrings */
33b8afdf
JH
852 if (prog->anchored_substr || prog->anchored_utf8) {
853 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 854 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 855 hop_and_restart:
1aa99e6b 856 s = HOP3c(t, 1, strend);
66e933ab
GS
857 if (s + start_shift + end_shift > strend) {
858 /* XXXX Should be taken into account earlier? */
a3621e74 859 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
860 "Could not match STCLASS...\n") );
861 goto fail;
862 }
5e39e1e5
HS
863 if (!check)
864 goto giveup;
a3621e74 865 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 866 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
867 what, (long)(s + start_shift - i_strpos)) );
868 goto restart;
869 }
66e933ab 870 /* Have both, check_string is floating */
6eb5f6b9
JH
871 if (t + start_shift >= check_at) /* Contradicts floating=check */
872 goto retry_floating_check;
873 /* Recheck anchored substring, but not floating... */
9041c2e3 874 s = check_at;
5e39e1e5
HS
875 if (!check)
876 goto giveup;
a3621e74 877 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 878 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
879 (long)(other_last - i_strpos)) );
880 goto do_other_anchored;
881 }
60e71179
GS
882 /* Another way we could have checked stclass at the
883 current position only: */
884 if (ml_anch) {
885 s = t = t + 1;
5e39e1e5
HS
886 if (!check)
887 goto giveup;
a3621e74 888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 889 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 890 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 891 goto try_at_offset;
66e933ab 892 }
33b8afdf 893 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 894 goto fail;
6eb5f6b9
JH
895 /* Check is floating subtring. */
896 retry_floating_check:
897 t = check_at - start_shift;
a3621e74 898 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
899 goto hop_and_restart;
900 }
b7953727 901 if (t != s) {
a3621e74 902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 903 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
904 (long)(t - i_strpos), (long)(s - i_strpos))
905 );
906 }
907 else {
a3621e74 908 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
909 "Does not contradict STCLASS...\n");
910 );
911 }
6eb5f6b9 912 }
5e39e1e5 913 giveup:
a3621e74 914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
915 PL_colors[4], (check ? "Guessed" : "Giving up"),
916 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 917 return s;
2c2d71f5
JH
918
919 fail_finish: /* Substring not found */
33b8afdf
JH
920 if (prog->check_substr || prog->check_utf8) /* could be removed already */
921 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 922 fail:
a3621e74 923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 924 PL_colors[4], PL_colors[5]));
bd61b366 925 return NULL;
cad2e5aa 926}
9661b544 927
6eb5f6b9 928/* We know what class REx starts with. Try to find this position... */
3b0527fe 929/* if reginfo is NULL, its a dryrun */
07be1b83
YO
930/* annoyingly all the vars in this routine have different names from their counterparts
931 in regmatch. /grrr */
3b0527fe 932
3c3eec57 933STATIC char *
07be1b83
YO
934S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
935 const char *strend, const regmatch_info *reginfo)
a687059c 936{
27da23d5 937 dVAR;
1df70142 938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 939 char *m;
d8093b23 940 STRLEN ln;
5dab1207 941 STRLEN lnc;
078c425b 942 register STRLEN uskip;
d8093b23
G
943 unsigned int c1;
944 unsigned int c2;
6eb5f6b9
JH
945 char *e;
946 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 947 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 948
6eb5f6b9
JH
949 /* We know what class it must start with. */
950 switch (OP(c)) {
6eb5f6b9 951 case ANYOF:
388cc4de 952 if (do_utf8) {
078c425b 953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 958 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
959 goto got_it;
960 else
961 tmp = doevery;
962 }
963 else
964 tmp = 1;
078c425b 965 s += uskip;
388cc4de
HS
966 }
967 }
968 else {
969 while (s < strend) {
970 STRLEN skip = 1;
971
32fc9b6a 972 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
3b0527fe 977 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
978 goto got_it;
979 else
980 tmp = doevery;
981 }
982 else
983 tmp = 1;
984 s += skip;
985 }
a0d0e21e 986 }
6eb5f6b9 987 break;
f33976b4
DB
988 case CANY:
989 while (s < strend) {
3b0527fe 990 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
991 goto got_it;
992 else
993 tmp = doevery;
994 s++;
995 }
996 break;
6eb5f6b9 997 case EXACTF:
5dab1207
NIS
998 m = STRING(c);
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1001 if (UTF) {
a2a2844f 1002 STRLEN ulen1, ulen2;
5dab1207 1003 U8 *sm = (U8 *) m;
89ebb4a3
JH
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1007
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1010
89ebb4a3 1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1012 0, uniflags);
89ebb4a3 1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1014 0, uniflags);
5dab1207
NIS
1015 lnc = 0;
1016 while (sm < ((U8 *) m + ln)) {
1017 lnc++;
1018 sm += UTF8SKIP(sm);
1019 }
1aa99e6b
IH
1020 }
1021 else {
1022 c1 = *(U8*)m;
1023 c2 = PL_fold[c1];
1024 }
6eb5f6b9
JH
1025 goto do_exactf;
1026 case EXACTFL:
5dab1207
NIS
1027 m = STRING(c);
1028 ln = STR_LEN(c);
1029 lnc = (I32) ln;
d8093b23 1030 c1 = *(U8*)m;
6eb5f6b9
JH
1031 c2 = PL_fold_locale[c1];
1032 do_exactf:
db12adc6 1033 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1034
3b0527fe 1035 if (!reginfo && e < s)
6eb5f6b9 1036 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1037
60a8b682
JH
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
7f16dd3d
JH
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
60a8b682 1048
1aa99e6b 1049 if (do_utf8) {
575cac57 1050 UV c, f;
89ebb4a3 1051 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1052 STRLEN len, foldlen;
4ad0818d 1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1054 if (c1 == c2) {
5dab1207
NIS
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1aa99e6b 1057 while (s <= e) {
89ebb4a3 1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1059 uniflags);
80aecb99
JH
1060 if ( c == c1
1061 && (ln == len ||
3dab1dad
YO
1062 ibcmp_utf8(s, NULL, 0, do_utf8,
1063 m, NULL, ln, (bool)UTF))
3b0527fe 1064 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1065 goto got_it;
80aecb99 1066 else {
1df70142 1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1068 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1070 if ( f != c
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
66423254 1073 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1074 NULL, foldlen, do_utf8,
d07ddd77 1075 m,
3dab1dad 1076 NULL, ln, (bool)UTF))
3b0527fe 1077 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1078 goto got_it;
1079 }
1aa99e6b
IH
1080 s += len;
1081 }
09091399
JH
1082 }
1083 else {
1aa99e6b 1084 while (s <= e) {
89ebb4a3 1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1086 uniflags);
80aecb99 1087
60a8b682 1088 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
880bd946
JH
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1098
1099 if ( (c == c1 || c == c2)
1100 && (ln == len ||
3dab1dad
YO
1101 ibcmp_utf8(s, NULL, 0, do_utf8,
1102 m, NULL, ln, (bool)UTF))
3b0527fe 1103 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1104 goto got_it;
80aecb99 1105 else {
1df70142 1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1107 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1109 if ( f != c
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
a6872d42 1112 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1113 NULL, foldlen, do_utf8,
d07ddd77 1114 m,
3dab1dad 1115 NULL, ln, (bool)UTF))
3b0527fe 1116 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1117 goto got_it;
1118 }
1aa99e6b
IH
1119 s += len;
1120 }
09091399 1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 if (c1 == c2)
1125 while (s <= e) {
1126 if ( *(U8*)s == c1
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
3b0527fe 1130 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1131 goto got_it;
1132 s++;
1133 }
1134 else
1135 while (s <= e) {
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1138 ? ibcmp(s, m, ln)
1139 : ibcmp_locale(s, m, ln)))
3b0527fe 1140 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1141 goto got_it;
1142 s++;
1143 }
b3c9acc1
IZ
1144 }
1145 break;
bbce6d69 1146 case BOUNDL:
3280af22 1147 PL_reg_flags |= RF_tainted;
bbce6d69 1148 /* FALL THROUGH */
a0d0e21e 1149 case BOUND:
ffc61ed2 1150 if (do_utf8) {
12d33761 1151 if (s == PL_bostr)
ffc61ed2
JH
1152 tmp = '\n';
1153 else {
6136c704 1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1156 }
1157 tmp = ((OP(c) == BOUND ?
9041c2e3 1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1159 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1161 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1162 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1163 isALNUM_LC_utf8((U8*)s)))
1164 {
1165 tmp = !tmp;
3b0527fe 1166 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1167 goto got_it;
1168 }
078c425b 1169 s += uskip;
a687059c 1170 }
a0d0e21e 1171 }
667bb95a 1172 else {
12d33761 1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1176 if (tmp ==
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1178 tmp = !tmp;
3b0527fe 1179 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1180 goto got_it;
1181 }
1182 s++;
a0ed51b3 1183 }
a0ed51b3 1184 }
3b0527fe 1185 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1186 goto got_it;
1187 break;
bbce6d69 1188 case NBOUNDL:
3280af22 1189 PL_reg_flags |= RF_tainted;
bbce6d69 1190 /* FALL THROUGH */
a0d0e21e 1191 case NBOUND:
ffc61ed2 1192 if (do_utf8) {
12d33761 1193 if (s == PL_bostr)
ffc61ed2
JH
1194 tmp = '\n';
1195 else {
6136c704 1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1198 }
1199 tmp = ((OP(c) == NBOUND ?
9041c2e3 1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1201 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1203 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1204 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1205 isALNUM_LC_utf8((U8*)s)))
1206 tmp = !tmp;
3b0527fe 1207 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1208 goto got_it;
078c425b 1209 s += uskip;
ffc61ed2 1210 }
a0d0e21e 1211 }
667bb95a 1212 else {
12d33761 1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1217 if (tmp ==
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1219 tmp = !tmp;
3b0527fe 1220 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1221 goto got_it;
1222 s++;
1223 }
a0ed51b3 1224 }
3b0527fe 1225 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1226 goto got_it;
1227 break;
a0d0e21e 1228 case ALNUM:
ffc61ed2 1229 if (do_utf8) {
1a4fad37 1230 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1233 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
bbce6d69 1238 else
ffc61ed2 1239 tmp = 1;
078c425b 1240 s += uskip;
bbce6d69 1241 }
bbce6d69 1242 }
ffc61ed2
JH
1243 else {
1244 while (s < strend) {
1245 if (isALNUM(*s)) {
3b0527fe 1246 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1247 goto got_it;
1248 else
1249 tmp = doevery;
1250 }
a0ed51b3 1251 else
ffc61ed2
JH
1252 tmp = 1;
1253 s++;
a0ed51b3 1254 }
a0ed51b3
LW
1255 }
1256 break;
bbce6d69 1257 case ALNUML:
3280af22 1258 PL_reg_flags |= RF_tainted;
ffc61ed2 1259 if (do_utf8) {
078c425b 1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1261 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1262 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a687059c 1267 else
ffc61ed2 1268 tmp = 1;
078c425b 1269 s += uskip;
a0d0e21e 1270 }
a0d0e21e 1271 }
ffc61ed2
JH
1272 else {
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
3b0527fe 1275 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1276 goto got_it;
1277 else
1278 tmp = doevery;
1279 }
a0ed51b3 1280 else
ffc61ed2
JH
1281 tmp = 1;
1282 s++;
a0ed51b3 1283 }
a0ed51b3
LW
1284 }
1285 break;
a0d0e21e 1286 case NALNUM:
ffc61ed2 1287 if (do_utf8) {
1a4fad37 1288 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1291 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
bbce6d69 1296 else
ffc61ed2 1297 tmp = 1;
078c425b 1298 s += uskip;
bbce6d69 1299 }
bbce6d69 1300 }
ffc61ed2
JH
1301 else {
1302 while (s < strend) {
1303 if (!isALNUM(*s)) {
3b0527fe 1304 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1305 goto got_it;
1306 else
1307 tmp = doevery;
1308 }
a0ed51b3 1309 else
ffc61ed2
JH
1310 tmp = 1;
1311 s++;
a0ed51b3 1312 }
a0ed51b3
LW
1313 }
1314 break;
bbce6d69 1315 case NALNUML:
3280af22 1316 PL_reg_flags |= RF_tainted;
ffc61ed2 1317 if (do_utf8) {
078c425b 1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1319 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1320 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a687059c 1325 else
ffc61ed2 1326 tmp = 1;
078c425b 1327 s += uskip;
a687059c 1328 }
a0d0e21e 1329 }
ffc61ed2
JH
1330 else {
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
3b0527fe 1333 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1334 goto got_it;
1335 else
1336 tmp = doevery;
1337 }
a0ed51b3 1338 else
ffc61ed2
JH
1339 tmp = 1;
1340 s++;
a0ed51b3 1341 }
a0ed51b3
LW
1342 }
1343 break;
a0d0e21e 1344 case SPACE:
ffc61ed2 1345 if (do_utf8) {
1a4fad37 1346 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1349 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0d0e21e 1354 else
ffc61ed2 1355 tmp = 1;
078c425b 1356 s += uskip;
2304df62 1357 }
a0d0e21e 1358 }
ffc61ed2
JH
1359 else {
1360 while (s < strend) {
1361 if (isSPACE(*s)) {
3b0527fe 1362 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1363 goto got_it;
1364 else
1365 tmp = doevery;
1366 }
a0ed51b3 1367 else
ffc61ed2
JH
1368 tmp = 1;
1369 s++;
a0ed51b3 1370 }
a0ed51b3
LW
1371 }
1372 break;
bbce6d69 1373 case SPACEL:
3280af22 1374 PL_reg_flags |= RF_tainted;
ffc61ed2 1375 if (do_utf8) {
078c425b 1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1378 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
bbce6d69 1383 else
ffc61ed2 1384 tmp = 1;
078c425b 1385 s += uskip;
bbce6d69 1386 }
bbce6d69 1387 }
ffc61ed2
JH
1388 else {
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
3b0527fe 1391 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1392 goto got_it;
1393 else
1394 tmp = doevery;
1395 }
a0ed51b3 1396 else
ffc61ed2
JH
1397 tmp = 1;
1398 s++;
a0ed51b3 1399 }
a0ed51b3
LW
1400 }
1401 break;
a0d0e21e 1402 case NSPACE:
ffc61ed2 1403 if (do_utf8) {
1a4fad37 1404 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1407 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0d0e21e 1412 else
ffc61ed2 1413 tmp = 1;
078c425b 1414 s += uskip;
a687059c 1415 }
a0d0e21e 1416 }
ffc61ed2
JH
1417 else {
1418 while (s < strend) {
1419 if (!isSPACE(*s)) {
3b0527fe 1420 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1421 goto got_it;
1422 else
1423 tmp = doevery;
1424 }
a0ed51b3 1425 else
ffc61ed2
JH
1426 tmp = 1;
1427 s++;
a0ed51b3 1428 }
a0ed51b3
LW
1429 }
1430 break;
bbce6d69 1431 case NSPACEL:
3280af22 1432 PL_reg_flags |= RF_tainted;
ffc61ed2 1433 if (do_utf8) {
078c425b 1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1436 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
bbce6d69 1441 else
ffc61ed2 1442 tmp = 1;
078c425b 1443 s += uskip;
bbce6d69 1444 }
bbce6d69 1445 }
ffc61ed2
JH
1446 else {
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
3b0527fe 1449 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1450 goto got_it;
1451 else
1452 tmp = doevery;
1453 }
a0ed51b3 1454 else
ffc61ed2
JH
1455 tmp = 1;
1456 s++;
a0ed51b3 1457 }
a0ed51b3
LW
1458 }
1459 break;
a0d0e21e 1460 case DIGIT:
ffc61ed2 1461 if (do_utf8) {
1a4fad37 1462 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1465 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0d0e21e 1470 else
ffc61ed2 1471 tmp = 1;
078c425b 1472 s += uskip;
2b69d0c2 1473 }
a0d0e21e 1474 }
ffc61ed2
JH
1475 else {
1476 while (s < strend) {
1477 if (isDIGIT(*s)) {
3b0527fe 1478 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1479 goto got_it;
1480 else
1481 tmp = doevery;
1482 }
a0ed51b3 1483 else
ffc61ed2
JH
1484 tmp = 1;
1485 s++;
a0ed51b3 1486 }
a0ed51b3
LW
1487 }
1488 break;
b8c5462f
JH
1489 case DIGITL:
1490 PL_reg_flags |= RF_tainted;
ffc61ed2 1491 if (do_utf8) {
078c425b 1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1493 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1494 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2 1500 tmp = 1;
078c425b 1501 s += uskip;
b8c5462f 1502 }
b8c5462f 1503 }
ffc61ed2
JH
1504 else {
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
3b0527fe 1507 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1508 goto got_it;
1509 else
1510 tmp = doevery;
1511 }
b8c5462f 1512 else
ffc61ed2
JH
1513 tmp = 1;
1514 s++;
b8c5462f 1515 }
b8c5462f
JH
1516 }
1517 break;
a0d0e21e 1518 case NDIGIT:
ffc61ed2 1519 if (do_utf8) {
1a4fad37 1520 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1523 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0d0e21e 1528 else
ffc61ed2 1529 tmp = 1;
078c425b 1530 s += uskip;
a687059c 1531 }
a0d0e21e 1532 }
ffc61ed2
JH
1533 else {
1534 while (s < strend) {
1535 if (!isDIGIT(*s)) {
3b0527fe 1536 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1537 goto got_it;
1538 else
1539 tmp = doevery;
1540 }
a0ed51b3 1541 else
ffc61ed2
JH
1542 tmp = 1;
1543 s++;
a0ed51b3 1544 }
a0ed51b3
LW
1545 }
1546 break;
b8c5462f
JH
1547 case NDIGITL:
1548 PL_reg_flags |= RF_tainted;
ffc61ed2 1549 if (do_utf8) {
078c425b 1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1551 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1552 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
b8c5462f 1557 else
ffc61ed2 1558 tmp = 1;
078c425b 1559 s += uskip;
b8c5462f 1560 }
a0ed51b3 1561 }
ffc61ed2
JH
1562 else {
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
3b0527fe 1565 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1566 goto got_it;
1567 else
1568 tmp = doevery;
1569 }
cf93c79d 1570 else
ffc61ed2
JH
1571 tmp = 1;
1572 s++;
b8c5462f 1573 }
b8c5462f
JH
1574 }
1575 break;
07be1b83
YO
1576 case TRIE:
1577 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1578 {
1579 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1580 trie_type = do_utf8 ?
1581 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1582 : trie_plain;
1583 /* what trie are we using right now */
1584 reg_ac_data *aho
1585 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1586 reg_trie_data *trie=aho->trie;
1587
1588 const char *last_start = strend - trie->minlen;
1589 const char *real_start = s;
1590 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1591 SV *sv_points;
1592 U8 **points; /* map of where we were in the input string
1593 when reading a given string. For ASCII this
1594 is unnecessary overhead as the relationship
1595 is always 1:1, but for unicode, especially
1596 case folded unicode this is not true. */
f9e705e8 1597 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1598
1599 GET_RE_DEBUG_FLAGS_DECL;
1600
be8e71aa
YO
1601 /* We can't just allocate points here. We need to wrap it in
1602 * an SV so it gets freed properly if there is a croak while
1603 * running the match */
1604 ENTER;
1605 SAVETMPS;
1606 sv_points=newSV(maxlen * sizeof(U8 *));
1607 SvCUR_set(sv_points,
1608 maxlen * sizeof(U8 *));
1609 SvPOK_on(sv_points);
1610 sv_2mortal(sv_points);
1611 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1612
1613 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1614 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1615 s++;
1616 }
1617 }
1618
1619 while (s <= last_start) {
1620 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1621 U8 *uc = (U8*)s;
1622 U16 charid = 0;
1623 U32 base = 1;
1624 U32 state = 1;
1625 UV uvc = 0;
1626 STRLEN len = 0;
1627 STRLEN foldlen = 0;
1628 U8 *uscan = (U8*)NULL;
1629 U8 *leftmost = NULL;
1630
1631 U32 pointpos = 0;
1632
1633 while ( state && uc <= (U8*)strend ) {
1634 int failed=0;
1635 if (aho->states[ state ].wordnum) {
1636 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1637 if (!leftmost || lpos < leftmost)
1638 leftmost= lpos;
1639 if (base==0) break;
1640 }
1641 points[pointpos++ % maxlen]= uc;
1642 switch (trie_type) {
1643 case trie_utf8_fold:
1644 if ( foldlen>0 ) {
1645 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1646 foldlen -= len;
1647 uscan += len;
1648 len=0;
1649 } else {
07be1b83
YO
1650 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1651 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1652 foldlen -= UNISKIP( uvc );
1653 uscan = foldbuf + UNISKIP( uvc );
1654 }
1655 break;
1656 case trie_utf8:
1657 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1658 &len, uniflags );
1659 break;
1660 case trie_plain:
1661 uvc = (UV)*uc;
1662 len = 1;
1663 }
1664
1665 if (uvc < 256) {
1666 charid = trie->charmap[ uvc ];
1667 }
1668 else {
1669 charid = 0;
1670 if (trie->widecharmap) {
1671 SV** const svpp = hv_fetch(trie->widecharmap,
1672 (char*)&uvc, sizeof(UV), 0);
1673 if (svpp)
1674 charid = (U16)SvIV(*svpp);
1675 }
1676 }
1677 DEBUG_TRIE_EXECUTE_r(
1678 PerlIO_printf(Perl_debug_log,
1679 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1680 (int)((const char*)uc - real_start), charid, uvc)
1681 );
1682 uc += len;
1683
1684 do {
1685 U32 word = aho->states[ state ].wordnum;
1686 base = aho->states[ state ].trans.base;
1687
1688 DEBUG_TRIE_EXECUTE_r(
1689 PerlIO_printf( Perl_debug_log,
1690 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1691 failed ? "Fail transition to " : "",
1692 state, base, uvc, word)
1693 );
1694 if ( base ) {
1695 U32 tmp;
1696 if (charid &&
1697 (base + charid > trie->uniquecharcount )
1698 && (base + charid - 1 - trie->uniquecharcount
1699 < trie->lasttrans)
1700 && trie->trans[base + charid - 1 -
1701 trie->uniquecharcount].check == state
1702 && (tmp=trie->trans[base + charid - 1 -
1703 trie->uniquecharcount ].next))
1704 {
1705 state = tmp;
1706 break;
1707 }
1708 else {
1709 failed++;
1710 if ( state == 1 )
1711 break;
1712 else
1713 state = aho->fail[state];
1714 }
1715 }
1716 else {
1717 /* we must be accepting here */
1718 failed++;
1719 break;
1720 }
1721 } while(state);
1722 if (failed) {
1723 if (leftmost)
1724 break;
1725 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1726 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1727 uc++;
1728 }
1729 }
1730 }
1731 }
1732 if ( aho->states[ state ].wordnum ) {
1733 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1734 if (!leftmost || lpos < leftmost)
1735 leftmost = lpos;
1736 }
1737 DEBUG_TRIE_EXECUTE_r(
1738 PerlIO_printf( Perl_debug_log,
1739 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1740 "All done: ",
1741 state, base, uvc)
1742 );
1743 if (leftmost) {
1744 s = (char*)leftmost;
be8e71aa
YO
1745 if (!reginfo || regtry(reginfo, s)) {
1746 FREETMPS;
1747 LEAVE;
07be1b83 1748 goto got_it;
be8e71aa 1749 }
07be1b83
YO
1750 s = HOPc(s,1);
1751 } else {
1752 break;
1753 }
1754 }
be8e71aa
YO
1755 FREETMPS;
1756 LEAVE;
07be1b83
YO
1757 }
1758 break;
b3c9acc1 1759 default:
3c3eec57
GS
1760 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1761 break;
d6a28714 1762 }
6eb5f6b9
JH
1763 return 0;
1764 got_it:
1765 return s;
1766}
1767
1768/*
1769 - regexec_flags - match a regexp against a string
1770 */
1771I32
1772Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1773 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1774/* strend: pointer to null at end of string */
1775/* strbeg: real beginning of string */
1776/* minend: end of match must be >=minend after stringarg. */
1777/* data: May be used for some additional optimizations. */
1778/* nosave: For optimizations. */
1779{
97aff369 1780 dVAR;
6eb5f6b9
JH
1781 register char *s;
1782 register regnode *c;
1783 register char *startpos = stringarg;
6eb5f6b9
JH
1784 I32 minlen; /* must match at least this many chars */
1785 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1786 I32 end_shift = 0; /* Same for the end. */ /* CC */
1787 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1788 char *scream_olds = NULL;
3dab1dad 1789 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1790 const bool do_utf8 = DO_UTF8(sv);
2757e526 1791 I32 multiline;
2a782b5b 1792#ifdef DEBUGGING
2757e526
JH
1793 SV* dsv0;
1794 SV* dsv1;
2a782b5b 1795#endif
3b0527fe 1796 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1797
1798 GET_RE_DEBUG_FLAGS_DECL;
1799
9d4ba2ae 1800 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1801
1802 /* Be paranoid... */
1803 if (prog == NULL || startpos == NULL) {
1804 Perl_croak(aTHX_ "NULL regexp parameter");
1805 return 0;
1806 }
1807
2757e526 1808 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1809 reginfo.prog = prog;
2757e526
JH
1810
1811#ifdef DEBUGGING
1812 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1813 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1814#endif
1815
bac06658
JH
1816 RX_MATCH_UTF8_set(prog, do_utf8);
1817
6eb5f6b9 1818 minlen = prog->minlen;
61a36c01 1819 if (strend - startpos < minlen) {
a3621e74 1820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1821 "String too short [regexec_flags]...\n"));
1822 goto phooey;
1aa99e6b 1823 }
6eb5f6b9 1824
6eb5f6b9
JH
1825 /* Check validity of program. */
1826 if (UCHARAT(prog->program) != REG_MAGIC) {
1827 Perl_croak(aTHX_ "corrupted regexp program");
1828 }
1829
1830 PL_reg_flags = 0;
1831 PL_reg_eval_set = 0;
1832 PL_reg_maxiter = 0;
1833
1834 if (prog->reganch & ROPT_UTF8)
1835 PL_reg_flags |= RF_utf8;
1836
1837 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1838 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1839 PL_bostr = strbeg;
3b0527fe 1840 reginfo.sv = sv;
6eb5f6b9
JH
1841
1842 /* Mark end of line for $ (and such) */
1843 PL_regeol = strend;
1844
1845 /* see how far we have to get to not match where we matched before */
3b0527fe 1846 reginfo.till = startpos+minend;
6eb5f6b9 1847
6eb5f6b9
JH
1848 /* If there is a "must appear" string, look for it. */
1849 s = startpos;
1850
3b0527fe 1851 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1852 MAGIC *mg;
1853
1854 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1855 reginfo.ganch = startpos;
6eb5f6b9
JH
1856 else if (sv && SvTYPE(sv) >= SVt_PVMG
1857 && SvMAGIC(sv)
14befaf4
DM
1858 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1859 && mg->mg_len >= 0) {
3b0527fe 1860 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1861 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1862 if (s > reginfo.ganch)
6eb5f6b9 1863 goto phooey;
3b0527fe 1864 s = reginfo.ganch;
6eb5f6b9
JH
1865 }
1866 }
1867 else /* pos() not defined */
3b0527fe 1868 reginfo.ganch = strbeg;
6eb5f6b9
JH
1869 }
1870
a0714e2c 1871 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1872 re_scream_pos_data d;
1873
1874 d.scream_olds = &scream_olds;
1875 d.scream_pos = &scream_pos;
1876 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1877 if (!s) {
a3621e74 1878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1879 goto phooey; /* not present */
3fa9c3d7 1880 }
6eb5f6b9
JH
1881 }
1882
a3621e74 1883 DEBUG_EXECUTE_r({
1df70142
AL
1884 const char * const s0 = UTF
1885 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1886 UNI_DISPLAY_REGEX)
1887 : prog->precomp;
bb7a0f54 1888 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1889 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1890 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1891 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1892 if (!PL_colorset)
1893 reginitcolors();
1894 PerlIO_printf(Perl_debug_log,
a0288114 1895 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1896 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1897 len0, len0, s0,
2a782b5b 1898 PL_colors[1],
9e55ce06 1899 len0 > 60 ? "..." : "",
2a782b5b 1900 PL_colors[0],
9e55ce06
JH
1901 (int)(len1 > 60 ? 60 : len1),
1902 s1, PL_colors[1],
1903 (len1 > 60 ? "..." : "")
2a782b5b
JH
1904 );
1905 });
6eb5f6b9
JH
1906
1907 /* Simplest case: anchored match need be tried only once. */
1908 /* [unless only anchor is BOL and multiline is set] */
1909 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1910 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1911 goto got_it;
7fba1cd6 1912 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1913 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1914 {
1915 char *end;
1916
1917 if (minlen)
1918 dontbother = minlen - 1;
1aa99e6b 1919 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1920 /* for multiline we only have to try after newlines */
33b8afdf 1921 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1922 if (s == startpos)
1923 goto after_try;
1924 while (1) {
3b0527fe 1925 if (regtry(&reginfo, s))
6eb5f6b9
JH
1926 goto got_it;
1927 after_try:
1928 if (s >= end)
1929 goto phooey;
1930 if (prog->reganch & RE_USE_INTUIT) {
1931 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1932 if (!s)
1933 goto phooey;
1934 }
1935 else
1936 s++;
1937 }
1938 } else {
1939 if (s > startpos)
1940 s--;
1941 while (s < end) {
1942 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1943 if (regtry(&reginfo, s))
6eb5f6b9
JH
1944 goto got_it;
1945 }
1946 }
1947 }
1948 }
1949 goto phooey;
1950 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1951 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1952 goto got_it;
1953 goto phooey;
1954 }
1955
1956 /* Messy cases: unanchored match. */
33b8afdf 1957 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1958 /* we have /x+whatever/ */
1959 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1960 char ch;
bf93d4cc
GS
1961#ifdef DEBUGGING
1962 int did_match = 0;
1963#endif
33b8afdf
JH
1964 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1965 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1966 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1967
1aa99e6b 1968 if (do_utf8) {
6eb5f6b9
JH
1969 while (s < strend) {
1970 if (*s == ch) {
a3621e74 1971 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1972 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1973 s += UTF8SKIP(s);
1974 while (s < strend && *s == ch)
1975 s += UTF8SKIP(s);
1976 }
1977 s += UTF8SKIP(s);
1978 }
1979 }
1980 else {
1981 while (s < strend) {
1982 if (*s == ch) {
a3621e74 1983 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1984 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1985 s++;
1986 while (s < strend && *s == ch)
1987 s++;
1988 }
1989 s++;
1990 }
1991 }
a3621e74 1992 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1993 PerlIO_printf(Perl_debug_log,
b7953727
JH
1994 "Did not find anchored character...\n")
1995 );
6eb5f6b9 1996 }
a0714e2c
SS
1997 else if (prog->anchored_substr != NULL
1998 || prog->anchored_utf8 != NULL
1999 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2000 && prog->float_max_offset < strend - s)) {
2001 SV *must;
2002 I32 back_max;
2003 I32 back_min;
2004 char *last;
6eb5f6b9 2005 char *last1; /* Last position checked before */
bf93d4cc
GS
2006#ifdef DEBUGGING
2007 int did_match = 0;
2008#endif
33b8afdf
JH
2009 if (prog->anchored_substr || prog->anchored_utf8) {
2010 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2011 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2012 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2013 back_max = back_min = prog->anchored_offset;
2014 } else {
2015 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2016 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2017 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2018 back_max = prog->float_max_offset;
2019 back_min = prog->float_min_offset;
2020 }
2021 if (must == &PL_sv_undef)
2022 /* could not downgrade utf8 check substring, so must fail */
2023 goto phooey;
2024
2025 last = HOP3c(strend, /* Cannot start after this */
2026 -(I32)(CHR_SVLEN(must)
2027 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2028
2029 if (s > PL_bostr)
2030 last1 = HOPc(s, -1);
2031 else
2032 last1 = s - 1; /* bogus */
2033
a0288114 2034 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2035 check_substr==must. */
2036 scream_pos = -1;
2037 dontbother = end_shift;
2038 strend = HOPc(strend, -dontbother);
2039 while ( (s <= last) &&
9041c2e3 2040 ((flags & REXEC_SCREAM)
1aa99e6b 2041 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2042 end_shift, &scream_pos, 0))
1aa99e6b 2043 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2044 (unsigned char*)strend, must,
7fba1cd6 2045 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2046 /* we may be pointing at the wrong string */
2047 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2048 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2049 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2050 if (HOPc(s, -back_max) > last1) {
2051 last1 = HOPc(s, -back_min);
2052 s = HOPc(s, -back_max);
2053 }
2054 else {
52657f30 2055 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2056
2057 last1 = HOPc(s, -back_min);
52657f30 2058 s = t;
6eb5f6b9 2059 }
1aa99e6b 2060 if (do_utf8) {
6eb5f6b9 2061 while (s <= last1) {
3b0527fe 2062 if (regtry(&reginfo, s))
6eb5f6b9
JH
2063 goto got_it;
2064 s += UTF8SKIP(s);
2065 }
2066 }
2067 else {
2068 while (s <= last1) {
3b0527fe 2069 if (regtry(&reginfo, s))
6eb5f6b9
JH
2070 goto got_it;
2071 s++;
2072 }
2073 }
2074 }
a3621e74 2075 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2076 PerlIO_printf(Perl_debug_log,
a0288114 2077 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2078 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2079 ? "anchored" : "floating"),
2080 PL_colors[0],
2081 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2082 SvPVX_const(must),
b7953727
JH
2083 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2084 );
6eb5f6b9
JH
2085 goto phooey;
2086 }
155aba94 2087 else if ((c = prog->regstclass)) {
f14c76ed 2088 if (minlen) {
be8e71aa 2089 const OPCODE op = OP(prog->regstclass);
66e933ab 2090 /* don't bother with what can't match */
07be1b83 2091 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2092 strend = HOPc(strend, -(minlen - 1));
2093 }
a3621e74 2094 DEBUG_EXECUTE_r({
be8e71aa 2095 SV * const prop = sv_newmortal();
cfd0369c
NC
2096 const char *s0;
2097 const char *s1;
9e55ce06
JH
2098 int len0;
2099 int len1;
2100
32fc9b6a 2101 regprop(prog, prop, c);
9e55ce06 2102 s0 = UTF ?
3f7c398e 2103 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2104 UNI_DISPLAY_REGEX) :
cfd0369c 2105 SvPVX_const(prop);
9e55ce06
JH
2106 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2107 s1 = UTF ?
c728cb41 2108 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2109 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2110 PerlIO_printf(Perl_debug_log,
07be1b83 2111 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2112 len0, len0, s0,
07be1b83 2113 len1, len1, s1, (int)(strend - s));
ffc61ed2 2114 });
3b0527fe 2115 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2116 goto got_it;
07be1b83 2117 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2118 }
2119 else {
2120 dontbother = 0;
a0714e2c 2121 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2122 /* Trim the end. */
d6a28714 2123 char *last;
33b8afdf
JH
2124 SV* float_real;
2125
2126 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2127 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2128 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2129
2130 if (flags & REXEC_SCREAM) {
33b8afdf 2131 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2132 end_shift, &scream_pos, 1); /* last one */
2133 if (!last)
ffc61ed2 2134 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2135 /* we may be pointing at the wrong string */
2136 else if (RX_MATCH_COPIED(prog))
3f7c398e 2137 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2138 }
d6a28714
JH
2139 else {
2140 STRLEN len;
cfd0369c 2141 const char * const little = SvPV_const(float_real, len);
d6a28714 2142
33b8afdf 2143 if (SvTAIL(float_real)) {
d6a28714
JH
2144 if (memEQ(strend - len + 1, little, len - 1))
2145 last = strend - len + 1;
7fba1cd6 2146 else if (!multiline)
9041c2e3 2147 last = memEQ(strend - len, little, len)
bd61b366 2148 ? strend - len : NULL;
b8c5462f 2149 else
d6a28714
JH
2150 goto find_last;
2151 } else {
2152 find_last:
9041c2e3 2153 if (len)
d6a28714 2154 last = rninstr(s, strend, little, little + len);
b8c5462f 2155 else
a0288114 2156 last = strend; /* matching "$" */
b8c5462f 2157 }
b8c5462f 2158 }
bf93d4cc 2159 if (last == NULL) {
a3621e74 2160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2161 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2162 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2163 goto phooey; /* Should not happen! */
2164 }
d6a28714
JH
2165 dontbother = strend - last + prog->float_min_offset;
2166 }
2167 if (minlen && (dontbother < minlen))
2168 dontbother = minlen - 1;
2169 strend -= dontbother; /* this one's always in bytes! */
2170 /* We don't know much -- general case. */
1aa99e6b 2171 if (do_utf8) {
d6a28714 2172 for (;;) {
3b0527fe 2173 if (regtry(&reginfo, s))
d6a28714
JH
2174 goto got_it;
2175 if (s >= strend)
2176 break;
b8c5462f 2177 s += UTF8SKIP(s);
d6a28714
JH
2178 };
2179 }
2180 else {
2181 do {
3b0527fe 2182 if (regtry(&reginfo, s))
d6a28714
JH
2183 goto got_it;
2184 } while (s++ < strend);
2185 }
2186 }
2187
2188 /* Failure. */
2189 goto phooey;
2190
2191got_it:
2192 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2193
2194 if (PL_reg_eval_set) {
2195 /* Preserve the current value of $^R */
2196 if (oreplsv != GvSV(PL_replgv))
2197 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2198 restored, the value remains
2199 the same. */
4f639d21 2200 restore_pos(aTHX_ prog);
d6a28714
JH
2201 }
2202
2203 /* make sure $`, $&, $', and $digit will work later */
2204 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2205 RX_MATCH_COPY_FREE(prog);
d6a28714 2206 if (flags & REXEC_COPY_STR) {
be8e71aa 2207 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2208#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2209 if ((SvIsCOW(sv)
2210 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2211 if (DEBUG_C_TEST) {
2212 PerlIO_printf(Perl_debug_log,
2213 "Copy on write: regexp capture, type %d\n",
2214 (int) SvTYPE(sv));
2215 }
2216 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2217 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2218 assert (SvPOKp(prog->saved_copy));
2219 } else
2220#endif
2221 {
2222 RX_MATCH_COPIED_on(prog);
2223 s = savepvn(strbeg, i);
2224 prog->subbeg = s;
2225 }
d6a28714 2226 prog->sublen = i;
d6a28714
JH
2227 }
2228 else {
2229 prog->subbeg = strbeg;
2230 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2231 }
2232 }
9041c2e3 2233
d6a28714
JH
2234 return 1;
2235
2236phooey:
a3621e74 2237 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2238 PL_colors[4], PL_colors[5]));
d6a28714 2239 if (PL_reg_eval_set)
4f639d21 2240 restore_pos(aTHX_ prog);
d6a28714
JH
2241 return 0;
2242}
2243
2244/*
2245 - regtry - try match at specific point
2246 */
2247STATIC I32 /* 0 failure, 1 success */
3b0527fe 2248S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2249{
97aff369 2250 dVAR;
d6a28714
JH
2251 register I32 *sp;
2252 register I32 *ep;
2253 CHECKPOINT lastcp;
3b0527fe 2254 regexp *prog = reginfo->prog;
a3621e74 2255 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2256
02db2b7b
IZ
2257#ifdef DEBUGGING
2258 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2259#endif
d6a28714
JH
2260 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2261 MAGIC *mg;
2262
2263 PL_reg_eval_set = RS_init;
a3621e74 2264 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2265 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2266 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2267 ));
e8347627 2268 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2269 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2270 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2271 SAVETMPS;
2272 /* Apparently this is not needed, judging by wantarray. */
e8347627 2273 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2274 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2275
3b0527fe 2276 if (reginfo->sv) {
d6a28714 2277 /* Make $_ available to executed code. */
3b0527fe 2278 if (reginfo->sv != DEFSV) {
59f00321 2279 SAVE_DEFSV;
3b0527fe 2280 DEFSV = reginfo->sv;
b8c5462f 2281 }
d6a28714 2282
3b0527fe
DM
2283 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2284 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2285 /* prepare for quick setting of pos */
d300d9fa
NC
2286#ifdef PERL_OLD_COPY_ON_WRITE
2287 if (SvIsCOW(sv))
2288 sv_force_normal_flags(sv, 0);
2289#endif
3dab1dad 2290 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2291 &PL_vtbl_mglob, NULL, 0);
d6a28714 2292 mg->mg_len = -1;
b8c5462f 2293 }
d6a28714
JH
2294 PL_reg_magic = mg;
2295 PL_reg_oldpos = mg->mg_len;
4f639d21 2296 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2297 }
09687e5a 2298 if (!PL_reg_curpm) {
a02a5408 2299 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2300#ifdef USE_ITHREADS
2301 {
be8e71aa 2302 SV* const repointer = newSViv(0);
577e12cc 2303 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2304 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2305 av_push(PL_regex_padav,repointer);
2306 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2307 PL_regex_pad = AvARRAY(PL_regex_padav);
2308 }
2309#endif
2310 }
aaa362c4 2311 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2312 PL_reg_oldcurpm = PL_curpm;
2313 PL_curpm = PL_reg_curpm;
2314 if (RX_MATCH_COPIED(prog)) {
2315 /* Here is a serious problem: we cannot rewrite subbeg,
2316 since it may be needed if this match fails. Thus
2317 $` inside (?{}) could fail... */
2318 PL_reg_oldsaved = prog->subbeg;
2319 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2320#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2321 PL_nrs = prog->saved_copy;
2322#endif
d6a28714
JH
2323 RX_MATCH_COPIED_off(prog);
2324 }
2325 else
bd61b366 2326 PL_reg_oldsaved = NULL;
d6a28714
JH
2327 prog->subbeg = PL_bostr;
2328 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2329 }
973dddac 2330 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2331 PL_reginput = startpos;
2332 PL_regstartp = prog->startp;
2333 PL_regendp = prog->endp;
2334 PL_reglastparen = &prog->lastparen;
a01268b5 2335 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2336 prog->lastparen = 0;
03994de8 2337 prog->lastcloseparen = 0;
d6a28714 2338 PL_regsize = 0;
a3621e74 2339 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2340 if (PL_reg_start_tmpl <= prog->nparens) {
2341 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2342 if(PL_reg_start_tmp)
2343 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2344 else
a02a5408 2345 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2346 }
2347
2348 /* XXXX What this code is doing here?!!! There should be no need
2349 to do this again and again, PL_reglastparen should take care of
3dd2943c 2350 this! --ilya*/
dafc8851
JH
2351
2352 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2353 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2354 * PL_reglastparen), is not needed at all by the test suite
2355 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2356 * enough, for building DynaLoader, or otherwise this
2357 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2358 * will happen. Meanwhile, this code *is* needed for the
2359 * above-mentioned test suite tests to succeed. The common theme
2360 * on those tests seems to be returning null fields from matches.
2361 * --jhi */
dafc8851 2362#if 1
d6a28714
JH
2363 sp = prog->startp;
2364 ep = prog->endp;
2365 if (prog->nparens) {
097eb12c 2366 register I32 i;
eb160463 2367 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2368 *++sp = -1;
2369 *++ep = -1;
2370 }
2371 }
dafc8851 2372#endif
02db2b7b 2373 REGCP_SET(lastcp);
3b0527fe 2374 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2375 prog->endp[0] = PL_reginput - PL_bostr;
2376 return 1;
2377 }
02db2b7b 2378 REGCP_UNWIND(lastcp);
d6a28714
JH
2379 return 0;
2380}
2381
02db2b7b 2382
8ba1375e
MJD
2383#define sayYES goto yes
2384#define sayNO goto no
e0f9d4a8 2385#define sayNO_ANYOF goto no_anyof
8ba1375e 2386#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2387#define sayNO_FINAL goto no_final
2388#define sayNO_SILENT goto do_no
2389#define saySAME(x) if (x) goto yes; else goto no
2390
3ab3c9b4
HS
2391#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2392#define POSCACHE_SEEN 1 /* we know what we're caching */
2393#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2394
3ab3c9b4 2395#define CACHEsayYES STMT_START { \
d8319b27 2396 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2397 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2398 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2399 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2400 } \
2401 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2402 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2403 } \
2404 else { \
3ab3c9b4
HS
2405 /* cache records failure, but this is success */ \
2406 DEBUG_r( \
2407 PerlIO_printf(Perl_debug_log, \
2408 "%*s (remove success from failure cache)\n", \
2409 REPORT_CODE_OFF+PL_regindent*2, "") \
2410 ); \
d8319b27 2411 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2412 } \
2413 } \
2414 sayYES; \
2415} STMT_END
7409bbd3 2416
3ab3c9b4 2417#define CACHEsayNO STMT_START { \
d8319b27 2418 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2419 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2420 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2421 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2422 } \
2423 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2424 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2425 } \
2426 else { \
3ab3c9b4
HS
2427 /* cache records success, but this is failure */ \
2428 DEBUG_r( \
2429 PerlIO_printf(Perl_debug_log, \
2430 "%*s (remove failure from success cache)\n", \
2431 REPORT_CODE_OFF+PL_regindent*2, "") \
2432 ); \
d8319b27 2433 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2434 } \
2435 } \
2436 sayNO; \
2437} STMT_END
2438
a3621e74
YO
2439/* this is used to determine how far from the left messages like
2440 'failed...' are printed. Currently 29 makes these messages line
2441 up with the opcode they refer to. Earlier perls used 25 which
2442 left these messages outdented making reviewing a debug output
2443 quite difficult.
2444*/
2445#define REPORT_CODE_OFF 29
2446
2447
2448/* Make sure there is a test for this +1 options in re_tests */
2449#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2450
40a82448
DM
2451#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2452#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2453
86545054
DM
2454#define SLAB_FIRST(s) (&(s)->states[0])
2455#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2456
5d9a96ca
DM
2457/* grab a new slab and return the first slot in it */
2458
2459STATIC regmatch_state *
2460S_push_slab(pTHX)
2461{
54df2634
NC
2462#if PERL_VERSION < 9
2463 dMY_CXT;
2464#endif
5d9a96ca
DM
2465 regmatch_slab *s = PL_regmatch_slab->next;
2466 if (!s) {
2467 Newx(s, 1, regmatch_slab);
2468 s->prev = PL_regmatch_slab;
2469 s->next = NULL;
2470 PL_regmatch_slab->next = s;
2471 }
2472 PL_regmatch_slab = s;
86545054 2473 return SLAB_FIRST(s);
5d9a96ca 2474}
5b47454d 2475
95b24440
DM
2476/* simulate a recursive call to regmatch */
2477
2478#define REGMATCH(ns, where) \
5d9a96ca
DM
2479 st->scan = scan; \
2480 scan = (ns); \
2481 st->resume_state = resume_##where; \
95b24440
DM
2482 goto start_recurse; \
2483 resume_point_##where:
2484
40a82448
DM
2485/* push a new state then goto it */
2486
2487#define PUSH_STATE_GOTO(state, node) \
2488 scan = node; \
2489 st->resume_state = state; \
2490 goto push_state;
2491
2492/* push a new state with success backtracking, then goto it */
2493
2494#define PUSH_YES_STATE_GOTO(state, node) \
2495 scan = node; \
2496 st->resume_state = state; \
2497 goto push_yes_state;
2498
aa283a38 2499
aa283a38 2500
d6a28714
JH
2501/*
2502 - regmatch - main matching routine
2503 *
2504 * Conceptually the strategy is simple: check to see whether the current
2505 * node matches, call self recursively to see whether the rest matches,
2506 * and then act accordingly. In practice we make some effort to avoid
2507 * recursion, in particular by going through "ordinary" nodes (that don't
2508 * need to know whether the rest of the match failed) by a loop instead of
2509 * by recursion.
2510 */
2511/* [lwall] I've hoisted the register declarations to the outer block in order to
2512 * maybe save a little bit of pushing and popping on the stack. It also takes
2513 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2514 *
2515 * This function used to be heavily recursive, but since this had the
2516 * effect of blowing the CPU stack on complex regexes, it has been
2517 * restructured to be iterative, and to save state onto the heap rather
2518 * than the stack. Essentially whereever regmatch() used to be called, it
2519 * pushes the current state, notes where to return, then jumps back into
2520 * the main loop.
2521 *
2522 * Originally the structure of this function used to look something like
2523
2524 S_regmatch() {
2525 int a = 1, b = 2;
2526 ...
2527 while (scan != NULL) {
5d9a96ca 2528 a++; // do stuff with a and b
95b24440
DM
2529 ...
2530 switch (OP(scan)) {
2531 case FOO: {
2532 int local = 3;
2533 ...
2534 if (regmatch(...)) // recurse
2535 goto yes;
2536 }
2537 ...
2538 }
2539 }
2540 yes:
2541 return 1;
2542 }
2543
2544 * Now it looks something like this:
2545
5d9a96ca 2546 typedef struct {
95b24440
DM
2547 int a, b, local;
2548 int resume_state;
5d9a96ca 2549 } regmatch_state;
95b24440
DM
2550
2551 S_regmatch() {
5d9a96ca
DM
2552 regmatch_state *st = new();
2553 int depth=0;
2554 st->a++; // do stuff with a and b
95b24440
DM
2555 ...
2556 while (scan != NULL) {
2557 ...
2558 switch (OP(scan)) {
2559 case FOO: {
5d9a96ca 2560 st->local = 3;
95b24440 2561 ...
5d9a96ca
DM
2562 st->scan = scan;
2563 scan = ...;
2564 st->resume_state = resume_FOO;
2565 goto start_recurse; // recurse
95b24440 2566
5d9a96ca
DM
2567 resume_point_FOO:
2568 if (result)
95b24440
DM
2569 goto yes;
2570 }
2571 ...
2572 }
5d9a96ca
DM
2573 start_recurse:
2574 st = new(); push a new state
2575 st->a = 1; st->b = 2;
2576 depth++;
95b24440 2577 }
5d9a96ca 2578 yes:
95b24440 2579 result = 1;
5d9a96ca
DM
2580 if (depth--) {
2581 st = pop();
95b24440
DM
2582 switch (resume_state) {
2583 case resume_FOO:
2584 goto resume_point_FOO;
2585 ...
2586 }
2587 }
2588 return result
2589 }
2590
2591 * WARNING: this means that any line in this function that contains a
2592 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2593 * regmatch() using gotos instead. Thus the values of any local variables
2594 * not saved in the regmatch_state structure will have been lost when
2595 * execution resumes on the next line .
5d9a96ca
DM
2596 *
2597 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2598 * PL_regmatch_state always points to the currently active state, and
2599 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2600 * The first time regmatch is called, the first slab is allocated, and is
2601 * never freed until interpreter desctruction. When the slab is full,
2602 * a new one is allocated chained to the end. At exit from regmatch, slabs
2603 * allocated since entry are freed.
d6a28714 2604 */
95b24440 2605
40a82448 2606/* *** every FOO_fail should = FOO+1 */
166ba7cd
DM
2607#define TRIE_next (REGNODE_MAX+1)
2608#define TRIE_next_fail (REGNODE_MAX+2)
40a82448
DM
2609#define EVAL_A (REGNODE_MAX+3)
2610#define EVAL_A_fail (REGNODE_MAX+4)
2611#define resume_CURLYX (REGNODE_MAX+5)
2612#define resume_WHILEM1 (REGNODE_MAX+6)
2613#define resume_WHILEM2 (REGNODE_MAX+7)
2614#define resume_WHILEM3 (REGNODE_MAX+8)
2615#define resume_WHILEM4 (REGNODE_MAX+9)
2616#define resume_WHILEM5 (REGNODE_MAX+10)
2617#define resume_WHILEM6 (REGNODE_MAX+11)
2618#define BRANCH_next (REGNODE_MAX+12)
2619#define BRANCH_next_fail (REGNODE_MAX+13)
2620#define CURLYM_A (REGNODE_MAX+14)
2621#define CURLYM_A_fail (REGNODE_MAX+15)
2622#define CURLYM_B (REGNODE_MAX+16)
2623#define CURLYM_B_fail (REGNODE_MAX+17)
2624#define IFMATCH_A (REGNODE_MAX+18)
2625#define IFMATCH_A_fail (REGNODE_MAX+19)
2626#define resume_PLUS1 (REGNODE_MAX+20)
2627#define resume_PLUS2 (REGNODE_MAX+21)
2628#define resume_PLUS3 (REGNODE_MAX+22)
2629#define resume_PLUS4 (REGNODE_MAX+23)
2630
2631
2632
3dab1dad 2633#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2634
07be1b83
YO
2635#ifdef DEBUGGING
2636STATIC void
2637S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2638{
2639 const int docolor = *PL_colors[0];
2640 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2641 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2642 /* The part of the string before starttry has one color
2643 (pref0_len chars), between starttry and current
2644 position another one (pref_len - pref0_len chars),
2645 after the current position the third one.
2646 We assume that pref0_len <= pref_len, otherwise we
2647 decrease pref0_len. */
2648 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2649 ? (5 + taill) - l : locinput - PL_bostr;
2650 int pref0_len;
2651
2652 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2653 pref_len++;
2654 pref0_len = pref_len - (locinput - PL_reg_starttry);
2655 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2656 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2657 ? (5 + taill) - pref_len : PL_regeol - locinput);
2658 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2659 l--;
2660 if (pref0_len < 0)
2661 pref0_len = 0;
2662 if (pref0_len > pref_len)
2663 pref0_len = pref_len;
2664 {
2665 const char * const s0 =
2666 do_utf8 && OP(scan) != CANY ?
2667 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2668 pref0_len, 60, UNI_DISPLAY_REGEX) :
2669 locinput - pref_len;
2670 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2671 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2672 pv_uni_display(PERL_DEBUG_PAD(1),
2673 (U8*)(locinput - pref_len + pref0_len),
2674 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2675 locinput - pref_len + pref0_len;
2676 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2677 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2678 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2679 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2680 locinput;
2681 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2682 PerlIO_printf(Perl_debug_log,
2683 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2684 (IV)(locinput - PL_bostr),
2685 PL_colors[4],
2686 len0, s0,
2687 PL_colors[5],
2688 PL_colors[2],
2689 len1, s1,
2690 PL_colors[3],
2691 (docolor ? "" : "> <"),
2692 PL_colors[0],
2693 len2, s2,
2694 PL_colors[1],
2695 15 - l - pref_len + 1,
2696 "");
2697 }
2698}
2699#endif
2700
d6a28714 2701STATIC I32 /* 0 failure, 1 success */
3b0527fe 2702S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2703{
54df2634
NC
2704#if PERL_VERSION < 9
2705 dMY_CXT;
2706#endif
27da23d5 2707 dVAR;
95b24440 2708 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2709 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2710
3b0527fe
DM
2711 regexp *rex = reginfo->prog;
2712
5d9a96ca
DM
2713 regmatch_slab *orig_slab;
2714 regmatch_state *orig_state;
a3621e74 2715
5d9a96ca
DM
2716 /* the current state. This is a cached copy of PL_regmatch_state */
2717 register regmatch_state *st;
95b24440 2718
5d9a96ca
DM
2719 /* cache heavy used fields of st in registers */
2720 register regnode *scan;
2721 register regnode *next;
2722 register I32 n = 0; /* initialize to shut up compiler warning */
2723 register char *locinput = PL_reginput;
95b24440 2724
5d9a96ca
DM
2725 /* these variables are NOT saved during a recusive RFEGMATCH: */
2726 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2727 bool result; /* return value of S_regmatch */
5d9a96ca 2728 int depth = 0; /* depth of recursion */
77cb431f
DM
2729 regmatch_state *yes_state = NULL; /* state to pop to on success of
2730 subpattern */
40a82448 2731 U32 state_num;
95b24440
DM
2732
2733#ifdef DEBUGGING
e68ec53f 2734 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2735 PL_regindent++;
2736#endif
2737
5d9a96ca
DM
2738 /* on first ever call to regmatch, allocate first slab */
2739 if (!PL_regmatch_slab) {
2740 Newx(PL_regmatch_slab, 1, regmatch_slab);
2741 PL_regmatch_slab->prev = NULL;
2742 PL_regmatch_slab->next = NULL;
86545054 2743 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2744 }
2745
2746 /* remember current high-water mark for exit */
2747 /* XXX this should be done with SAVE* instead */
2748 orig_slab = PL_regmatch_slab;
2749 orig_state = PL_regmatch_state;
2750
2751 /* grab next free state slot */
2752 st = ++PL_regmatch_state;
86545054 2753 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2754 st = PL_regmatch_state = S_push_slab(aTHX);
2755
2756 st->minmod = 0;
2757 st->sw = 0;
2758 st->logical = 0;
5d9a96ca 2759 st->cc = NULL;
d6a28714
JH
2760 /* Note that nextchr is a byte even in UTF */
2761 nextchr = UCHARAT(locinput);
2762 scan = prog;
2763 while (scan != NULL) {
8ba1375e 2764
a3621e74 2765 DEBUG_EXECUTE_r( {
6136c704 2766 SV * const prop = sv_newmortal();
07be1b83 2767 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2768 regprop(rex, prop, scan);
07be1b83
YO
2769
2770 PerlIO_printf(Perl_debug_log,
2771 "%3"IVdf":%*s%s(%"IVdf")\n",
2772 (IV)(scan - rex->program), PL_regindent*2, "",
2773 SvPVX_const(prop),
2774 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2775 });
d6a28714
JH
2776
2777 next = scan + NEXT_OFF(scan);
2778 if (next == scan)
2779 next = NULL;
40a82448 2780 state_num = OP(scan);
d6a28714 2781
40a82448
DM
2782 reenter_switch:
2783 switch (state_num) {
d6a28714 2784 case BOL:
7fba1cd6 2785 if (locinput == PL_bostr)
d6a28714 2786 {
3b0527fe 2787 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2788 break;
2789 }
d6a28714
JH
2790 sayNO;
2791 case MBOL:
12d33761
HS
2792 if (locinput == PL_bostr ||
2793 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2794 {
b8c5462f
JH
2795 break;
2796 }
d6a28714
JH
2797 sayNO;
2798 case SBOL:
c2a73568 2799 if (locinput == PL_bostr)
b8c5462f 2800 break;
d6a28714
JH
2801 sayNO;
2802 case GPOS:
3b0527fe 2803 if (locinput == reginfo->ganch)
d6a28714
JH
2804 break;
2805 sayNO;
2806 case EOL:
d6a28714
JH
2807 goto seol;
2808 case MEOL:
d6a28714 2809 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2810 sayNO;
b8c5462f 2811 break;
d6a28714
JH
2812 case SEOL:
2813 seol:
2814 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2815 sayNO;
d6a28714 2816 if (PL_regeol - locinput > 1)
b8c5462f 2817 sayNO;
b8c5462f 2818 break;
d6a28714
JH
2819 case EOS:
2820 if (PL_regeol != locinput)
b8c5462f 2821 sayNO;
d6a28714 2822 break;
ffc61ed2 2823 case SANY:
d6a28714 2824 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2825 sayNO;
f33976b4
DB
2826 if (do_utf8) {
2827 locinput += PL_utf8skip[nextchr];
2828 if (locinput > PL_regeol)
2829 sayNO;
2830 nextchr = UCHARAT(locinput);
2831 }
2832 else
2833 nextchr = UCHARAT(++locinput);
2834 break;
2835 case CANY:
2836 if (!nextchr && locinput >= PL_regeol)
2837 sayNO;
b8c5462f 2838 nextchr = UCHARAT(++locinput);
a0d0e21e 2839 break;
ffc61ed2 2840 case REG_ANY:
1aa99e6b
IH
2841 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2842 sayNO;
2843 if (do_utf8) {
b8c5462f 2844 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2845 if (locinput > PL_regeol)
2846 sayNO;
a0ed51b3 2847 nextchr = UCHARAT(locinput);
a0ed51b3 2848 }
1aa99e6b
IH
2849 else
2850 nextchr = UCHARAT(++locinput);
a0ed51b3 2851 break;
166ba7cd
DM
2852
2853#undef ST
2854#define ST st->u.trie
2855
5b47454d 2856 case TRIE:
3dab1dad 2857 {
07be1b83 2858 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2859 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2860 trie_type = do_utf8 ?
2861 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2862 : trie_plain;
2863
2864 /* what trie are we using right now */
be8e71aa 2865 reg_trie_data * const trie
3dab1dad
YO
2866 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2867 U32 state = trie->startstate;
166ba7cd
DM
2868
2869 U8 *uc = ( U8* )locinput;
2870 U16 charid = 0;
2871 U32 base = 0;
2872 UV uvc = 0;
2873 STRLEN len = 0;
2874 STRLEN foldlen = 0;
2875 U8 *uscan = (U8*)NULL;
2876 STRLEN bufflen=0;
2877 SV *sv_accept_buff = NULL;
2878 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2879
2880 ST.accepted = 0; /* how many accepting states we have seen */
2881 ST.B = next;
2882#ifdef DEBUGGING
2883 ST.me = scan;
2884#endif
3dab1dad
YO
2885
2886 if (trie->bitmap && trie_type != trie_utf8_fold &&
2887 !TRIE_BITMAP_TEST(trie,*locinput)
2888 ) {
2889 if (trie->states[ state ].wordnum) {
2890 DEBUG_EXECUTE_r(
2891 PerlIO_printf(Perl_debug_log,
2892 "%*s %smatched empty string...%s\n",
2893 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2894 );
2895 break;
2896 } else {
2897 DEBUG_EXECUTE_r(
2898 PerlIO_printf(Perl_debug_log,
2899 "%*s %sfailed to match start class...%s\n",
2900 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2901 );
2902 sayNO_SILENT;
2903 }
2904 }
166ba7cd 2905
07be1b83
YO
2906 /*
2907 traverse the TRIE keeping track of all accepting states
2908 we transition through until we get to a failing node.
2909 */
2910
a3621e74
YO
2911 while ( state && uc <= (U8*)PL_regeol ) {
2912
5b47454d 2913 if (trie->states[ state ].wordnum) {
166ba7cd 2914 if (!ST.accepted ) {
5b47454d
DM
2915 ENTER;
2916 SAVETMPS;
2917 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2918 sv_accept_buff=newSV(bufflen *
2919 sizeof(reg_trie_accepted) - 1);
2920 SvCUR_set(sv_accept_buff,
2921 sizeof(reg_trie_accepted));
2922 SvPOK_on(sv_accept_buff);
2923 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2924 SAVETMPS;
2925 ST.accept_buff =
5b47454d
DM
2926 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2927 }
2928 else {
166ba7cd 2929 if (ST.accepted >= bufflen) {
5b47454d 2930 bufflen *= 2;
166ba7cd 2931 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2932 SvGROW(sv_accept_buff,
2933 bufflen * sizeof(reg_trie_accepted));
2934 }
2935 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2936 + sizeof(reg_trie_accepted));
2937 }
166ba7cd
DM
2938 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2939 ST.accept_buff[ST.accepted].endpos = uc;
2940 ++ST.accepted;
5b47454d 2941 }
a3621e74
YO
2942
2943 base = trie->states[ state ].trans.base;
2944
07be1b83
YO
2945 DEBUG_TRIE_EXECUTE_r({
2946 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2947 PerlIO_printf( Perl_debug_log,
e4584336 2948 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2949 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2950 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2951 });
a3621e74
YO
2952
2953 if ( base ) {
5b47454d 2954 switch (trie_type) {
3dab1dad 2955 case trie_utf8_fold:
a3621e74
YO
2956 if ( foldlen>0 ) {
2957 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2958 foldlen -= len;
2959 uscan += len;
2960 len=0;
2961 } else {
2962 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2963 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2964 foldlen -= UNISKIP( uvc );
2965 uscan = foldbuf + UNISKIP( uvc );
2966 }
5b47454d
DM
2967 break;
2968 case trie_utf8:
2969 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2970 &len, uniflags );
2971 break;
2972 case trie_plain:
e4584336 2973 uvc = (UV)*uc;
a3621e74
YO
2974 len = 1;
2975 }
2976
5b47454d
DM
2977 if (uvc < 256) {
2978 charid = trie->charmap[ uvc ];
2979 }
2980 else {
2981 charid = 0;
2982 if (trie->widecharmap) {
3dab1dad 2983 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2984 (char*)&uvc, sizeof(UV), 0);
2985 if (svpp)
2986 charid = (U16)SvIV(*svpp);
2987 }
2988 }
a3621e74 2989
5b47454d
DM
2990 if (charid &&
2991 (base + charid > trie->uniquecharcount )
2992 && (base + charid - 1 - trie->uniquecharcount
2993 < trie->lasttrans)
2994 && trie->trans[base + charid - 1 -
2995 trie->uniquecharcount].check == state)
2996 {
2997 state = trie->trans[base + charid - 1 -
2998 trie->uniquecharcount ].next;
2999 }
3000 else {
3001 state = 0;
3002 }
3003 uc += len;
3004
3005 }
3006 else {
a3621e74
YO
3007 state = 0;
3008 }
3009 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3010 PerlIO_printf( Perl_debug_log,
3011 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3012 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3013 );
3014 }
166ba7cd 3015 if (!ST.accepted )
a3621e74 3016 sayNO;
a3621e74 3017
166ba7cd
DM
3018 DEBUG_EXECUTE_r(
3019 PerlIO_printf( Perl_debug_log,
3020 "%*s %sgot %"IVdf" possible matches%s\n",
3021 REPORT_CODE_OFF + PL_regindent * 2, "",
3022 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3023 );
3024 }
3025
3026 /* FALL THROUGH */
3027
3028 case TRIE_next_fail: /* we failed - try next alterative */
3029
3030 if ( ST.accepted == 1 ) {
3031 /* only one choice left - just continue */
3032 DEBUG_EXECUTE_r({
3033 reg_trie_data * const trie
3034 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3035 SV ** const tmp = RX_DEBUG(reginfo->prog)
3036 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
3037 : NULL;
3038 PerlIO_printf( Perl_debug_log,
3039 "%*s %sonly one match left: #%d <%s>%s\n",
3040 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3041 ST.accept_buff[ 0 ].wordnum,
3042 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3043 PL_colors[5] );
3044 });
3045 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3046 /* in this case we free tmps/leave before we call regmatch
3047 as we wont be using accept_buff again. */
3048 FREETMPS;
3049 LEAVE;
3050 locinput = PL_reginput;
3051 nextchr = UCHARAT(locinput);
3052 scan = ST.B;
3053 continue; /* execute rest of RE */
3054 }
3055
3056 if (!ST.accepted-- ) {
3057 FREETMPS;
3058 LEAVE;
3059 sayNO;
3060 }
3061
a3621e74 3062 /*
166ba7cd
DM
3063 There are at least two accepting states left. Presumably
3064 the number of accepting states is going to be low,
3065 typically two. So we simply scan through to find the one
3066 with lowest wordnum. Once we find it, we swap the last
3067 state into its place and decrement the size. We then try to
3068 match the rest of the pattern at the point where the word
3069 ends. If we succeed, control just continues along the
3070 regex; if we fail we return here to try the next accepting
3071 state
3072 */
a3621e74 3073
166ba7cd
DM
3074 {
3075 U32 best = 0;
3076 U32 cur;
3077 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3078 DEBUG_TRIE_EXECUTE_r(
f2278c82 3079 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
3080 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3081 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
3082 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3083 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3084 );
3085
3086 if (ST.accept_buff[cur].wordnum <
3087 ST.accept_buff[best].wordnum)
3088 best = cur;
a3621e74 3089 }
166ba7cd
DM
3090
3091 DEBUG_EXECUTE_r({
3092 reg_trie_data * const trie
3093 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3094 SV ** const tmp = RX_DEBUG(reginfo->prog)
3095 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
3096 : NULL;
3097 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3098 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3099 ST.accept_buff[best].wordnum,
3100 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3101 PL_colors[5] );
3102 });
3103
3104 if ( best<ST.accepted ) {
3105 reg_trie_accepted tmp = ST.accept_buff[ best ];
3106 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3107 ST.accept_buff[ ST.accepted ] = tmp;
3108 best = ST.accepted;
a3621e74 3109 }
166ba7cd
DM
3110 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3111 }
3112 PUSH_STATE_GOTO(TRIE_next, ST.B);
3113 /* NOTREACHED */
3114
3115#undef ST
3116
95b24440
DM
3117 case EXACT: {
3118 char *s = STRING(scan);
5d9a96ca 3119 st->ln = STR_LEN(scan);
eb160463 3120 if (do_utf8 != UTF) {
bc517b45 3121 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3122 char *l = locinput;
be8e71aa 3123 const char * const e = s + st->ln;
a72c7584 3124
5ff6fc6d
JH
3125 if (do_utf8) {
3126 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3127 while (s < e) {
a3b680e6 3128 STRLEN ulen;
1aa99e6b 3129 if (l >= PL_regeol)
5ff6fc6d
JH
3130 sayNO;
3131 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3132 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3133 uniflags))
5ff6fc6d 3134 sayNO;
bc517b45 3135 l += ulen;
5ff6fc6d 3136 s ++;
1aa99e6b 3137 }
5ff6fc6d
JH
3138 }
3139 else {
3140 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3141 while (s < e) {
a3b680e6 3142 STRLEN ulen;
1aa99e6b
IH
3143 if (l >= PL_regeol)
3144 sayNO;
5ff6fc6d 3145 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3146 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3147 uniflags))
1aa99e6b 3148 sayNO;
bc517b45 3149 s += ulen;
a72c7584 3150 l ++;
1aa99e6b 3151 }
5ff6fc6d 3152 }
1aa99e6b
IH
3153 locinput = l;
3154 nextchr = UCHARAT(locinput);
3155 break;
3156 }
bc517b45 3157 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3158 /* Inline the first character, for speed. */
3159 if (UCHARAT(s) != nextchr)
3160 sayNO;
5d9a96ca 3161 if (PL_regeol - locinput < st->ln)
d6a28714 3162 sayNO;
5d9a96ca 3163 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3164 sayNO;
5d9a96ca 3165 locinput += st->ln;
d6a28714
JH
3166 nextchr = UCHARAT(locinput);
3167 break;
95b24440 3168 }
d6a28714 3169 case EXACTFL:
b8c5462f
JH
3170 PL_reg_flags |= RF_tainted;
3171 /* FALL THROUGH */
95b24440 3172 case EXACTF: {
be8e71aa 3173 char * const s = STRING(scan);
5d9a96ca 3174 st->ln = STR_LEN(scan);
d6a28714 3175
d07ddd77
JH
3176 if (do_utf8 || UTF) {
3177 /* Either target or the pattern are utf8. */
be8e71aa 3178 const char * const l = locinput;
d07ddd77 3179 char *e = PL_regeol;
bc517b45 3180
5d9a96ca 3181 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3182 l, &e, 0, do_utf8)) {
5486206c
JH
3183 /* One more case for the sharp s:
3184 * pack("U0U*", 0xDF) =~ /ss/i,
3185 * the 0xC3 0x9F are the UTF-8
3186 * byte sequence for the U+00DF. */
3187 if (!(do_utf8 &&
3188 toLOWER(s[0]) == 's' &&
5d9a96ca 3189 st->ln >= 2 &&
5486206c
JH
3190 toLOWER(s[1]) == 's' &&
3191 (U8)l[0] == 0xC3 &&
3192 e - l >= 2 &&
3193 (U8)l[1] == 0x9F))
3194 sayNO;
3195 }
d07ddd77
JH
3196 locinput = e;
3197 nextchr = UCHARAT(locinput);
3198 break;
a0ed51b3 3199 }
d6a28714 3200
bc517b45
JH
3201 /* Neither the target and the pattern are utf8. */
3202
d6a28714
JH
3203 /* Inline the first character, for speed. */
3204 if (UCHARAT(s) != nextchr &&
3205 UCHARAT(s) != ((OP(scan) == EXACTF)
3206 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3207 sayNO;
5d9a96ca 3208 if (PL_regeol - locinput < st->ln)
b8c5462f 3209 sayNO;
5d9a96ca
DM
3210 if (st->ln > 1 && (OP(scan) == EXACTF
3211 ? ibcmp(s, locinput, st->ln)
3212 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3213 sayNO;
5d9a96ca 3214 locinput += st->ln;
d6a28714 3215 nextchr = UCHARAT(locinput);
a0d0e21e 3216 break;
95b24440 3217 }
d6a28714 3218 case ANYOF:
ffc61ed2 3219 if (do_utf8) {
9e55ce06
JH
3220 STRLEN inclasslen = PL_regeol - locinput;
3221
32fc9b6a 3222 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3223 sayNO_ANYOF;
ffc61ed2
JH
3224 if (locinput >= PL_regeol)
3225 sayNO;
0f0076b4 3226 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3227 nextchr = UCHARAT(locinput);
e0f9d4a8 3228 break;
ffc61ed2
JH
3229 }
3230 else {
3231 if (nextchr < 0)
3232 nextchr = UCHARAT(locinput);
32fc9b6a 3233 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3234 sayNO_ANYOF;
ffc61ed2
JH
3235 if (!nextchr && locinput >= PL_regeol)
3236 sayNO;
3237 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3238 break;
3239 }
3240 no_anyof:
3241 /* If we might have the case of the German sharp s
3242 * in a casefolding Unicode character class. */
3243
ebc501f0
JH
3244 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3245 locinput += SHARP_S_SKIP;
e0f9d4a8 3246 nextchr = UCHARAT(locinput);
ffc61ed2 3247 }
e0f9d4a8
JH
3248 else
3249 sayNO;
b8c5462f 3250 break;
d6a28714 3251 case ALNUML:
b8c5462f
JH
3252 PL_reg_flags |= RF_tainted;
3253 /* FALL THROUGH */
d6a28714 3254 case ALNUM:
b8c5462f 3255 if (!nextchr)
4633a7c4 3256 sayNO;
ffc61ed2 3257 if (do_utf8) {
1a4fad37 3258 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3259 if (!(OP(scan) == ALNUM
bb7a0f54 3260 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3261 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3262 {
3263 sayNO;
a0ed51b3 3264 }
b8c5462f 3265 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3266 nextchr = UCHARAT(locinput);
3267 break;
3268 }
ffc61ed2 3269 if (!(OP(scan) == ALNUM
d6a28714 3270 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3271 sayNO;
b8c5462f 3272 nextchr = UCHARAT(++locinput);
a0d0e21e 3273 break;
d6a28714 3274 case NALNUML:
b8c5462f
JH
3275 PL_reg_flags |= RF_tainted;
3276 /* FALL THROUGH */
d6a28714
JH
3277 case NALNUM:
3278 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3279 sayNO;
ffc61ed2 3280 if (do_utf8) {
1a4fad37 3281 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3282 if (OP(scan) == NALNUM
bb7a0f54 3283 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3284 : isALNUM_LC_utf8((U8*)locinput))
3285 {
b8c5462f 3286 sayNO;
d6a28714 3287 }
b8c5462f
JH
3288 locinput += PL_utf8skip[nextchr];
3289 nextchr = UCHARAT(locinput);
3290 break;
3291 }
ffc61ed2 3292 if (OP(scan) == NALNUM
d6a28714 3293 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3294 sayNO;
76e3520e 3295 nextchr = UCHARAT(++locinput);
a0d0e21e 3296 break;
d6a28714
JH
3297 case BOUNDL:
3298 case NBOUNDL:
3280af22 3299 PL_reg_flags |= RF_tainted;
bbce6d69 3300 /* FALL THROUGH */
d6a28714
JH
3301 case BOUND:
3302 case NBOUND:
3303 /* was last char in word? */
ffc61ed2 3304 if (do_utf8) {
12d33761 3305 if (locinput == PL_bostr)
5d9a96ca 3306 st->ln = '\n';
ffc61ed2 3307 else {
a3b680e6 3308 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3309
4ad0818d 3310 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3311 }
3312 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3313 st->ln = isALNUM_uni(st->ln);
1a4fad37 3314 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3315 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3316 }
3317 else {
5d9a96ca 3318 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3319 n = isALNUM_LC_utf8((U8*)locinput);
3320 }
a0ed51b3 3321 }
d6a28714 3322 else {
5d9a96ca 3323 st->ln = (locinput != PL_bostr) ?
12d33761 3324 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3325 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3326 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3327 n = isALNUM(nextchr);
3328 }
3329 else {
5d9a96ca 3330 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3331 n = isALNUM_LC(nextchr);
3332 }
d6a28714 3333 }
5d9a96ca 3334 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3335 OP(scan) == BOUNDL))
3336 sayNO;
a0ed51b3 3337 break;
d6a28714 3338 case SPACEL:
3280af22 3339 PL_reg_flags |= RF_tainted;
bbce6d69 3340 /* FALL THROUGH */
d6a28714 3341 case SPACE:
9442cb0e 3342 if (!nextchr)
4633a7c4 3343 sayNO;
1aa99e6b 3344 if (do_utf8) {
fd400ab9 3345 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3346 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3347 if (!(OP(scan) == SPACE
bb7a0f54 3348 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3349 : isSPACE_LC_utf8((U8*)locinput)))
3350 {
3351 sayNO;
3352 }
3353 locinput += PL_utf8skip[nextchr];
3354 nextchr = UCHARAT(locinput);
3355 break;
d6a28714 3356 }
ffc61ed2
JH
3357 if (!(OP(scan) == SPACE
3358 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3359 sayNO;
3360 nextchr = UCHARAT(++locinput);
3361 }
3362 else {
3363 if (!(OP(scan) == SPACE
3364 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3365 sayNO;
3366 nextchr = UCHARAT(++locinput);
a0ed51b3 3367 }
a0ed51b3 3368 break;
d6a28714 3369 case NSPACEL:
3280af22 3370 PL_reg_flags |= RF_tainted;
bbce6d69 3371 /* FALL THROUGH */
d6a28714 3372 case NSPACE:
9442cb0e 3373 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3374 sayNO;
1aa99e6b 3375 if (do_utf8) {
1a4fad37 3376 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3377 if (OP(scan) == NSPACE
bb7a0f54 3378 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3379 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3380 {
3381 sayNO;
3382 }
3383 locinput += PL_utf8skip[nextchr];
3384 nextchr = UCHARAT(locinput);
3385 break;
a0ed51b3 3386 }
ffc61ed2 3387 if (OP(scan) == NSPACE
d6a28714 3388 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3389 sayNO;
76e3520e 3390 nextchr = UCHARAT(++locinput);
a0d0e21e 3391 break;
d6a28714 3392 case DIGITL:
a0ed51b3
LW
3393 PL_reg_flags |= RF_tainted;
3394 /* FALL THROUGH */
d6a28714 3395 case DIGIT:
9442cb0e 3396 if (!nextchr)
a0ed51b3 3397 sayNO;
1aa99e6b 3398 if (do_utf8) {
1a4fad37 3399 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3400 if (!(OP(scan) == DIGIT
bb7a0f54 3401 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3402 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3403 {
a0ed51b3 3404 sayNO;
dfe13c55 3405 }
6f06b55f 3406 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3407 nextchr = UCHARAT(locinput);
3408 break;
3409 }
ffc61ed2 3410 if (!(OP(scan) == DIGIT
9442cb0e 3411 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3412 sayNO;
3413 nextchr = UCHARAT(++locinput);
3414 break;
d6a28714 3415 case NDIGITL:
b8c5462f
JH
3416 PL_reg_flags |= RF_tainted;
3417 /* FALL THROUGH */
d6a28714 3418 case NDIGIT:
9442cb0e 3419 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3420 sayNO;
1aa99e6b 3421 if (do_utf8) {
1a4fad37 3422 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3423 if (OP(scan) == NDIGIT
bb7a0f54 3424 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3425 : isDIGIT_LC_utf8((U8*)locinput))
3426 {
a0ed51b3 3427 sayNO;
9442cb0e 3428 }
6f06b55f 3429 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3430 nextchr = UCHARAT(locinput);
3431 break;
3432 }
ffc61ed2 3433 if (OP(scan) == NDIGIT
9442cb0e 3434 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3435 sayNO;
3436 nextchr = UCHARAT(++locinput);
3437 break;
3438 case CLUMP:
b7c83a7e 3439 if (locinput >= PL_regeol)
a0ed51b3 3440 sayNO;
b7c83a7e 3441 if (do_utf8) {
1a4fad37 3442 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3443 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3444 sayNO;
3445 locinput += PL_utf8skip[nextchr];
3446 while (locinput < PL_regeol &&
3447 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3448 locinput += UTF8SKIP(locinput);
3449 if (locinput > PL_regeol)
3450 sayNO;
eb08e2da
JH
3451 }
3452 else
3453 locinput++;
a0ed51b3
LW
3454 nextchr = UCHARAT(locinput);
3455 break;
c8756f30 3456 case REFFL:
3280af22 3457 PL_reg_flags |= RF_tainted;
c8756f30 3458 /* FALL THROUGH */
c277df42 3459 case REF:
95b24440
DM
3460 case REFF: {
3461 char *s;
c277df42 3462 n = ARG(scan); /* which paren pair */
5d9a96ca 3463 st->ln = PL_regstartp[n];
2c2d71f5 3464 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3465 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3466 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3467 if (st->ln == PL_regendp[n])
a0d0e21e 3468 break;
a0ed51b3 3469
5d9a96ca 3470 s = PL_bostr + st->ln;
1aa99e6b 3471 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3472 char *l = locinput;
a3b680e6 3473 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3474 /*
3475 * Note that we can't do the "other character" lookup trick as
3476 * in the 8-bit case (no pun intended) because in Unicode we
3477 * have to map both upper and title case to lower case.
3478 */
3479 if (OP(scan) == REFF) {
3480 while (s < e) {
a3b680e6
AL
3481 STRLEN ulen1, ulen2;
3482 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3483 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3484
a0ed51b3
LW
3485 if (l >= PL_regeol)
3486 sayNO;
a2a2844f
JH
3487 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3488 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3489 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3490 sayNO;
a2a2844f
JH
3491 s += ulen1;
3492 l += ulen2;
a0ed51b3
LW
3493 }
3494 }
3495 locinput = l;
3496 nextchr = UCHARAT(locinput);
3497 break;
3498 }
3499
a0d0e21e 3500 /* Inline the first character, for speed. */
76e3520e 3501 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3502 (OP(scan) == REF ||
3503 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3504 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3505 sayNO;
5d9a96ca
DM
3506 st->ln = PL_regendp[n] - st->ln;
3507 if (locinput + st->ln > PL_regeol)
4633a7c4 3508 sayNO;
5d9a96ca
DM
3509 if (st->ln > 1 && (OP(scan) == REF
3510 ? memNE(s, locinput, st->ln)
c8756f30 3511 : (OP(scan) == REFF
5d9a96ca
DM
3512 ? ibcmp(s, locinput, st->ln)
3513 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3514 sayNO;
5d9a96ca 3515 locinput += st->ln;
76e3520e 3516 nextchr = UCHARAT(locinput);
a0d0e21e 3517 break;
95b24440 3518 }
a0d0e21e
LW
3519
3520 case NOTHING:
c277df42 3521 case TAIL:
a0d0e21e
LW
3522 break;
3523 case BACK:
3524 break;
40a82448
DM
3525
3526#undef ST
3527#define ST st->u.eval
3528
3529 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3530 {
c277df42 3531 SV *ret;
8e5e9ebe 3532 {
4aabdb9b
DM
3533 /* execute the code in the {...} */
3534 dSP;
6136c704 3535 SV ** const before = SP;
4aabdb9b
DM
3536 OP_4tree * const oop = PL_op;
3537 COP * const ocurcop = PL_curcop;
3538 PAD *old_comppad;
4aabdb9b
DM
3539
3540 n = ARG(scan);
32fc9b6a 3541 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3542 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3543 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3544 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3545
8e5e9ebe
RGS
3546 CALLRUNOPS(aTHX); /* Scalar context. */
3547 SPAGAIN;
3548 if (SP == before)
075aa684 3549 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3550 else {
3551 ret = POPs;
3552 PUTBACK;
3553 }
4aabdb9b
DM
3554
3555 PL_op = oop;
3556 PAD_RESTORE_LOCAL(old_comppad);
3557 PL_curcop = ocurcop;
3558 if (!st->logical) {
3559 /* /(?{...})/ */
3560 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3561 break;
3562 }
8e5e9ebe 3563 }
4aabdb9b
DM
3564 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3565 regexp *re;
4aabdb9b 3566 {
4f639d21
DM
3567 /* extract RE object from returned value; compiling if
3568 * necessary */
3569
6136c704 3570 MAGIC *mg = NULL;
be8e71aa 3571 const SV *sv;
faf82a0b
AE
3572 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3573 mg = mg_find(sv, PERL_MAGIC_qr);
3574 else if (SvSMAGICAL(ret)) {
3575 if (SvGMAGICAL(ret))
3576 sv_unmagic(ret, PERL_MAGIC_qr);
3577 else
3578 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3579 }
faf82a0b 3580
0f5d15d6
IZ
3581 if (mg) {
3582 re = (regexp *)mg->mg_obj;
df0003d4 3583 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3584 }
3585 else {
3586 STRLEN len;
6136c704 3587 const char * const t = SvPV_const(ret, len);
0f5d15d6 3588 PMOP pm;
a3b680e6 3589 const I32 osize = PL_regsize;
0f5d15d6 3590
5fcd1c1b 3591 Zero(&pm, 1, PMOP);
4aabdb9b 3592 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3593 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3594 if (!(SvFLAGS(ret)
faf82a0b
AE
3595 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3596 | SVs_GMG)))
14befaf4
DM
3597 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3598 PERL_MAGIC_qr,0,0);
0f5d15d6 3599 PL_regsize = osize;
0f5d15d6 3600 }
4aabdb9b 3601 }
aa283a38
DM
3602
3603 /* run the pattern returned from (??{...}) */
3604
4aabdb9b
DM
3605 DEBUG_EXECUTE_r(
3606 PerlIO_printf(Perl_debug_log,
3607 "Entering embedded \"%s%.60s%s%s\"\n",
3608 PL_colors[0],
3609 re->precomp,
3610 PL_colors[1],
3611 (strlen(re->precomp) > 60 ? "..." : ""))
3612 );
2c2d71f5 3613
40a82448
DM
3614 ST.cp = regcppush(0); /* Save *all* the positions. */
3615 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3616 *PL_reglastparen = 0;
3617 *PL_reglastcloseparen = 0;
4aabdb9b 3618 PL_reginput = locinput;
4aabdb9b
DM
3619
3620 /* XXXX This is too dramatic a measure... */
3621 PL_reg_maxiter = 0;
3622
5d9a96ca 3623 st->logical = 0;
40a82448 3624 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3625 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3626 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3627 ST.prev_rex = rex;
aa283a38 3628 rex = re;
aa283a38 3629
40a82448 3630 ST.B = next;
aa283a38 3631 /* now continue from first node in postoned RE */
40a82448 3632 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3633 /* NOTREACHED */
a0ed51b3 3634 }
4aabdb9b
DM
3635 /* /(?(?{...})X|Y)/ */
3636 st->sw = SvTRUE(ret);
3637 st->logical = 0;
c277df42
IZ
3638 break;
3639 }
40a82448
DM
3640
3641 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3642 if (ST.toggleutf)
3643 PL_reg_flags ^= RF_utf8;
3644 ReREFCNT_dec(rex);
3645 rex = ST.prev_rex;
3646 /* XXXX This is too dramatic a measure... */
3647 PL_reg_maxiter = 0;
3648 /* Restore parens of the caller without popping the
3649 * savestack */
3650 {
3651 const I32 tmp = PL_savestack_ix;
3652 PL_savestack_ix = ST.lastcp;
3653 regcppop(rex);
3654 PL_savestack_ix = tmp;
3655 }
3656 PL_reginput = locinput;
3657 /* continue at the node following the (??{...}) */
3658 scan = ST.B;
3659 continue;
3660
3661 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3662 /* Restore state to the outer re then re-throw the failure */
3663 if (ST.toggleutf)
3664 PL_reg_flags ^= RF_utf8;
3665 ReREFCNT_dec(rex);
3666 rex = ST.prev_rex;
3667
3668 /* XXXX This is too dramatic a measure... */
3669 PL_reg_maxiter = 0;
3670
3671 PL_reginput = locinput;
3672 REGCP_UNWIND(ST.lastcp);
3673 regcppop(rex);
3674 sayNO_SILENT;
3675
3676#undef ST
3677
a0d0e21e 3678 case OPEN:
c277df42 3679 n = ARG(scan); /* which paren pair */
3280af22
NIS
3680 PL_reg_start_tmp[n] = locinput;
3681 if (n > PL_regsize)
3682 PL_regsize = n;
a0d0e21e
LW
3683 break;
3684 case CLOSE:
c277df42 3685 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3686 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3687 PL_regendp[n] = locinput - PL_bostr;
eb160463 3688 if (n > (I32)*PL_reglastparen)
3280af22 3689 *PL_reglastparen = n;
a01268b5 3690 *PL_reglastcloseparen = n;
a0d0e21e 3691 break;
c277df42
IZ
3692 case GROUPP:
3693 n = ARG(scan); /* which paren pair */
5d9a96ca 3694 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3695 break;
3696 case IFTHEN:
2c2d71f5 3697 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3698 if (st->sw)
c277df42
IZ
3699 next = NEXTOPER(NEXTOPER(scan));
3700 else {
3701 next = scan + ARG(scan);
3702 if (OP(next) == IFTHEN) /* Fake one. */
3703 next = NEXTOPER(NEXTOPER(next));
3704 }
3705 break;
3706 case LOGICAL:
5d9a96ca 3707 st->logical = scan->flags;
c277df42 3708 break;
2ab05381 3709/*******************************************************************
a0374537
DM
3710 cc points to the regmatch_state associated with the most recent CURLYX.
3711 This struct contains info about the innermost (...)* loop (an
3712 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3713
3714 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3715
95b24440 3716 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3717
a0374537 3718 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3719 with the starting point at WHILEM node;
2ab05381
IZ
3720
3721 3) Each hit of WHILEM node tries to match A and Z (in the order
3722 depending on the current iteration, min/max of {min,max} and
3723 greediness). The information about where are nodes for "A"
a0374537 3724 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3725 was already matched, and greediness.
3726
3727 4) After A matches, the same WHILEM node is hit again.
3728
95b24440 3729 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3730 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3731 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3732 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3733 of the external loop.
3734
a0374537 3735 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3736 and whatever it mentions via ->next, and additional attached trees
3737 corresponding to temporarily unset infoblocks as in "5" above.
3738
95b24440 3739 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3740 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3741 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3742 infoblocks are drawn below the "reset" infoblock.
3743
3744 In fact in the picture below we do not show failed matches for Z and T
3745 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3746 more obvious *why* one needs to *temporary* unset infoblocks.]
3747
3748 Matched REx position InfoBlocks Comment
3749 (Y(A)*?Z)*?T x
3750 Y(A)*?Z)*?T x <- O
3751 Y (A)*?Z)*?T x <- O
3752 Y A)*?Z)*?T x <- O <- I
3753 YA )*?Z)*?T x <- O <- I
3754 YA A)*?Z)*?T x <- O <- I
3755 YAA )*?Z)*?T x <- O <- I
3756 YAA Z)*?T x <- O # Temporary unset I
3757 I
3758
3759 YAAZ Y(A)*?Z)*?T x <- O
3760 I
3761
3762 YAAZY (A)*?Z)*?T x <- O
3763 I
3764
3765 YAAZY A)*?Z)*?T x <- O <- I
3766 I
3767
3768 YAAZYA )*?Z)*?T x <- O <- I
3769 I
3770
3771 YAAZYA Z)*?T x <- O # Temporary unset I
3772 I,I
3773
3774 YAAZYAZ )*?T x <- O
3775 I,I
3776
3777 YAAZYAZ T x # Temporary unset O
3778 O
3779 I,I
3780
3781 YAAZYAZT x
3782 O
3783 I,I
3784 *******************************************************************/
95b24440 3785
a0d0e21e 3786 case CURLYX: {
cb434fcc
IZ
3787 /* No need to save/restore up to this paren */
3788 I32 parenfloor = scan->flags;
c277df42 3789
c2b7afd3
NC
3790 /* Dave says:
3791
3792 CURLYX and WHILEM are always paired: they're the moral
3793 equivalent of pp_enteriter anbd pp_iter.
3794
3795 The only time next could be null is if the node tree is
3796 corrupt. This was mentioned on p5p a few days ago.
3797
3798 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3799 So we'll assert that this is true:
3800 */
3801 assert(next);
30b2893d 3802 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3803 next += ARG(next);
cb434fcc
IZ
3804 /* XXXX Probably it is better to teach regpush to support
3805 parenfloor > PL_regsize... */
eb160463 3806 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3807 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3808
d8319b27
DM
3809 st->u.curlyx.cp = PL_savestack_ix;
3810 st->u.curlyx.outercc = st->cc;
a0374537
DM
3811 st->cc = st;
3812 /* these fields contain the state of the current curly.
3813 * they are accessed by subsequent WHILEMs;
3814 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3815 st->u.curlyx.parenfloor = parenfloor;
3816 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3817 st->u.curlyx.min = ARG1(scan);
3818 st->u.curlyx.max = ARG2(scan);
3819 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3820 st->u.curlyx.lastloc = 0;
a0374537
DM
3821 /* st->next and st->minmod are also read by WHILEM */
3822
3280af22 3823 PL_reginput = locinput;
95b24440
DM
3824 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3825 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3826 regcpblow(st->u.curlyx.cp);
3827 st->cc = st->u.curlyx.outercc;
95b24440 3828 saySAME(result);
a0d0e21e 3829 }
5f66b61c 3830 /* NOTREACHED */
a0d0e21e
LW
3831 case WHILEM: {
3832 /*
3833 * This is really hard to understand, because after we match
3834 * what we're trying to match, we must make sure the rest of
2c2d71f5 3835 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3836 * to go back UP the parse tree by recursing ever deeper. And
3837 * if it fails, we have to reset our parent's current state
3838 * that we can try again after backing off.
3839 */
3840
c2b7afd3
NC
3841 /* Dave says:
3842
3843 st->cc gets initialised by CURLYX ready for use by WHILEM.
3844 So again, unless somethings been corrupted, st->cc cannot
3845 be null at that point in WHILEM.
3846
3847 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3848 So we'll assert that this is true:
3849 */
3850 assert(st->cc);
d8319b27
DM
3851 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3852 st->u.whilem.cache_offset = 0;
3853 st->u.whilem.cache_bit = 0;
c277df42 3854
d8319b27 3855 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3856 PL_reginput = locinput;
a0d0e21e 3857
a3621e74 3858 DEBUG_EXECUTE_r(
9041c2e3 3859 PerlIO_printf(Perl_debug_log,
91f3b821 3860 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3861 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3862 (long)n, (long)st->cc->u.curlyx.min,
3863 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3864 );
4633a7c4 3865
a0d0e21e
LW
3866 /* If degenerate scan matches "", assume scan done. */
3867
d8319b27
DM
3868 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3869 st->u.whilem.savecc = st->cc;
3870 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3871 if (st->cc)
d8319b27 3872 st->ln = st->cc->u.curlyx.cur;
a3621e74 3873 DEBUG_EXECUTE_r(
c3464db5
DD
3874 PerlIO_printf(Perl_debug_log,
3875 "%*s empty match detected, try continuation...\n",
3280af22 3876 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3877 );
d8319b27 3878 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3879 /*** all unsaved local vars undefined at this point */
d8319b27 3880 st->cc = st->u.whilem.savecc;
95b24440 3881 if (result)
4633a7c4 3882 sayYES;
d8319b27
DM
3883 if (st->cc->u.curlyx.outercc)
3884 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3885 sayNO;
a0d0e21e
LW
3886 }
3887
3888 /* First just match a string of min scans. */
3889
d8319b27
DM
3890 if (n < st->cc->u.curlyx.min) {
3891 st->cc->u.curlyx.cur = n;
3892 st->cc->u.curlyx.lastloc = locinput;
3893 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3894 /*** all unsaved local vars undefined at this point */
3895 if (result)
4633a7c4 3896 sayYES;
d8319b27
DM
3897 st->cc->u.curlyx.cur = n - 1;
3898 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3899 sayNO;
a0d0e21e
LW
3900 }
3901
2c2d71f5
JH
3902 if (scan->flags) {
3903 /* Check whether we already were at this position.
3904 Postpone detection until we know the match is not
3905 *that* much linear. */
3906 if (!PL_reg_maxiter) {
3907 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3908 /* possible overflow for long strings and many CURLYX's */
3909 if (PL_reg_maxiter < 0)
3910 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3911 PL_reg_leftiter = PL_reg_maxiter;
3912 }
3913 if (PL_reg_leftiter-- == 0) {
a3b680e6 3914 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3915 if (PL_reg_poscache) {
eb160463 3916 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3917 Renew(PL_reg_poscache, size, char);
3918 PL_reg_poscache_size = size;
3919 }
3920 Zero(PL_reg_poscache, size, char);
3921 }
3922 else {
3923 PL_reg_poscache_size = size;
a02a5408 3924 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3925 }
a3621e74 3926 DEBUG_EXECUTE_r(
2c2d71f5
JH
3927 PerlIO_printf(Perl_debug_log,
3928 "%sDetected a super-linear match, switching on caching%s...\n",
3929 PL_colors[4], PL_colors[5])
3930 );
3931 }
3932 if (PL_reg_leftiter < 0) {
d8319b27 3933 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3934
d8319b27
DM
3935 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3936 + st->u.whilem.cache_offset * (scan->flags>>4);
3937 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3938 st->u.whilem.cache_offset /= 8;
3939 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3940 DEBUG_EXECUTE_r(
2c2d71f5
JH
3941 PerlIO_printf(Perl_debug_log,
3942 "%*s already tried at this position...\n",
3943 REPORT_CODE_OFF+PL_regindent*2, "")
3944 );
3ab3c9b4
HS
3945 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3946 /* cache records success */
c2b0868c
HS
3947 sayYES;
3948 else
3ab3c9b4 3949 /* cache records failure */
c2b0868c 3950 sayNO_SILENT;
2c2d71f5 3951 }
2c2d71f5
JH
3952 }
3953 }
3954
a0d0e21e
LW
3955 /* Prefer next over scan for minimal matching. */
3956
5d9a96ca 3957 if (st->cc->minmod) {
d8319b27
DM
3958 st->u.whilem.savecc = st->cc;
3959 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3960 if (st->cc)
d8319b27
DM
3961 st->ln = st->cc->u.curlyx.cur;
3962 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3963 REGCP_SET(st->u.whilem.lastcp);
3964 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3965 /*** all unsaved local vars undefined at this point */
d8319b27 3966 st->cc = st->u.whilem.savecc;
95b24440 3967 if (result) {
d8319b27 3968 regcpblow(st->u.whilem.cp);
3ab3c9b4 3969 CACHEsayYES; /* All done. */
5f05dabc 3970 }
d8319b27 3971 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3972 regcppop(rex);
d8319b27
DM
3973 if (st->cc->u.curlyx.outercc)
3974 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3975
d8319b27 3976 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3977 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3978 && !(PL_reg_flags & RF_warned)) {
3979 PL_reg_flags |= RF_warned;
9014280d 3980 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3981 "Complex regular subexpression recursion",
3982 REG_INFTY - 1);
c277df42 3983 }
3ab3c9b4 3984 CACHEsayNO;
c277df42 3985 }
a687059c 3986
a3621e74 3987 DEBUG_EXECUTE_r(
c3464db5
DD
3988 PerlIO_printf(Perl_debug_log,
3989 "%*s trying longer...\n",
3280af22 3990 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3991 );
a0d0e21e 3992 /* Try scanning more and see if it helps. */
3280af22 3993 PL_reginput = locinput;
d8319b27
DM
3994 st->cc->u.curlyx.cur = n;
3995 st->cc->u.curlyx.lastloc = locinput;
3996 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3997 REGCP_SET(st->u.whilem.lastcp);
3998 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3999 /*** all unsaved local vars undefined at this point */
4000 if (result) {
d8319b27 4001 regcpblow(st->u.whilem.cp);
3ab3c9b4 4002 CACHEsayYES;
5f05dabc 4003 }
d8319b27 4004 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 4005 regcppop(rex);
d8319b27
DM
4006 st->cc->u.curlyx.cur = n - 1;
4007 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4008 CACHEsayNO;
a0d0e21e
LW
4009 }
4010
4011 /* Prefer scan over next for maximal matching. */
4012
d8319b27
DM
4013 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
4014 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
4015 st->cc->u.curlyx.cur = n;
4016 st->cc->u.curlyx.lastloc = locinput;
4017 REGCP_SET(st->u.whilem.lastcp);
4018 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
4019 /*** all unsaved local vars undefined at this point */
4020 if (result) {
d8319b27 4021 regcpblow(st->u.whilem.cp);
3ab3c9b4 4022 CACHEsayYES;
5f05dabc 4023 }
d8319b27 4024 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 4025 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 4026 PL_reginput = locinput;
a3621e74 4027 DEBUG_EXECUTE_r(
c3464db5
DD
4028 PerlIO_printf(Perl_debug_log,
4029 "%*s failed, try continuation...\n",
3280af22 4030 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
4031 );
4032 }
9041c2e3 4033 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 4034 && !(PL_reg_flags & RF_warned)) {
3280af22 4035 PL_reg_flags |= RF_warned;
9014280d 4036 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
4037 "Complex regular subexpression recursion",
4038 REG_INFTY - 1);
a0d0e21e
LW
4039 }
4040
4041 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
4042 st->u.whilem.savecc = st->cc;
4043 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 4044 if (st->cc)
d8319b27
DM
4045 st->ln = st->cc->u.curlyx.cur;
4046 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 4047 /*** all unsaved local vars undefined at this point */
d8319b27 4048 st->cc = st->u.whilem.savecc;
95b24440 4049 if (result)
3ab3c9b4 4050 CACHEsayYES;
d8319b27
DM
4051 if (st->cc->u.curlyx.outercc)
4052 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4053 st->cc->u.curlyx.cur = n - 1;
4054 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4055 CACHEsayNO;
a0d0e21e 4056 }
5f66b61c 4057 /* NOTREACHED */
40a82448
DM
4058
4059#undef ST
4060#define ST st->u.branch
4061
4062 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4063 next = scan + ARG(scan);
4064 if (next == scan)
4065 next = NULL;
40a82448
DM
4066 scan = NEXTOPER(scan);
4067 /* FALL THROUGH */
c277df42 4068
40a82448
DM
4069 case BRANCH: /* /(...|A|...)/ */
4070 scan = NEXTOPER(scan); /* scan now points to inner node */
4071 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4072 /* last branch; skip state push and jump direct to node */
4073 continue;
4074 ST.lastparen = *PL_reglastparen;
4075 ST.next_branch = next;
4076 REGCP_SET(ST.cp);
4077 PL_reginput = locinput;
02db2b7b 4078
40a82448
DM
4079 /* Now go into the branch */
4080 PUSH_STATE_GOTO(BRANCH_next, scan);
4081 /* NOTREACHED */
4082
4083 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4084 REGCP_UNWIND(ST.cp);
4085 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4086 PL_regendp[n] = -1;
4087 *PL_reglastparen = n;
4088 scan = ST.next_branch;
4089 /* no more branches? */
4090 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4091 sayNO;
4092 continue; /* execute next BRANCH[J] op */
4093 /* NOTREACHED */
4094
a0d0e21e 4095 case MINMOD:
5d9a96ca 4096 st->minmod = 1;
a0d0e21e 4097 break;
40a82448
DM
4098
4099#undef ST
4100#define ST st->u.curlym
4101
4102 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4103
4104 /* This is an optimisation of CURLYX that enables us to push
4105 * only a single backtracking state, no matter now many matches
4106 * there are in {m,n}. It relies on the pattern being constant
4107 * length, with no parens to influence future backrefs
4108 */
4109
4110 ST.me = scan;
dc45a647 4111 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4112
4113 /* if paren positive, emulate an OPEN/CLOSE around A */
4114 if (ST.me->flags) {
4115 I32 paren = ST.me->flags;
4116 if (paren > PL_regsize)
4117 PL_regsize = paren;
4118 if (paren > (I32)*PL_reglastparen)
4119 *PL_reglastparen = paren;
c277df42 4120 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4121 }
40a82448
DM
4122 ST.A = scan;
4123 ST.B = next;
4124 ST.alen = 0;
4125 ST.count = 0;
4126 ST.minmod = st->minmod;
4127 st->minmod = 0;
4128 ST.c1 = CHRTEST_UNINIT;
4129 REGCP_SET(ST.cp);
6407bf3b 4130
40a82448
DM
4131 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4132 goto curlym_do_B;
4133
4134 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4135 PL_reginput = locinput;
40a82448
DM
4136 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4137 /* NOTREACHED */
5f80c4cf 4138
40a82448
DM
4139 case CURLYM_A: /* we've just matched an A */
4140 locinput = st->locinput;
4141 nextchr = UCHARAT(locinput);
4142
4143 ST.count++;
4144 /* after first match, determine A's length: u.curlym.alen */
4145 if (ST.count == 1) {
4146 if (PL_reg_match_utf8) {
4147 char *s = locinput;
4148 while (s < PL_reginput) {
4149 ST.alen++;
4150 s += UTF8SKIP(s);
4151 }
4152 }
4153 else {
4154 ST.alen = PL_reginput - locinput;
4155 }
4156 if (ST.alen == 0)
4157 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4158 }
0cadcf80
DM
4159 DEBUG_EXECUTE_r(
4160 PerlIO_printf(Perl_debug_log,
40a82448 4161 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
0cadcf80 4162 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
40a82448 4163 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4164 );
4165
40a82448
DM
4166 locinput = PL_reginput;
4167 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4168 goto curlym_do_A; /* try to match another A */
4169 goto curlym_do_B; /* try to match B */
5f80c4cf 4170
40a82448
DM
4171 case CURLYM_A_fail: /* just failed to match an A */
4172 REGCP_UNWIND(ST.cp);
4173 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4174 sayNO;
0cadcf80 4175
40a82448
DM
4176 curlym_do_B: /* execute the B in /A{m,n}B/ */
4177 PL_reginput = locinput;
4178 if (ST.c1 == CHRTEST_UNINIT) {
4179 /* calculate c1 and c2 for possible match of 1st char
4180 * following curly */
4181 ST.c1 = ST.c2 = CHRTEST_VOID;
4182 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4183 regnode *text_node = ST.B;
4184 if (! HAS_TEXT(text_node))
4185 FIND_NEXT_IMPT(text_node);
4186 if (HAS_TEXT(text_node)
4187 && PL_regkind[OP(text_node)] != REF)
4188 {
4189 ST.c1 = (U8)*STRING(text_node);
4190 ST.c2 =
4191 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4192 ? PL_fold[ST.c1]
4193 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4194 ? PL_fold_locale[ST.c1]
4195 : ST.c1;
c277df42 4196 }
c277df42 4197 }
40a82448
DM
4198 }
4199
4200 DEBUG_EXECUTE_r(
4201 PerlIO_printf(Perl_debug_log,
4202 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4203 (int)(REPORT_CODE_OFF+PL_regindent*2),
4204 "", (IV)ST.count)
4205 );
4206 if (ST.c1 != CHRTEST_VOID
4207 && UCHARAT(PL_reginput) != ST.c1
4208 && UCHARAT(PL_reginput) != ST.c2)
4209 {
4210 /* simulate B failing */
4211 state_num = CURLYM_B_fail;
4212 goto reenter_switch;
4213 }
4214
4215 if (ST.me->flags) {
4216 /* mark current A as captured */
4217 I32 paren = ST.me->flags;
4218 if (ST.count) {
4219 PL_regstartp[paren]
4220 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4221 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 4222 }
40a82448
DM
4223 else
4224 PL_regendp[paren] = -1;
c277df42 4225 }
40a82448 4226 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4227 /* NOTREACHED */
40a82448
DM
4228
4229 case CURLYM_B_fail: /* just failed to match a B */
4230 REGCP_UNWIND(ST.cp);
4231 if (ST.minmod) {
4232 if (ST.count == ARG2(ST.me) /* max */)
4233 sayNO;
4234 goto curlym_do_A; /* try to match a further A */
4235 }
4236 /* backtrack one A */
4237 if (ST.count == ARG1(ST.me) /* min */)
4238 sayNO;
4239 ST.count--;
4240 locinput = HOPc(locinput, -ST.alen);
4241 goto curlym_do_B; /* try to match B */
4242
4243
c277df42 4244 case CURLYN:
d8319b27
DM
4245 st->u.plus.paren = scan->flags; /* Which paren to set */
4246 if (st->u.plus.paren > PL_regsize)
4247 PL_regsize = st->u.plus.paren;
4248 if (st->u.plus.paren > (I32)*PL_reglastparen)
4249 *PL_reglastparen = st->u.plus.paren;
5d9a96ca 4250 st->ln = ARG1(scan); /* min to match */
c277df42 4251 n = ARG2(scan); /* max to match */
dc45a647 4252 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 4253 goto repeat;
a0d0e21e 4254 case CURLY:
d8319b27 4255 st->u.plus.paren = 0;
5d9a96ca 4256 st->ln = ARG1(scan); /* min to match */
a0d0e21e 4257 n = ARG2(scan); /* max to match */
dc45a647 4258 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
4259 goto repeat;
4260 case STAR:
5d9a96ca 4261 st->ln = 0;
c277df42 4262 n = REG_INFTY;
a0d0e21e 4263 scan = NEXTOPER(scan);
d8319b27 4264 st->u.plus.paren = 0;
a0d0e21e
LW
4265 goto repeat;
4266 case PLUS:
5d9a96ca 4267 st->ln = 1;
c277df42
IZ
4268 n = REG_INFTY;
4269 scan = NEXTOPER(scan);
d8319b27 4270 st->u.plus.paren = 0;
c277df42 4271 repeat:
a0d0e21e
LW
4272 /*
4273 * Lookahead to avoid useless match attempts
4274 * when we know what character comes next.
4275 */
5f80c4cf
JP
4276
4277 /*
4278 * Used to only do .*x and .*?x, but now it allows
4279 * for )'s, ('s and (?{ ... })'s to be in the way
4280 * of the quantifier and the EXACT-like node. -- japhy
4281 */
4282
cca55fe3 4283 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4284 U8 *s;
4285 regnode *text_node = next;
4286
3dab1dad
YO
4287 if (! HAS_TEXT(text_node))
4288 FIND_NEXT_IMPT(text_node);
5f80c4cf 4289
9e137952
DM
4290 if (! HAS_TEXT(text_node))
4291 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
5f80c4cf 4292 else {
3dab1dad 4293 if (PL_regkind[OP(text_node)] == REF) {
9e137952 4294 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
44a68960 4295 goto assume_ok_easy;
cca55fe3 4296 }
be8e71aa
YO
4297 else
4298 s = (U8*)STRING(text_node);
5f80c4cf
JP
4299
4300 if (!UTF) {
d8319b27 4301 st->u.plus.c2 = st->u.plus.c1 = *s;
f65d3ee7 4302 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 4303 st->u.plus.c2 = PL_fold[st->u.plus.c1];
f65d3ee7 4304 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 4305 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
1aa99e6b 4306 }
5f80c4cf 4307 else { /* UTF */
f65d3ee7 4308 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4309 STRLEN ulen1, ulen2;
89ebb4a3
JH
4310 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4311 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4312
4313 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4314 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4315
d8319b27 4316 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4317 uniflags);
d8319b27 4318 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4319 uniflags);
5f80c4cf
JP
4320 }
4321 else {
d8319b27 4322 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4323 uniflags);
5f80c4cf 4324 }
1aa99e6b
IH
4325 }
4326 }
bbce6d69 4327 }
a0d0e21e 4328 else
9e137952 4329 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
cca55fe3 4330 assume_ok_easy:
3280af22 4331 PL_reginput = locinput;
5d9a96ca
DM
4332 if (st->minmod) {
4333 st->minmod = 0;
32fc9b6a 4334 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4633a7c4 4335 sayNO;
a0ed51b3 4336 locinput = PL_reginput;
d8319b27 4337 REGCP_SET(st->u.plus.lastcp);
9e137952 4338 if (st->u.plus.c1 != CHRTEST_VOID) {
d8319b27
DM
4339 st->u.plus.old = locinput;
4340 st->u.plus.count = 0;
0fe9bf95 4341
1aa99e6b 4342 if (n == REG_INFTY) {
d8319b27 4343 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4344 if (do_utf8)
d8319b27
DM
4345 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4346 st->u.plus.e--;
1aa99e6b
IH
4347 }
4348 else if (do_utf8) {
5d9a96ca 4349 int m = n - st->ln;
d8319b27
DM
4350 for (st->u.plus.e = locinput;
4351 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4352 st->u.plus.e += UTF8SKIP(st->u.plus.e);
1aa99e6b
IH
4353 }
4354 else {
d8319b27
DM
4355 st->u.plus.e = locinput + n - st->ln;
4356 if (st->u.plus.e >= PL_regeol)
4357 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4358 }
0fe9bf95
IZ
4359 while (1) {
4360 /* Find place 'next' could work */
1aa99e6b 4361 if (!do_utf8) {
d8319b27
DM
4362 if (st->u.plus.c1 == st->u.plus.c2) {
4363 while (locinput <= st->u.plus.e &&
4364 UCHARAT(locinput) != st->u.plus.c1)
1aa99e6b
IH
4365 locinput++;
4366 } else {
d8319b27
DM
4367 while (locinput <= st->u.plus.e
4368 && UCHARAT(locinput) != st->u.plus.c1
4369 && UCHARAT(locinput) != st->u.plus.c2)
1aa99e6b
IH
4370 locinput++;
4371 }
d8319b27 4372 st->u.plus.count = locinput - st->u.plus.old;
1aa99e6b
IH
4373 }
4374 else {
d8319b27 4375 if (st->u.plus.c1 == st->u.plus.c2) {
a3b680e6 4376 STRLEN len;
872c91ae
JH
4377 /* count initialised to
4378 * utf8_distance(old, locinput) */
d8319b27 4379 while (locinput <= st->u.plus.e &&
872c91ae 4380 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4381 UTF8_MAXBYTES, &len,
d8319b27 4382 uniflags) != (UV)st->u.plus.c1) {
1aa99e6b 4383 locinput += len;
d8319b27 4384 st->u.plus.count++;
b2f2f093 4385 }
1aa99e6b 4386 } else {
872c91ae
JH
4387 /* count initialised to
4388 * utf8_distance(old, locinput) */
d8319b27 4389 while (locinput <= st->u.plus.e) {
c4fd8992
AL
4390 STRLEN len;
4391 const UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4392 UTF8_MAXBYTES, &len,
041457d9 4393 uniflags);
d8319b27 4394 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
1aa99e6b 4395 break;
b2f2f093 4396 locinput += len;
d8319b27 4397 st->u.plus.count++;
1aa99e6b
IH
4398 }
4399 }
0fe9bf95 4400 }
d8319b27 4401 if (locinput > st->u.plus.e)
0fe9bf95
IZ
4402 sayNO;
4403 /* PL_reginput == old now */
d8319b27 4404 if (locinput != st->u.plus.old) {
5d9a96ca 4405 st->ln = 1; /* Did some */
32fc9b6a 4406 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
0fe9bf95
IZ
4407 sayNO;
4408 }
4409 /* PL_reginput == locinput now */
2afa90c7 4410 PL_reginput = locinput; /* Could be reset... */
d8319b27 4411 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
95b24440 4412 /*** all unsaved local vars undefined at this point */
2afa90c7 4413
d8319b27 4414 REGCP_UNWIND(st->u.plus.lastcp);
0fe9bf95 4415 /* Couldn't or didn't -- move forward. */
d8319b27 4416 st->u.plus.old = locinput;
1aa99e6b
IH
4417 if (do_utf8)
4418 locinput += UTF8SKIP(locinput);
4419 else
4420 locinput++;
d8319b27 4421 st->u.plus.count = 1;
0fe9bf95
IZ
4422 }
4423 }
4424 else
5d9a96ca 4425 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
1aa99e6b 4426 UV c;
9e137952 4427 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4428 if (do_utf8)
872c91ae 4429 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4430 UTF8_MAXBYTES, 0,
041457d9 4431 uniflags);
1aa99e6b 4432 else
9041c2e3 4433 c = UCHARAT(PL_reginput);
2390ecbc 4434 /* If it could work, try it. */
be8e71aa 4435 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
d8319b27 4436 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
95b24440 4437 /*** all unsaved local vars undefined at this point */
d8319b27 4438 REGCP_UNWIND(st->u.plus.lastcp);
2390ecbc 4439 }
1aa99e6b 4440 }
a0d0e21e 4441 /* If it could work, try it. */
be8e71aa 4442 else if (st->u.plus.c1 == CHRTEST_VOID) {
d8319b27 4443 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
95b24440 4444 /*** all unsaved local vars undefined at this point */
d8319b27 4445 REGCP_UNWIND(st->u.plus.lastcp);
bbce6d69 4446 }
c277df42 4447 /* Couldn't or didn't -- move forward. */
a0ed51b3 4448 PL_reginput = locinput;
32fc9b6a 4449 if (regrepeat(rex, scan, 1)) {
5d9a96ca 4450 st->ln++;
a0ed51b3
LW
4451 locinput = PL_reginput;
4452 }
4453 else
4633a7c4 4454 sayNO;
a0d0e21e
LW
4455 }
4456 }
4457 else {
32fc9b6a 4458 n = regrepeat(rex, scan, n);
a0ed51b3 4459 locinput = PL_reginput;
3dab1dad
YO
4460 if ((st->ln < n) && (PL_regkind[OP(next)] == EOL) &&
4461 (OP(next) != MEOL || OP(next) == SEOL || OP(next) == EOS))
15272685 4462 {
5d9a96ca 4463 st->ln = n; /* why back off? */
1aeab75a
GS
4464 /* ...because $ and \Z can match before *and* after
4465 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4466 We should back off by one in this case. */
4467 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
5d9a96ca 4468 st->ln--;
1aeab75a 4469 }
d8319b27 4470 REGCP_SET(st->u.plus.lastcp);
1d5c262f 4471 {
5d9a96ca 4472 while (n >= st->ln) {
be8e71aa 4473 UV c = 0;
9e137952 4474 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4475 if (do_utf8)
872c91ae 4476 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4477 UTF8_MAXBYTES, 0,
041457d9 4478 uniflags);
1aa99e6b 4479 else
9041c2e3 4480 c = UCHARAT(PL_reginput);
1aa99e6b 4481 }
c277df42 4482 /* If it could work, try it. */
be8e71aa
YO
4483 if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
4484 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
4485 /*** all unsaved local vars undefined at this point */
4486 REGCP_UNWIND(st->u.plus.lastcp);
4487 }
c277df42
IZ
4488 /* Couldn't or didn't -- back up. */
4489 n--;
dfe13c55 4490 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4491 }
a0d0e21e
LW
4492 }
4493 }
4633a7c4 4494 sayNO;
c277df42 4495 break;
a0d0e21e 4496 case END:
3b0527fe 4497 if (locinput < reginfo->till) {
a3621e74 4498 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4499 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4500 PL_colors[4],
4501 (long)(locinput - PL_reg_starttry),
3b0527fe 4502 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4503 PL_colors[5]));
4504 sayNO_FINAL; /* Cannot match: too short. */
4505 }
4506 PL_reginput = locinput; /* put where regtry can find it */
4507 sayYES_FINAL; /* Success! */
dad79028
DM
4508
4509 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4510 DEBUG_EXECUTE_r(
4511 PerlIO_printf(Perl_debug_log,
4512 "%*s %ssubpattern success...%s\n",
4513 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4514 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4515 sayYES_FINAL; /* Success! */
4516
40a82448
DM
4517#undef ST
4518#define ST st->u.ifmatch
4519
4520 case SUSPEND: /* (?>A) */
4521 ST.wanted = 1;
9fe1d20c 4522 PL_reginput = locinput;
9041c2e3 4523 goto do_ifmatch;
dad79028 4524
40a82448
DM
4525 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4526 ST.wanted = 0;
dad79028
DM
4527 goto ifmatch_trivial_fail_test;
4528
40a82448
DM
4529 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4530 ST.wanted = 1;
dad79028 4531 ifmatch_trivial_fail_test:
a0ed51b3 4532 if (scan->flags) {
52657f30 4533 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4534 if (!s) {
4535 /* trivial fail */
4536 if (st->logical) {
4537 st->logical = 0;
40a82448 4538 st->sw = 1 - (bool)ST.wanted;
dad79028 4539 }
40a82448 4540 else if (ST.wanted)
dad79028
DM
4541 sayNO;
4542 next = scan + ARG(scan);
4543 if (next == scan)
4544 next = NULL;
4545 break;
4546 }
efb30f32 4547 PL_reginput = s;
a0ed51b3
LW
4548 }
4549 else
4550 PL_reginput = locinput;
4551
c277df42 4552 do_ifmatch:
40a82448
DM
4553 ST.me = scan;
4554 /* execute body of (?...A) */
4555 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4556 /* NOTREACHED */
4557
4558 case IFMATCH_A_fail: /* body of (?...A) failed */
4559 ST.wanted = !ST.wanted;
4560 /* FALL THROUGH */
4561
4562 case IFMATCH_A: /* body of (?...A) succeeded */
4563 if (st->logical) {
4564 st->logical = 0;
4565 st->sw = (bool)ST.wanted;
4566 }
4567 else if (!ST.wanted)
4568 sayNO;
4569
4570 if (OP(ST.me) == SUSPEND)
4571 locinput = PL_reginput;
4572 else {
4573 locinput = PL_reginput = st->locinput;
4574 nextchr = UCHARAT(locinput);
4575 }
4576 scan = ST.me + ARG(ST.me);
4577 if (scan == ST.me)
4578 scan = NULL;
4579 continue; /* execute B */
4580
4581#undef ST
dad79028 4582
c277df42 4583 case LONGJMP:
c277df42
IZ
4584 next = scan + ARG(scan);
4585 if (next == scan)
4586 next = NULL;
a0d0e21e
LW
4587 break;
4588 default:
b900a521 4589 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4590 PTR2UV(scan), OP(scan));
cea2e8a9 4591 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4592 }
95b24440 4593
a0d0e21e 4594 scan = next;
95b24440
DM
4595 continue;
4596 /* NOTREACHED */
4597
40a82448
DM
4598 push_yes_state:
4599 /* push a state that backtracks on success */
4600 st->u.yes.prev_yes_state = yes_state;
4601 yes_state = st;
4602 /* FALL THROUGH */
4603 push_state:
4604 /* push a new regex state, then continue at scan */
4605 {
4606 regmatch_state *newst;
4607
4608 depth++;
4609 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4610 "PUSH STATE(%d)\n", depth));
4611 st->locinput = locinput;
4612 newst = st+1;
4613 if (newst > SLAB_LAST(PL_regmatch_slab))
4614 newst = S_push_slab(aTHX);
4615 PL_regmatch_state = newst;
4616 newst->cc = st->cc;
4617 /* XXX probably don't need to initialise these */
4618 newst->minmod = 0;
4619 newst->sw = 0;
4620 newst->logical = 0;
4621
4622 locinput = PL_reginput;
4623 nextchr = UCHARAT(locinput);
4624 st = newst;
4625 continue;
4626 /* NOTREACHED */
4627 }
4628
95b24440
DM
4629 /* simulate recursively calling regmatch(), but without actually
4630 * recursing - ie save the current state on the heap rather than on
4631 * the stack, then re-enter the loop. This avoids complex regexes
4632 * blowing the processor stack */
4633
4634 start_recurse:
4635 {
5d9a96ca
DM
4636 /* push new state */
4637 regmatch_state *oldst = st;
4638
4639 depth++;
40a82448 4640 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4641
4642 /* grab the next free state slot */
4643 st++;
86545054 4644 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4645 st = S_push_slab(aTHX);
4646 PL_regmatch_state = st;
4647
4648 oldst->next = next;
4649 oldst->n = n;
4650 oldst->locinput = locinput;
5d9a96ca
DM
4651
4652 st->cc = oldst->cc;
95b24440
DM
4653 locinput = PL_reginput;
4654 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4655 st->minmod = 0;
4656 st->sw = 0;
4657 st->logical = 0;
95b24440
DM
4658#ifdef DEBUGGING
4659 PL_regindent++;
4660#endif
4661 }
a0d0e21e 4662 }
a687059c 4663
aa283a38
DM
4664
4665
a0d0e21e
LW
4666 /*
4667 * We get here only if there's trouble -- normally "case END" is
4668 * the terminating point.
4669 */
cea2e8a9 4670 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4671 /*NOTREACHED*/
4633a7c4
LW
4672 sayNO;
4673
7821416a 4674yes_final:
77cb431f
DM
4675
4676 if (yes_state) {
4677 /* we have successfully completed a subexpression, but we must now
4678 * pop to the state marked by yes_state and continue from there */
4679
4680 assert(st != yes_state);
4681 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4682 || yes_state > SLAB_LAST(PL_regmatch_slab))
4683 {
4684 /* not in this slab, pop slab */
4685 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4686 PL_regmatch_slab = PL_regmatch_slab->prev;
4687 st = SLAB_LAST(PL_regmatch_slab);
4688 }
4689 depth -= (st - yes_state);
40a82448
DM
4690 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4691 depth+1, depth+(st - yes_state)));
77cb431f
DM
4692 st = yes_state;
4693 yes_state = st->u.yes.prev_yes_state;
4694 PL_regmatch_state = st;
4695
4696 switch (st->resume_state) {
40a82448
DM
4697 case IFMATCH_A:
4698 case CURLYM_A:
40a82448
DM
4699 case EVAL_A:
4700 state_num = st->resume_state;
4701 goto reenter_switch;
4702
4703 case CURLYM_B:
166ba7cd
DM
4704 case BRANCH_next:
4705 case TRIE_next:
77cb431f 4706 default:
40a82448 4707 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4708 }
4709 }
4710
a3621e74 4711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4712 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4713yes:
4714#ifdef DEBUGGING
3280af22 4715 PL_regindent--;
4633a7c4 4716#endif
02db2b7b 4717
95b24440 4718 result = 1;
aa283a38 4719 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4720 * will disappear when REGFMATCH goes */
aa283a38
DM
4721 if (depth) {
4722 /* restore previous state and re-enter */
40a82448
DM
4723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4724 depth--;
4725 st--;
4726 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4727 PL_regmatch_slab = PL_regmatch_slab->prev;
4728 st = SLAB_LAST(PL_regmatch_slab);
4729 }
4730 PL_regmatch_state = st;
4731 scan = st->scan;
4732 next = st->next;
4733 n = st->n;
4734 locinput= st->locinput;
4735 nextchr = UCHARAT(locinput);
aa283a38
DM
4736
4737 switch (st->resume_state) {
aa283a38
DM
4738 case resume_CURLYX:
4739 goto resume_point_CURLYX;
4740 case resume_WHILEM1:
4741 goto resume_point_WHILEM1;
4742 case resume_WHILEM2:
4743 goto resume_point_WHILEM2;
4744 case resume_WHILEM3:
4745 goto resume_point_WHILEM3;
4746 case resume_WHILEM4:
4747 goto resume_point_WHILEM4;
4748 case resume_WHILEM5:
4749 goto resume_point_WHILEM5;
4750 case resume_WHILEM6:
4751 goto resume_point_WHILEM6;
aa283a38
DM
4752 case resume_PLUS1:
4753 goto resume_point_PLUS1;
4754 case resume_PLUS2:
4755 goto resume_point_PLUS2;
4756 case resume_PLUS3:
4757 goto resume_point_PLUS3;
4758 case resume_PLUS4:
4759 goto resume_point_PLUS4;
77cb431f 4760
166ba7cd 4761 case TRIE_next:
40a82448
DM
4762 case CURLYM_A:
4763 case CURLYM_B:
4764 case EVAL_A:
4765 case IFMATCH_A:
4766 case BRANCH_next:
4767 break;
4768
aa283a38
DM
4769 default:
4770 Perl_croak(aTHX_ "regexp resume memory corruption");
4771 }
4772 }
4773 goto final_exit;
4633a7c4
LW
4774
4775no:
a3621e74 4776 DEBUG_EXECUTE_r(
7821416a
IZ
4777 PerlIO_printf(Perl_debug_log,
4778 "%*s %sfailed...%s\n",
e4584336 4779 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4780 );
7821416a
IZ
4781no_final:
4782do_no:
aa283a38 4783
4633a7c4 4784#ifdef DEBUGGING
3280af22 4785 PL_regindent--;
4633a7c4 4786#endif
95b24440 4787 result = 0;
5d9a96ca 4788
aa283a38
DM
4789 if (depth) {
4790 /* there's a previous state to backtrack to */
40a82448
DM
4791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4792 depth--;
4793 st--;
4794 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4795 PL_regmatch_slab = PL_regmatch_slab->prev;
4796 st = SLAB_LAST(PL_regmatch_slab);
4797 }
4798 PL_regmatch_state = st;
4799 scan = st->scan;
4800 next = st->next;
4801 n = st->n;
4802 locinput= st->locinput;
4803 nextchr = UCHARAT(locinput);
4804
5d9a96ca 4805 switch (st->resume_state) {
95b24440
DM
4806 case resume_CURLYX:
4807 goto resume_point_CURLYX;
4808 case resume_WHILEM1:
4809 goto resume_point_WHILEM1;
4810 case resume_WHILEM2:
4811 goto resume_point_WHILEM2;
4812 case resume_WHILEM3:
4813 goto resume_point_WHILEM3;
4814 case resume_WHILEM4:
4815 goto resume_point_WHILEM4;
4816 case resume_WHILEM5:
4817 goto resume_point_WHILEM5;
4818 case resume_WHILEM6:
4819 goto resume_point_WHILEM6;
dad79028 4820
166ba7cd 4821 case TRIE_next:
40a82448
DM
4822 case EVAL_A:
4823 case BRANCH_next:
4824 case CURLYM_A:
4825 case CURLYM_B:
4826 case IFMATCH_A:
4827 if (yes_state == st)
4828 yes_state = st->u.yes.prev_yes_state;
4829 state_num = st->resume_state + 1; /* failure = success + 1 */
4830 goto reenter_switch;
dad79028 4831
95b24440
DM
4832 case resume_PLUS1:
4833 goto resume_point_PLUS1;
4834 case resume_PLUS2:
4835 goto resume_point_PLUS2;
4836 case resume_PLUS3:
4837 goto resume_point_PLUS3;
4838 case resume_PLUS4:
4839 goto resume_point_PLUS4;
95b24440
DM
4840 default:
4841 Perl_croak(aTHX_ "regexp resume memory corruption");
4842 }
95b24440 4843 }
aa283a38
DM
4844
4845final_exit:
4846
5d9a96ca
DM
4847 /* restore original high-water mark */
4848 PL_regmatch_slab = orig_slab;
4849 PL_regmatch_state = orig_state;
4850
4851 /* free all slabs above current one */
4852 if (orig_slab->next) {
c4fd8992 4853 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4854 orig_slab->next = NULL;
4855 while (sl) {
c4fd8992 4856 regmatch_slab * const osl = sl;
5d9a96ca 4857 sl = sl->next;
ad65c075 4858 Safefree(osl);
5d9a96ca
DM
4859 }
4860 }
4861
95b24440
DM
4862 return result;
4863
a687059c
LW
4864}
4865
4866/*
4867 - regrepeat - repeatedly match something simple, report how many
4868 */
4869/*
4870 * [This routine now assumes that it will only match on things of length 1.
4871 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4872 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4873 */
76e3520e 4874STATIC I32
32fc9b6a 4875S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4876{
27da23d5 4877 dVAR;
a0d0e21e 4878 register char *scan;
a0d0e21e 4879 register I32 c;
3280af22 4880 register char *loceol = PL_regeol;
a0ed51b3 4881 register I32 hardcount = 0;
53c4c00c 4882 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4883
3280af22 4884 scan = PL_reginput;
faf11cac
HS
4885 if (max == REG_INFTY)
4886 max = I32_MAX;
4887 else if (max < loceol - scan)
7f596f4c 4888 loceol = scan + max;
a0d0e21e 4889 switch (OP(p)) {
22c35a8c 4890 case REG_ANY:
1aa99e6b 4891 if (do_utf8) {
ffc61ed2 4892 loceol = PL_regeol;
1aa99e6b 4893 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4894 scan += UTF8SKIP(scan);
4895 hardcount++;
4896 }
4897 } else {
4898 while (scan < loceol && *scan != '\n')
4899 scan++;
a0ed51b3
LW
4900 }
4901 break;
ffc61ed2 4902 case SANY:
def8e4ea
JH
4903 if (do_utf8) {
4904 loceol = PL_regeol;
a0804c9e 4905 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4906 scan += UTF8SKIP(scan);
4907 hardcount++;
4908 }
4909 }
4910 else
4911 scan = loceol;
a0ed51b3 4912 break;
f33976b4
DB
4913 case CANY:
4914 scan = loceol;
4915 break;
090f7165
JH
4916 case EXACT: /* length of string is 1 */
4917 c = (U8)*STRING(p);
4918 while (scan < loceol && UCHARAT(scan) == c)
4919 scan++;
bbce6d69 4920 break;
4921 case EXACTF: /* length of string is 1 */
cd439c50 4922 c = (U8)*STRING(p);
bbce6d69 4923 while (scan < loceol &&
22c35a8c 4924 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4925 scan++;
4926 break;
4927 case EXACTFL: /* length of string is 1 */
3280af22 4928 PL_reg_flags |= RF_tainted;
cd439c50 4929 c = (U8)*STRING(p);
bbce6d69 4930 while (scan < loceol &&
22c35a8c 4931 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4932 scan++;
4933 break;
4934 case ANYOF:
ffc61ed2
JH
4935 if (do_utf8) {
4936 loceol = PL_regeol;
cfc92286 4937 while (hardcount < max && scan < loceol &&
32fc9b6a 4938 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4939 scan += UTF8SKIP(scan);
4940 hardcount++;
4941 }
4942 } else {
32fc9b6a 4943 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4944 scan++;
4945 }
a0d0e21e
LW
4946 break;
4947 case ALNUM:
1aa99e6b 4948 if (do_utf8) {
ffc61ed2 4949 loceol = PL_regeol;
1a4fad37 4950 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4951 while (hardcount < max && scan < loceol &&
3568d838 4952 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4953 scan += UTF8SKIP(scan);
4954 hardcount++;
4955 }
4956 } else {
4957 while (scan < loceol && isALNUM(*scan))
4958 scan++;
a0ed51b3
LW
4959 }
4960 break;
bbce6d69 4961 case ALNUML:
3280af22 4962 PL_reg_flags |= RF_tainted;
1aa99e6b 4963 if (do_utf8) {
ffc61ed2 4964 loceol = PL_regeol;
1aa99e6b
IH
4965 while (hardcount < max && scan < loceol &&
4966 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4967 scan += UTF8SKIP(scan);
4968 hardcount++;
4969 }
4970 } else {
4971 while (scan < loceol && isALNUM_LC(*scan))
4972 scan++;
a0ed51b3
LW
4973 }
4974 break;
a0d0e21e 4975 case NALNUM:
1aa99e6b 4976 if (do_utf8) {
ffc61ed2 4977 loceol = PL_regeol;
1a4fad37 4978 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4979 while (hardcount < max && scan < loceol &&
3568d838 4980 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4981 scan += UTF8SKIP(scan);
4982 hardcount++;
4983 }
4984 } else {
4985 while (scan < loceol && !isALNUM(*scan))
4986 scan++;
a0ed51b3
LW
4987 }
4988 break;
bbce6d69 4989 case NALNUML:
3280af22 4990 PL_reg_flags |= RF_tainted;
1aa99e6b 4991 if (do_utf8) {
ffc61ed2 4992 loceol = PL_regeol;
1aa99e6b
IH
4993 while (hardcount < max && scan < loceol &&
4994 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4995 scan += UTF8SKIP(scan);
4996 hardcount++;
4997 }
4998 } else {
4999 while (scan < loceol && !isALNUM_LC(*scan))
5000 scan++;
a0ed51b3
LW
5001 }
5002 break;
a0d0e21e 5003 case SPACE:
1aa99e6b 5004 if (do_utf8) {
ffc61ed2 5005 loceol = PL_regeol;
1a4fad37 5006 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5007 while (hardcount < max && scan < loceol &&
3568d838
JH
5008 (*scan == ' ' ||
5009 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5010 scan += UTF8SKIP(scan);
5011 hardcount++;
5012 }
5013 } else {
5014 while (scan < loceol && isSPACE(*scan))
5015 scan++;
a0ed51b3
LW
5016 }
5017 break;
bbce6d69 5018 case SPACEL:
3280af22 5019 PL_reg_flags |= RF_tainted;
1aa99e6b 5020 if (do_utf8) {
ffc61ed2 5021 loceol = PL_regeol;
1aa99e6b 5022 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5023 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5024 scan += UTF8SKIP(scan);
5025 hardcount++;
5026 }
5027 } else {
5028 while (scan < loceol && isSPACE_LC(*scan))
5029 scan++;
a0ed51b3
LW
5030 }
5031 break;
a0d0e21e 5032 case NSPACE:
1aa99e6b 5033 if (do_utf8) {
ffc61ed2 5034 loceol = PL_regeol;
1a4fad37 5035 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5036 while (hardcount < max && scan < loceol &&
3568d838
JH
5037 !(*scan == ' ' ||
5038 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5039 scan += UTF8SKIP(scan);
5040 hardcount++;
5041 }
5042 } else {
5043 while (scan < loceol && !isSPACE(*scan))
5044 scan++;
5045 break;
a0ed51b3 5046 }
bbce6d69 5047 case NSPACEL:
3280af22 5048 PL_reg_flags |= RF_tainted;
1aa99e6b 5049 if (do_utf8) {
ffc61ed2 5050 loceol = PL_regeol;
1aa99e6b 5051 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5052 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5053 scan += UTF8SKIP(scan);
5054 hardcount++;
5055 }
5056 } else {
5057 while (scan < loceol && !isSPACE_LC(*scan))
5058 scan++;
a0ed51b3
LW
5059 }
5060 break;
a0d0e21e 5061 case DIGIT:
1aa99e6b 5062 if (do_utf8) {
ffc61ed2 5063 loceol = PL_regeol;
1a4fad37 5064 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5065 while (hardcount < max && scan < loceol &&
3568d838 5066 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5067 scan += UTF8SKIP(scan);
5068 hardcount++;
5069 }
5070 } else {
5071 while (scan < loceol && isDIGIT(*scan))
5072 scan++;
a0ed51b3
LW
5073 }
5074 break;
a0d0e21e 5075 case NDIGIT:
1aa99e6b 5076 if (do_utf8) {
ffc61ed2 5077 loceol = PL_regeol;
1a4fad37 5078 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5079 while (hardcount < max && scan < loceol &&
3568d838 5080 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5081 scan += UTF8SKIP(scan);
5082 hardcount++;
5083 }
5084 } else {
5085 while (scan < loceol && !isDIGIT(*scan))
5086 scan++;
a0ed51b3
LW
5087 }
5088 break;
a0d0e21e
LW
5089 default: /* Called on something of 0 width. */
5090 break; /* So match right here or not at all. */
5091 }
a687059c 5092
a0ed51b3
LW
5093 if (hardcount)
5094 c = hardcount;
5095 else
5096 c = scan - PL_reginput;
3280af22 5097 PL_reginput = scan;
a687059c 5098
a3621e74 5099 DEBUG_r({
e68ec53f 5100 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5101 DEBUG_EXECUTE_r({
e68ec53f
YO
5102 SV * const prop = sv_newmortal();
5103 regprop(prog, prop, p);
5104 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
5105 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5106 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5107 });
be8e71aa 5108 });
9041c2e3 5109
a0d0e21e 5110 return(c);
a687059c
LW
5111}
5112
c277df42 5113
be8e71aa 5114#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5115/*
ffc61ed2
JH
5116- regclass_swash - prepare the utf8 swash
5117*/
5118
5119SV *
32fc9b6a 5120Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5121{
97aff369 5122 dVAR;
9e55ce06
JH
5123 SV *sw = NULL;
5124 SV *si = NULL;
5125 SV *alt = NULL;
3dab1dad 5126 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5127
4f639d21 5128 if (data && data->count) {
a3b680e6 5129 const U32 n = ARG(node);
ffc61ed2 5130
4f639d21
DM
5131 if (data->what[n] == 's') {
5132 SV * const rv = (SV*)data->data[n];
890ce7af 5133 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5134 SV **const ary = AvARRAY(av);
9e55ce06 5135 SV **a, **b;
9041c2e3 5136
711a919c 5137 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5138 * documentation of these array elements. */
5139
b11f357e 5140 si = *ary;
8f7f7219 5141 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5142 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5143
ffc61ed2
JH
5144 if (a)
5145 sw = *a;
5146 else if (si && doinit) {
5147 sw = swash_init("utf8", "", si, 1, 0);
5148 (void)av_store(av, 1, sw);
5149 }
9e55ce06
JH
5150 if (b)
5151 alt = *b;
ffc61ed2
JH
5152 }
5153 }
5154
9e55ce06
JH
5155 if (listsvp)
5156 *listsvp = si;
5157 if (altsvp)
5158 *altsvp = alt;
ffc61ed2
JH
5159
5160 return sw;
5161}
76234dfb 5162#endif
ffc61ed2
JH
5163
5164/*
ba7b4546 5165 - reginclass - determine if a character falls into a character class
832705d4
JH
5166
5167 The n is the ANYOF regnode, the p is the target string, lenp
5168 is pointer to the maximum length of how far to go in the p
5169 (if the lenp is zero, UTF8SKIP(p) is used),
5170 do_utf8 tells whether the target string is in UTF-8.
5171
bbce6d69 5172 */
5173
76e3520e 5174STATIC bool
32fc9b6a 5175S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5176{
27da23d5 5177 dVAR;
a3b680e6 5178 const char flags = ANYOF_FLAGS(n);
bbce6d69 5179 bool match = FALSE;
cc07378b 5180 UV c = *p;
ae9ddab8 5181 STRLEN len = 0;
9e55ce06 5182 STRLEN plen;
1aa99e6b 5183
19f67299
TS
5184 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5185 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5186 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5187 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5188 if (len == (STRLEN)-1)
5189 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5190 }
bbce6d69 5191
0f0076b4 5192 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5193 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5194 if (lenp)
5195 *lenp = 0;
ffc61ed2 5196 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5197 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5198 match = TRUE;
bbce6d69 5199 }
3568d838 5200 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5201 match = TRUE;
ffc61ed2 5202 if (!match) {
9e55ce06 5203 AV *av;
32fc9b6a 5204 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5205
5206 if (sw) {
3568d838 5207 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5208 match = TRUE;
5209 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5210 if (!match && lenp && av) {
5211 I32 i;
9e55ce06 5212 for (i = 0; i <= av_len(av); i++) {
890ce7af 5213 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5214 STRLEN len;
890ce7af 5215 const char * const s = SvPV_const(sv, len);
9e55ce06 5216
061b10df 5217 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5218 *lenp = len;
5219 match = TRUE;
5220 break;
5221 }
5222 }
5223 }
5224 if (!match) {
89ebb4a3 5225 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5226 STRLEN tmplen;
5227
9e55ce06
JH
5228 to_utf8_fold(p, tmpbuf, &tmplen);
5229 if (swash_fetch(sw, tmpbuf, do_utf8))
5230 match = TRUE;
5231 }
ffc61ed2
JH
5232 }
5233 }
bbce6d69 5234 }
9e55ce06 5235 if (match && lenp && *lenp == 0)
0f0076b4 5236 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5237 }
1aa99e6b 5238 if (!match && c < 256) {
ffc61ed2
JH
5239 if (ANYOF_BITMAP_TEST(n, c))
5240 match = TRUE;
5241 else if (flags & ANYOF_FOLD) {
eb160463 5242 U8 f;
a0ed51b3 5243
ffc61ed2
JH
5244 if (flags & ANYOF_LOCALE) {
5245 PL_reg_flags |= RF_tainted;
5246 f = PL_fold_locale[c];
5247 }
5248 else
5249 f = PL_fold[c];
5250 if (f != c && ANYOF_BITMAP_TEST(n, f))
5251 match = TRUE;
5252 }
5253
5254 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5255 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5256 if (
5257 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5258 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5259 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5260 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5261 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5262 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5263 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5264 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5265 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5266 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5267 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5268 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5269 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5270 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5271 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5272 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5273 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5274 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5275 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5276 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5277 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5278 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5279 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5280 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5281 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5282 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5283 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5284 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5285 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5286 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5287 ) /* How's that for a conditional? */
5288 {
5289 match = TRUE;
5290 }
a0ed51b3 5291 }
a0ed51b3
LW
5292 }
5293
a0ed51b3
LW
5294 return (flags & ANYOF_INVERT) ? !match : match;
5295}
161b471a 5296
dfe13c55 5297STATIC U8 *
0ce71af7 5298S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5299{
97aff369 5300 dVAR;
a0ed51b3 5301 if (off >= 0) {
1aa99e6b 5302 while (off-- && s < lim) {
ffc61ed2 5303 /* XXX could check well-formedness here */
a0ed51b3 5304 s += UTF8SKIP(s);
ffc61ed2 5305 }
a0ed51b3
LW
5306 }
5307 else {
5308 while (off++) {
1aa99e6b 5309 if (s > lim) {
a0ed51b3 5310 s--;
ffc61ed2 5311 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5312 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5313 s--;
ffc61ed2
JH
5314 }
5315 /* XXX could check well-formedness here */
a0ed51b3
LW
5316 }
5317 }
5318 }
5319 return s;
5320}
161b471a 5321
dfe13c55 5322STATIC U8 *
0ce71af7 5323S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5324{
97aff369 5325 dVAR;
a0ed51b3 5326 if (off >= 0) {
1aa99e6b 5327 while (off-- && s < lim) {
ffc61ed2 5328 /* XXX could check well-formedness here */
a0ed51b3 5329 s += UTF8SKIP(s);
ffc61ed2 5330 }
a0ed51b3 5331 if (off >= 0)
3dab1dad 5332 return NULL;
a0ed51b3
LW
5333 }
5334 else {
5335 while (off++) {
1aa99e6b 5336 if (s > lim) {
a0ed51b3 5337 s--;
ffc61ed2 5338 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5339 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5340 s--;
ffc61ed2
JH
5341 }
5342 /* XXX could check well-formedness here */
a0ed51b3
LW
5343 }
5344 else
5345 break;
5346 }
5347 if (off <= 0)
3dab1dad 5348 return NULL;
a0ed51b3
LW
5349 }
5350 return s;
5351}
51371543 5352
51371543 5353static void
acfe0abc 5354restore_pos(pTHX_ void *arg)
51371543 5355{
97aff369 5356 dVAR;
097eb12c 5357 regexp * const rex = (regexp *)arg;
51371543
GS
5358 if (PL_reg_eval_set) {
5359 if (PL_reg_oldsaved) {
4f639d21
DM
5360 rex->subbeg = PL_reg_oldsaved;
5361 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5362#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5363 rex->saved_copy = PL_nrs;
ed252734 5364#endif
4f639d21 5365 RX_MATCH_COPIED_on(rex);
51371543
GS
5366 }
5367 PL_reg_magic->mg_len = PL_reg_oldpos;
5368 PL_reg_eval_set = 0;
5369 PL_curpm = PL_reg_oldcurpm;
5370 }
5371}
33b8afdf
JH
5372
5373STATIC void
5374S_to_utf8_substr(pTHX_ register regexp *prog)
5375{
33b8afdf 5376 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5377 SV* const sv = newSVsv(prog->float_substr);
5378 prog->float_utf8 = sv;
33b8afdf
JH
5379 sv_utf8_upgrade(sv);
5380 if (SvTAIL(prog->float_substr))
5381 SvTAIL_on(sv);
5382 if (prog->float_substr == prog->check_substr)
5383 prog->check_utf8 = sv;
5384 }
5385 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5386 SV* const sv = newSVsv(prog->anchored_substr);
5387 prog->anchored_utf8 = sv;
33b8afdf
JH
5388 sv_utf8_upgrade(sv);
5389 if (SvTAIL(prog->anchored_substr))
5390 SvTAIL_on(sv);
5391 if (prog->anchored_substr == prog->check_substr)
5392 prog->check_utf8 = sv;
5393 }
5394}
5395
5396STATIC void
5397S_to_byte_substr(pTHX_ register regexp *prog)
5398{
97aff369 5399 dVAR;
33b8afdf 5400 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5401 SV* sv = newSVsv(prog->float_utf8);
5402 prog->float_substr = sv;
33b8afdf
JH
5403 if (sv_utf8_downgrade(sv, TRUE)) {
5404 if (SvTAIL(prog->float_utf8))
5405 SvTAIL_on(sv);
5406 } else {
5407 SvREFCNT_dec(sv);
5408 prog->float_substr = sv = &PL_sv_undef;
5409 }
5410 if (prog->float_utf8 == prog->check_utf8)
5411 prog->check_substr = sv;
5412 }
5413 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5414 SV* sv = newSVsv(prog->anchored_utf8);
5415 prog->anchored_substr = sv;
33b8afdf
JH
5416 if (sv_utf8_downgrade(sv, TRUE)) {
5417 if (SvTAIL(prog->anchored_utf8))
5418 SvTAIL_on(sv);
5419 } else {
5420 SvREFCNT_dec(sv);
5421 prog->anchored_substr = sv = &PL_sv_undef;
5422 }
5423 if (prog->anchored_utf8 == prog->check_utf8)
5424 prog->check_substr = sv;
5425 }
5426}
66610fdd
RGS
5427
5428/*
5429 * Local variables:
5430 * c-indentation-style: bsd
5431 * c-basic-offset: 4
5432 * indent-tabs-mode: t
5433 * End:
5434 *
37442d52
RGS
5435 * ex: set ts=8 sts=4 sw=4 noet:
5436 */