This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [perl #56644] PerlIO resource leaks on open() and then :pop in :unix and :stdio
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
c277df42
IZ
83#define RF_tainted 1 /* tainted information used? */
84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
eb160463 88#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
32fc9b6a 97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 98
c277df42
IZ
99/*
100 * Forwards.
101 */
102
33b8afdf 103#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 105
3dab1dad
YO
106#define HOPc(pos,off) \
107 (char *)(PL_reg_match_utf8 \
52657f30 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
109 : (U8*)(pos + off))
110#define HOPBACKc(pos, off) \
07be1b83
YO
111 (char*)(PL_reg_match_utf8\
112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
113 : (pos - off >= PL_bostr) \
8e11feef 114 ? (U8*)pos - off \
3dab1dad 115 : NULL)
efb30f32 116
e7409c1b 117#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 118#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 119
1a4fad37
AL
120#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
121 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
122#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
123#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
124#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
125#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 126
3dab1dad
YO
127/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
128
5f80c4cf 129/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
130/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
131#define JUMPABLE(rn) ( \
132 OP(rn) == OPEN || \
133 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
134 OP(rn) == EVAL || \
cca55fe3
JP
135 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
136 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 137 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 138 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 139)
ee9b8eae 140#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 141
ee9b8eae
YO
142#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
143
144#if 0
145/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
146 we don't need this definition. */
147#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
148#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
149#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
150
151#else
152/* ... so we use this as its faster. */
153#define IS_TEXT(rn) ( OP(rn)==EXACT )
154#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
155#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
156
157#endif
e2d8ce26 158
a84d97b6
HS
159/*
160 Search for mandatory following text node; for lookahead, the text must
161 follow but for lookbehind (rn->flags != 0) we skip to the next step.
162*/
cca55fe3 163#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
164 while (JUMPABLE(rn)) { \
165 const OPCODE type = OP(rn); \
166 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 167 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 168 else if (type == PLUS) \
cca55fe3 169 rn = NEXTOPER(rn); \
3dab1dad 170 else if (type == IFMATCH) \
a84d97b6 171 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 172 else rn += NEXT_OFF(rn); \
3dab1dad 173 } \
5f80c4cf 174} STMT_END
74750237 175
c476f425 176
acfe0abc 177static void restore_pos(pTHX_ void *arg);
51371543 178
76e3520e 179STATIC CHECKPOINT
cea2e8a9 180S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 181{
97aff369 182 dVAR;
a3b680e6 183 const int retval = PL_savestack_ix;
b1ce53c5 184#define REGCP_PAREN_ELEMS 4
a3b680e6 185 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 186 int p;
40a82448 187 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 188
e49a9654
IH
189 if (paren_elems_to_push < 0)
190 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
191
f0ab9afb 192#define REGCP_OTHER_ELEMS 7
4b3c1a47 193 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 194
3280af22 195 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 196/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
197 SSPUSHINT(PL_regoffs[p].end);
198 SSPUSHINT(PL_regoffs[p].start);
3280af22 199 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 200 SSPUSHINT(p);
e7707071 201 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 202 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 203 (UV)p, (IV)PL_regoffs[p].start,
40a82448 204 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 205 (IV)PL_regoffs[p].end
40a82448 206 ));
a0d0e21e 207 }
b1ce53c5 208/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 209 SSPUSHPTR(PL_regoffs);
3280af22
NIS
210 SSPUSHINT(PL_regsize);
211 SSPUSHINT(*PL_reglastparen);
a01268b5 212 SSPUSHINT(*PL_reglastcloseparen);
3280af22 213 SSPUSHPTR(PL_reginput);
41123dfd
JH
214#define REGCP_FRAME_ELEMS 2
215/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
216 * are needed for the regexp context stack bookkeeping. */
217 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 218 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 219
a0d0e21e
LW
220 return retval;
221}
222
c277df42 223/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
224#define REGCP_SET(cp) \
225 DEBUG_STATE_r( \
ab3bbdeb 226 PerlIO_printf(Perl_debug_log, \
e4f74956 227 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
228 (IV)PL_savestack_ix)); \
229 cp = PL_savestack_ix
c3464db5 230
ab3bbdeb 231#define REGCP_UNWIND(cp) \
e4f74956 232 DEBUG_STATE_r( \
ab3bbdeb 233 if (cp != PL_savestack_ix) \
e4f74956
YO
234 PerlIO_printf(Perl_debug_log, \
235 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
236 (IV)(cp), (IV)PL_savestack_ix)); \
237 regcpblow(cp)
c277df42 238
76e3520e 239STATIC char *
097eb12c 240S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 241{
97aff369 242 dVAR;
3b6647e0 243 U32 i;
a0d0e21e 244 char *input;
a3621e74
YO
245 GET_RE_DEBUG_FLAGS_DECL;
246
7918f24d
NC
247 PERL_ARGS_ASSERT_REGCPPOP;
248
b1ce53c5 249 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 250 i = SSPOPINT;
b1ce53c5
JH
251 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
252 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 253 input = (char *) SSPOPPTR;
a01268b5 254 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
255 *PL_reglastparen = SSPOPINT;
256 PL_regsize = SSPOPINT;
f0ab9afb 257 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 258
6bda09f9 259
b1ce53c5 260 /* Now restore the parentheses context. */
41123dfd
JH
261 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
262 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 263 I32 tmps;
097eb12c 264 U32 paren = (U32)SSPOPINT;
3280af22 265 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 266 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 267 tmps = SSPOPINT;
3280af22 268 if (paren <= *PL_reglastparen)
f0ab9afb 269 PL_regoffs[paren].end = tmps;
e7707071 270 DEBUG_BUFFERS_r(
c3464db5 271 PerlIO_printf(Perl_debug_log,
b900a521 272 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 273 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 274 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 275 (IV)PL_regoffs[paren].end,
3280af22 276 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 277 );
a0d0e21e 278 }
e7707071 279 DEBUG_BUFFERS_r(
bb7a0f54 280 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 281 PerlIO_printf(Perl_debug_log,
faccc32b 282 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 283 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
284 }
285 );
daf18116 286#if 1
dafc8851
JH
287 /* It would seem that the similar code in regtry()
288 * already takes care of this, and in fact it is in
289 * a better location to since this code can #if 0-ed out
290 * but the code in regtry() is needed or otherwise tests
291 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
292 * (as of patchlevel 7877) will fail. Then again,
293 * this code seems to be necessary or otherwise
225593e1
DM
294 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
295 * --jhi updated by dapm */
3b6647e0 296 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 297 if (i > PL_regsize)
f0ab9afb
NC
298 PL_regoffs[i].start = -1;
299 PL_regoffs[i].end = -1;
a0d0e21e 300 }
dafc8851 301#endif
a0d0e21e
LW
302 return input;
303}
304
02db2b7b 305#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 306
a687059c 307/*
e50aee73 308 * pregexec and friends
a687059c
LW
309 */
310
76234dfb 311#ifndef PERL_IN_XSUB_RE
a687059c 312/*
c277df42 313 - pregexec - match a regexp against a string
a687059c 314 */
c277df42 315I32
49d7dfbc 316Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 317 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
318/* strend: pointer to null at end of string */
319/* strbeg: real beginning of string */
320/* minend: end of match must be >=minend after stringarg. */
321/* nosave: For optimizations. */
322{
7918f24d
NC
323 PERL_ARGS_ASSERT_PREGEXEC;
324
c277df42 325 return
9041c2e3 326 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
327 nosave ? 0 : REXEC_COPY_STR);
328}
76234dfb 329#endif
22e551b9 330
9041c2e3 331/*
cad2e5aa
JH
332 * Need to implement the following flags for reg_anch:
333 *
334 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
335 * USE_INTUIT_ML
336 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
337 * INTUIT_AUTORITATIVE_ML
338 * INTUIT_ONCE_NOML - Intuit can match in one location only.
339 * INTUIT_ONCE_ML
340 *
341 * Another flag for this function: SECOND_TIME (so that float substrs
342 * with giant delta may be not rechecked).
343 */
344
345/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
346
3f7c398e 347/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
348 Otherwise, only SvCUR(sv) is used to get strbeg. */
349
350/* XXXX We assume that strpos is strbeg unless sv. */
351
6eb5f6b9
JH
352/* XXXX Some places assume that there is a fixed substring.
353 An update may be needed if optimizer marks as "INTUITable"
354 RExen without fixed substrings. Similarly, it is assumed that
355 lengths of all the strings are no more than minlen, thus they
356 cannot come from lookahead.
40d049e4
YO
357 (Or minlen should take into account lookahead.)
358 NOTE: Some of this comment is not correct. minlen does now take account
359 of lookahead/behind. Further research is required. -- demerphq
360
361*/
6eb5f6b9 362
2c2d71f5
JH
363/* A failure to find a constant substring means that there is no need to make
364 an expensive call to REx engine, thus we celebrate a failure. Similarly,
365 finding a substring too deep into the string means that less calls to
30944b6d
IZ
366 regtry() should be needed.
367
368 REx compiler's optimizer found 4 possible hints:
369 a) Anchored substring;
370 b) Fixed substring;
371 c) Whether we are anchored (beginning-of-line or \G);
372 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 373 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
374 string which does not contradict any of them.
375 */
2c2d71f5 376
6eb5f6b9
JH
377/* Most of decisions we do here should have been done at compile time.
378 The nodes of the REx which we used for the search should have been
379 deleted from the finite automaton. */
380
cad2e5aa 381char *
288b8c02 382Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 383 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 384{
97aff369 385 dVAR;
288b8c02 386 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 387 register I32 start_shift = 0;
cad2e5aa 388 /* Should be nonnegative! */
b7953727 389 register I32 end_shift = 0;
2c2d71f5
JH
390 register char *s;
391 register SV *check;
a1933d95 392 char *strbeg;
cad2e5aa 393 char *t;
1de06328 394 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 395 I32 ml_anch;
bd61b366
SS
396 register char *other_last = NULL; /* other substr checked before this */
397 char *check_at = NULL; /* check substr found at this pos */
bbe252da 398 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 399 RXi_GET_DECL(prog,progi);
30944b6d 400#ifdef DEBUGGING
890ce7af 401 const char * const i_strpos = strpos;
30944b6d 402#endif
a3621e74
YO
403 GET_RE_DEBUG_FLAGS_DECL;
404
7918f24d
NC
405 PERL_ARGS_ASSERT_RE_INTUIT_START;
406
288b8c02 407 RX_MATCH_UTF8_set(rx,do_utf8);
cad2e5aa 408
3c8556c3 409 if (RX_UTF8(rx)) {
b8d68ded
JH
410 PL_reg_flags |= RF_utf8;
411 }
ab3bbdeb 412 DEBUG_EXECUTE_r(
efd26800 413 debug_start_match(rx, do_utf8, strpos, strend,
1de06328
YO
414 sv ? "Guessing start of match in sv for"
415 : "Guessing start of match in string for");
2a782b5b 416 );
cad2e5aa 417
c344f387
JH
418 /* CHR_DIST() would be more correct here but it makes things slow. */
419 if (prog->minlen > strend - strpos) {
a3621e74 420 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 421 "String too short... [re_intuit_start]\n"));
cad2e5aa 422 goto fail;
2c2d71f5 423 }
1de06328 424
a1933d95 425 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 426 PL_regeol = strend;
33b8afdf
JH
427 if (do_utf8) {
428 if (!prog->check_utf8 && prog->check_substr)
429 to_utf8_substr(prog);
430 check = prog->check_utf8;
431 } else {
432 if (!prog->check_substr && prog->check_utf8)
433 to_byte_substr(prog);
434 check = prog->check_substr;
435 }
1de06328 436 if (check == &PL_sv_undef) {
a3621e74 437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 438 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
439 goto fail;
440 }
bbe252da
YO
441 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
442 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
443 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 444 && !multiline ) ); /* Check after \n? */
cad2e5aa 445
7e25d62c 446 if (!ml_anch) {
bbe252da
YO
447 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
448 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 449 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
450 && sv && !SvROK(sv)
451 && (strpos != strbeg)) {
a3621e74 452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
453 goto fail;
454 }
455 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 456 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 457 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
458 I32 slen;
459
1aa99e6b 460 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 461
653099ff
GS
462 if (SvTAIL(check)) {
463 slen = SvCUR(check); /* >= 1 */
cad2e5aa 464
9041c2e3 465 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 466 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 467 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 468 goto fail_finish;
cad2e5aa
JH
469 }
470 /* Now should match s[0..slen-2] */
471 slen--;
3f7c398e 472 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 473 || (slen > 1
3f7c398e 474 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 475 report_neq:
a3621e74 476 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
477 goto fail_finish;
478 }
cad2e5aa 479 }
3f7c398e 480 else if (*SvPVX_const(check) != *s
653099ff 481 || ((slen = SvCUR(check)) > 1
3f7c398e 482 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 483 goto report_neq;
c315bfe8 484 check_at = s;
2c2d71f5 485 goto success_at_start;
7e25d62c 486 }
cad2e5aa 487 }
2c2d71f5 488 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 489 s = strpos;
2c2d71f5 490 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
491 end_shift = prog->check_end_shift;
492
2c2d71f5 493 if (!ml_anch) {
a3b680e6 494 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 495 - (SvTAIL(check) != 0);
a3b680e6 496 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
497
498 if (end_shift < eshift)
499 end_shift = eshift;
500 }
cad2e5aa 501 }
2c2d71f5 502 else { /* Can match at random position */
cad2e5aa
JH
503 ml_anch = 0;
504 s = strpos;
1de06328
YO
505 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
506 end_shift = prog->check_end_shift;
507
508 /* end shift should be non negative here */
cad2e5aa
JH
509 }
510
bcdf7404 511#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 512 if (end_shift < 0)
1de06328 513 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 514 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
515#endif
516
2c2d71f5
JH
517 restart:
518 /* Find a possible match in the region s..strend by looking for
519 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
520
521 {
522 I32 srch_start_shift = start_shift;
523 I32 srch_end_shift = end_shift;
524 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
525 srch_end_shift -= ((strbeg - s) - srch_start_shift);
526 srch_start_shift = strbeg - s;
527 }
6bda09f9 528 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
529 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
530 (IV)prog->check_offset_min,
531 (IV)srch_start_shift,
532 (IV)srch_end_shift,
533 (IV)prog->check_end_shift);
534 });
535
cad2e5aa 536 if (flags & REXEC_SCREAM) {
cad2e5aa 537 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 538 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 539
2c2d71f5
JH
540 if (PL_screamfirst[BmRARE(check)] >= 0
541 || ( BmRARE(check) == '\n'
85c508c3 542 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 543 && SvTAIL(check) ))
9041c2e3 544 s = screaminstr(sv, check,
1de06328 545 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 546 else
2c2d71f5 547 goto fail_finish;
4addbd3b 548 /* we may be pointing at the wrong string */
07bc277f 549 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 550 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
551 if (data)
552 *data->scream_olds = s;
553 }
1de06328
YO
554 else {
555 U8* start_point;
556 U8* end_point;
bbe252da 557 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
558 start_point= (U8*)(s + srch_start_shift);
559 end_point= (U8*)(strend - srch_end_shift);
560 } else {
561 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
562 end_point= HOP3(strend, -srch_end_shift, strbeg);
563 }
6bda09f9 564 DEBUG_OPTIMISE_MORE_r({
56570a2c 565 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 566 (int)(end_point - start_point),
fc8cd66c 567 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
568 start_point);
569 });
570
571 s = fbm_instr( start_point, end_point,
7fba1cd6 572 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
573 }
574 }
cad2e5aa
JH
575 /* Update the count-of-usability, remove useless subpatterns,
576 unshift s. */
2c2d71f5 577
ab3bbdeb
YO
578 DEBUG_EXECUTE_r({
579 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
580 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
581 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 582 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
583 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
584 ? "anchored" : "floating"),
585 quoted,
586 RE_SV_TAIL(check),
587 (s ? " at offset " : "...\n") );
588 });
2c2d71f5
JH
589
590 if (!s)
591 goto fail_finish;
2c2d71f5 592 /* Finish the diagnostic message */
a3621e74 593 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 594
1de06328
YO
595 /* XXX dmq: first branch is for positive lookbehind...
596 Our check string is offset from the beginning of the pattern.
597 So we need to do any stclass tests offset forward from that
598 point. I think. :-(
599 */
600
601
602
603 check_at=s;
604
605
2c2d71f5
JH
606 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
607 Start with the other substr.
608 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 609 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
610 *always* match. Probably should be marked during compile...
611 Probably it is right to do no SCREAM here...
612 */
613
1de06328
YO
614 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
615 : (prog->float_substr && prog->anchored_substr))
616 {
30944b6d 617 /* Take into account the "other" substring. */
2c2d71f5
JH
618 /* XXXX May be hopelessly wrong for UTF... */
619 if (!other_last)
6eb5f6b9 620 other_last = strpos;
33b8afdf 621 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
622 do_other_anchored:
623 {
890ce7af
AL
624 char * const last = HOP3c(s, -start_shift, strbeg);
625 char *last1, *last2;
be8e71aa 626 char * const saved_s = s;
33b8afdf 627 SV* must;
2c2d71f5 628
2c2d71f5
JH
629 t = s - prog->check_offset_max;
630 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 631 && (!do_utf8
0ce71af7 632 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 633 && t > strpos)))
6f207bd3 634 NOOP;
2c2d71f5
JH
635 else
636 t = strpos;
1aa99e6b 637 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
638 if (t < other_last) /* These positions already checked */
639 t = other_last;
1aa99e6b 640 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
641 if (last < last1)
642 last1 = last;
1de06328
YO
643 /* XXXX It is not documented what units *_offsets are in.
644 We assume bytes, but this is clearly wrong.
645 Meaning this code needs to be carefully reviewed for errors.
646 dmq.
647 */
648
2c2d71f5 649 /* On end-of-str: see comment below. */
33b8afdf
JH
650 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
651 if (must == &PL_sv_undef) {
652 s = (char*)NULL;
1de06328 653 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
654 }
655 else
656 s = fbm_instr(
657 (unsigned char*)t,
658 HOP3(HOP3(last1, prog->anchored_offset, strend)
659 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
660 must,
7fba1cd6 661 multiline ? FBMrf_MULTILINE : 0
33b8afdf 662 );
ab3bbdeb
YO
663 DEBUG_EXECUTE_r({
664 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
665 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
666 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 667 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
668 quoted, RE_SV_TAIL(must));
669 });
670
671
2c2d71f5
JH
672 if (!s) {
673 if (last1 >= last2) {
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
675 ", giving up...\n"));
676 goto fail_finish;
677 }
a3621e74 678 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 679 ", trying floating at offset %ld...\n",
be8e71aa 680 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
681 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
682 s = HOP3c(last, 1, strend);
2c2d71f5
JH
683 goto restart;
684 }
685 else {
a3621e74 686 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 687 (long)(s - i_strpos)));
1aa99e6b
IH
688 t = HOP3c(s, -prog->anchored_offset, strbeg);
689 other_last = HOP3c(s, 1, strend);
be8e71aa 690 s = saved_s;
2c2d71f5
JH
691 if (t == strpos)
692 goto try_at_start;
2c2d71f5
JH
693 goto try_at_offset;
694 }
30944b6d 695 }
2c2d71f5
JH
696 }
697 else { /* Take into account the floating substring. */
33b8afdf 698 char *last, *last1;
be8e71aa 699 char * const saved_s = s;
33b8afdf
JH
700 SV* must;
701
702 t = HOP3c(s, -start_shift, strbeg);
703 last1 = last =
704 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
705 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
706 last = HOP3c(t, prog->float_max_offset, strend);
707 s = HOP3c(t, prog->float_min_offset, strend);
708 if (s < other_last)
709 s = other_last;
2c2d71f5 710 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
711 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
712 /* fbm_instr() takes into account exact value of end-of-str
713 if the check is SvTAIL(ed). Since false positives are OK,
714 and end-of-str is not later than strend we are OK. */
715 if (must == &PL_sv_undef) {
716 s = (char*)NULL;
1de06328 717 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
718 }
719 else
2c2d71f5 720 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
721 (unsigned char*)last + SvCUR(must)
722 - (SvTAIL(must)!=0),
7fba1cd6 723 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
724 DEBUG_EXECUTE_r({
725 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
726 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
727 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 728 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
729 quoted, RE_SV_TAIL(must));
730 });
33b8afdf
JH
731 if (!s) {
732 if (last1 == last) {
a3621e74 733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
734 ", giving up...\n"));
735 goto fail_finish;
2c2d71f5 736 }
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 738 ", trying anchored starting at offset %ld...\n",
be8e71aa 739 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
740 other_last = last;
741 s = HOP3c(t, 1, strend);
742 goto restart;
743 }
744 else {
a3621e74 745 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
746 (long)(s - i_strpos)));
747 other_last = s; /* Fix this later. --Hugo */
be8e71aa 748 s = saved_s;
33b8afdf
JH
749 if (t == strpos)
750 goto try_at_start;
751 goto try_at_offset;
752 }
2c2d71f5 753 }
cad2e5aa 754 }
2c2d71f5 755
1de06328 756
9ef43ace 757 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 758
6bda09f9 759 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
760 PerlIO_printf(Perl_debug_log,
761 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
762 (IV)prog->check_offset_min,
763 (IV)prog->check_offset_max,
764 (IV)(s-strpos),
765 (IV)(t-strpos),
766 (IV)(t-s),
767 (IV)(strend-strpos)
768 )
769 );
770
2c2d71f5 771 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 772 && (!do_utf8
9ef43ace 773 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
774 && t > strpos)))
775 {
2c2d71f5
JH
776 /* Fixed substring is found far enough so that the match
777 cannot start at strpos. */
778 try_at_offset:
cad2e5aa 779 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
780 /* Eventually fbm_*() should handle this, but often
781 anchored_offset is not 0, so this check will not be wasted. */
782 /* XXXX In the code below we prefer to look for "^" even in
783 presence of anchored substrings. And we search even
784 beyond the found float position. These pessimizations
785 are historical artefacts only. */
786 find_anchor:
2c2d71f5 787 while (t < strend - prog->minlen) {
cad2e5aa 788 if (*t == '\n') {
4ee3650e 789 if (t < check_at - prog->check_offset_min) {
33b8afdf 790 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
791 /* Since we moved from the found position,
792 we definitely contradict the found anchored
30944b6d
IZ
793 substr. Due to the above check we do not
794 contradict "check" substr.
795 Thus we can arrive here only if check substr
796 is float. Redo checking for "other"=="fixed".
797 */
9041c2e3 798 strpos = t + 1;
a3621e74 799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 800 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
801 goto do_other_anchored;
802 }
4ee3650e
GS
803 /* We don't contradict the found floating substring. */
804 /* XXXX Why not check for STCLASS? */
cad2e5aa 805 s = t + 1;
a3621e74 806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 807 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
808 goto set_useful;
809 }
4ee3650e
GS
810 /* Position contradicts check-string */
811 /* XXXX probably better to look for check-string
812 than for "\n", so one should lower the limit for t? */
a3621e74 813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 814 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 815 other_last = strpos = s = t + 1;
cad2e5aa
JH
816 goto restart;
817 }
818 t++;
819 }
a3621e74 820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 821 PL_colors[0], PL_colors[1]));
2c2d71f5 822 goto fail_finish;
cad2e5aa 823 }
f5952150 824 else {
a3621e74 825 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 826 PL_colors[0], PL_colors[1]));
f5952150 827 }
cad2e5aa
JH
828 s = t;
829 set_useful:
33b8afdf 830 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
831 }
832 else {
f5952150 833 /* The found string does not prohibit matching at strpos,
2c2d71f5 834 - no optimization of calling REx engine can be performed,
f5952150
GS
835 unless it was an MBOL and we are not after MBOL,
836 or a future STCLASS check will fail this. */
2c2d71f5
JH
837 try_at_start:
838 /* Even in this situation we may use MBOL flag if strpos is offset
839 wrt the start of the string. */
05b4157f 840 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 841 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 842 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 843 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 844 {
cad2e5aa
JH
845 t = strpos;
846 goto find_anchor;
847 }
a3621e74 848 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 849 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 850 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 851 );
2c2d71f5 852 success_at_start:
bbe252da 853 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
854 && (do_utf8 ? (
855 prog->check_utf8 /* Could be deleted already */
856 && --BmUSEFUL(prog->check_utf8) < 0
857 && (prog->check_utf8 == prog->float_utf8)
858 ) : (
859 prog->check_substr /* Could be deleted already */
860 && --BmUSEFUL(prog->check_substr) < 0
861 && (prog->check_substr == prog->float_substr)
862 )))
66e933ab 863 {
cad2e5aa 864 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 865 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
866 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
867 if (do_utf8 ? prog->check_substr : prog->check_utf8)
868 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
869 prog->check_substr = prog->check_utf8 = NULL; /* disable */
870 prog->float_substr = prog->float_utf8 = NULL; /* clear */
871 check = NULL; /* abort */
cad2e5aa 872 s = strpos;
3cf5c195
IZ
873 /* XXXX This is a remnant of the old implementation. It
874 looks wasteful, since now INTUIT can use many
6eb5f6b9 875 other heuristics. */
bbe252da 876 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
877 }
878 else
879 s = strpos;
880 }
881
6eb5f6b9
JH
882 /* Last resort... */
883 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
884 /* trie stclasses are too expensive to use here, we are better off to
885 leave it to regmatch itself */
f8fc2ecf 886 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
887 /* minlen == 0 is possible if regstclass is \b or \B,
888 and the fixed substr is ''$.
889 Since minlen is already taken into account, s+1 is before strend;
890 accidentally, minlen >= 1 guaranties no false positives at s + 1
891 even for \b or \B. But (minlen? 1 : 0) below assumes that
892 regstclass does not come from lookahead... */
893 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
894 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
895 const U8* const str = (U8*)STRING(progi->regstclass);
896 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
897 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 898 : 1);
1de06328
YO
899 char * endpos;
900 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
901 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
902 else if (prog->float_substr || prog->float_utf8)
903 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
904 else
905 endpos= strend;
906
70685ca0
JH
907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
908 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 909
6eb5f6b9 910 t = s;
f8fc2ecf 911 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
912 if (!s) {
913#ifdef DEBUGGING
cbbf8932 914 const char *what = NULL;
6eb5f6b9
JH
915#endif
916 if (endpos == strend) {
a3621e74 917 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
918 "Could not match STCLASS...\n") );
919 goto fail;
920 }
a3621e74 921 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 922 "This position contradicts STCLASS...\n") );
bbe252da 923 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 924 goto fail;
6eb5f6b9 925 /* Contradict one of substrings */
33b8afdf
JH
926 if (prog->anchored_substr || prog->anchored_utf8) {
927 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 928 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 929 hop_and_restart:
1aa99e6b 930 s = HOP3c(t, 1, strend);
66e933ab
GS
931 if (s + start_shift + end_shift > strend) {
932 /* XXXX Should be taken into account earlier? */
a3621e74 933 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
934 "Could not match STCLASS...\n") );
935 goto fail;
936 }
5e39e1e5
HS
937 if (!check)
938 goto giveup;
a3621e74 939 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 940 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
941 what, (long)(s + start_shift - i_strpos)) );
942 goto restart;
943 }
66e933ab 944 /* Have both, check_string is floating */
6eb5f6b9
JH
945 if (t + start_shift >= check_at) /* Contradicts floating=check */
946 goto retry_floating_check;
947 /* Recheck anchored substring, but not floating... */
9041c2e3 948 s = check_at;
5e39e1e5
HS
949 if (!check)
950 goto giveup;
a3621e74 951 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 952 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
953 (long)(other_last - i_strpos)) );
954 goto do_other_anchored;
955 }
60e71179
GS
956 /* Another way we could have checked stclass at the
957 current position only: */
958 if (ml_anch) {
959 s = t = t + 1;
5e39e1e5
HS
960 if (!check)
961 goto giveup;
a3621e74 962 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 963 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 964 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 965 goto try_at_offset;
66e933ab 966 }
33b8afdf 967 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 968 goto fail;
6eb5f6b9
JH
969 /* Check is floating subtring. */
970 retry_floating_check:
971 t = check_at - start_shift;
a3621e74 972 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
973 goto hop_and_restart;
974 }
b7953727 975 if (t != s) {
a3621e74 976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 977 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
978 (long)(t - i_strpos), (long)(s - i_strpos))
979 );
980 }
981 else {
a3621e74 982 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
983 "Does not contradict STCLASS...\n");
984 );
985 }
6eb5f6b9 986 }
5e39e1e5 987 giveup:
a3621e74 988 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
989 PL_colors[4], (check ? "Guessed" : "Giving up"),
990 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 991 return s;
2c2d71f5
JH
992
993 fail_finish: /* Substring not found */
33b8afdf
JH
994 if (prog->check_substr || prog->check_utf8) /* could be removed already */
995 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 996 fail:
a3621e74 997 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 998 PL_colors[4], PL_colors[5]));
bd61b366 999 return NULL;
cad2e5aa 1000}
9661b544 1001
a0a388a1
YO
1002#define DECL_TRIE_TYPE(scan) \
1003 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1004 trie_type = (scan->flags != EXACT) \
1005 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1006 : (do_utf8 ? trie_utf8 : trie_plain)
3b0527fe 1007
55eed653
NC
1008#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1009uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1010 switch (trie_type) { \
1011 case trie_utf8_fold: \
1012 if ( foldlen>0 ) { \
1013 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1014 foldlen -= len; \
1015 uscan += len; \
1016 len=0; \
1017 } else { \
1018 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1019 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1020 foldlen -= UNISKIP( uvc ); \
1021 uscan = foldbuf + UNISKIP( uvc ); \
1022 } \
1023 break; \
a0a388a1
YO
1024 case trie_latin_utf8_fold: \
1025 if ( foldlen>0 ) { \
1026 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1027 foldlen -= len; \
1028 uscan += len; \
1029 len=0; \
1030 } else { \
1031 len = 1; \
1032 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1033 foldlen -= UNISKIP( uvc ); \
1034 uscan = foldbuf + UNISKIP( uvc ); \
1035 } \
1036 break; \
4cadc6a9
YO
1037 case trie_utf8: \
1038 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1039 break; \
1040 case trie_plain: \
1041 uvc = (UV)*uc; \
1042 len = 1; \
1043 } \
1044 \
1045 if (uvc < 256) { \
1046 charid = trie->charmap[ uvc ]; \
1047 } \
1048 else { \
1049 charid = 0; \
55eed653
NC
1050 if (widecharmap) { \
1051 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1052 (char*)&uvc, sizeof(UV), 0); \
1053 if (svpp) \
1054 charid = (U16)SvIV(*svpp); \
1055 } \
1056 } \
1057} STMT_END
1058
a0a388a1
YO
1059#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1060{ \
1061 char *my_strend= (char *)strend; \
4cadc6a9
YO
1062 if ( (CoNd) \
1063 && (ln == len || \
a0a388a1 1064 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
4cadc6a9 1065 m, NULL, ln, (bool)UTF)) \
a0a388a1 1066 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1067 goto got_it; \
1068 else { \
1069 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1070 uvchr_to_utf8(tmpbuf, c); \
1071 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1072 if ( f != c \
1073 && (f == c1 || f == c2) \
a0a388a1
YO
1074 && (ln == len || \
1075 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1076 m, NULL, ln, (bool)UTF)) \
1077 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1078 goto got_it; \
1079 } \
a0a388a1
YO
1080} \
1081s += len
4cadc6a9
YO
1082
1083#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1084STMT_START { \
1085 while (s <= e) { \
1086 if ( (CoNd) \
1087 && (ln == 1 || !(OP(c) == EXACTF \
1088 ? ibcmp(s, m, ln) \
1089 : ibcmp_locale(s, m, ln))) \
24b23f37 1090 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1091 goto got_it; \
1092 s++; \
1093 } \
1094} STMT_END
1095
1096#define REXEC_FBC_UTF8_SCAN(CoDe) \
1097STMT_START { \
1098 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1099 CoDe \
1100 s += uskip; \
1101 } \
1102} STMT_END
1103
1104#define REXEC_FBC_SCAN(CoDe) \
1105STMT_START { \
1106 while (s < strend) { \
1107 CoDe \
1108 s++; \
1109 } \
1110} STMT_END
1111
1112#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1113REXEC_FBC_UTF8_SCAN( \
1114 if (CoNd) { \
24b23f37 1115 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1116 goto got_it; \
1117 else \
1118 tmp = doevery; \
1119 } \
1120 else \
1121 tmp = 1; \
1122)
1123
1124#define REXEC_FBC_CLASS_SCAN(CoNd) \
1125REXEC_FBC_SCAN( \
1126 if (CoNd) { \
24b23f37 1127 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1128 goto got_it; \
1129 else \
1130 tmp = doevery; \
1131 } \
1132 else \
1133 tmp = 1; \
1134)
1135
1136#define REXEC_FBC_TRYIT \
24b23f37 1137if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1138 goto got_it
1139
e1d1eefb
YO
1140#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1141 if (do_utf8) { \
1142 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1143 } \
1144 else { \
1145 REXEC_FBC_CLASS_SCAN(CoNd); \
1146 } \
1147 break
1148
4cadc6a9
YO
1149#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1150 if (do_utf8) { \
1151 UtFpReLoAd; \
1152 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1153 } \
1154 else { \
1155 REXEC_FBC_CLASS_SCAN(CoNd); \
1156 } \
1157 break
1158
1159#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1160 PL_reg_flags |= RF_tainted; \
1161 if (do_utf8) { \
1162 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1163 } \
1164 else { \
1165 REXEC_FBC_CLASS_SCAN(CoNd); \
1166 } \
1167 break
1168
786e8c11
YO
1169#define DUMP_EXEC_POS(li,s,doutf8) \
1170 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1171
1172/* We know what class REx starts with. Try to find this position... */
1173/* if reginfo is NULL, its a dryrun */
1174/* annoyingly all the vars in this routine have different names from their counterparts
1175 in regmatch. /grrr */
1176
3c3eec57 1177STATIC char *
07be1b83 1178S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1179 const char *strend, regmatch_info *reginfo)
a687059c 1180{
27da23d5 1181 dVAR;
bbe252da 1182 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1183 char *m;
d8093b23 1184 STRLEN ln;
5dab1207 1185 STRLEN lnc;
078c425b 1186 register STRLEN uskip;
d8093b23
G
1187 unsigned int c1;
1188 unsigned int c2;
6eb5f6b9
JH
1189 char *e;
1190 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1191 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf 1192 RXi_GET_DECL(prog,progi);
7918f24d
NC
1193
1194 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1195
6eb5f6b9
JH
1196 /* We know what class it must start with. */
1197 switch (OP(c)) {
6eb5f6b9 1198 case ANYOF:
388cc4de 1199 if (do_utf8) {
4cadc6a9 1200 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1201 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1202 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1203 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1204 }
1205 else {
1206 while (s < strend) {
1207 STRLEN skip = 1;
1208
32fc9b6a 1209 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1210 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1211 /* The assignment of 2 is intentional:
1212 * for the folded sharp s, the skip is 2. */
1213 (skip = SHARP_S_SKIP))) {
24b23f37 1214 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1215 goto got_it;
1216 else
1217 tmp = doevery;
1218 }
1219 else
1220 tmp = 1;
1221 s += skip;
1222 }
a0d0e21e 1223 }
6eb5f6b9 1224 break;
f33976b4 1225 case CANY:
4cadc6a9 1226 REXEC_FBC_SCAN(
24b23f37 1227 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1228 goto got_it;
1229 else
1230 tmp = doevery;
4cadc6a9 1231 );
f33976b4 1232 break;
6eb5f6b9 1233 case EXACTF:
5dab1207
NIS
1234 m = STRING(c);
1235 ln = STR_LEN(c); /* length to match in octets/bytes */
1236 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1237 if (UTF) {
a2a2844f 1238 STRLEN ulen1, ulen2;
5dab1207 1239 U8 *sm = (U8 *) m;
89ebb4a3
JH
1240 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1241 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1242 /* used by commented-out code below */
1243 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1244
1245 /* XXX: Since the node will be case folded at compile
1246 time this logic is a little odd, although im not
1247 sure that its actually wrong. --dmq */
1248
1249 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1250 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1251
1252 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1253 codepoint of the first character in the converted
1254 form, yet originally we did the extra step.
1255 No tests fail by commenting this code out however
1256 so Ive left it out. -- dmq.
1257
89ebb4a3 1258 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1259 0, uniflags);
89ebb4a3 1260 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1261 0, uniflags);
a0a388a1
YO
1262 */
1263
5dab1207
NIS
1264 lnc = 0;
1265 while (sm < ((U8 *) m + ln)) {
1266 lnc++;
1267 sm += UTF8SKIP(sm);
1268 }
1aa99e6b
IH
1269 }
1270 else {
1271 c1 = *(U8*)m;
1272 c2 = PL_fold[c1];
1273 }
6eb5f6b9
JH
1274 goto do_exactf;
1275 case EXACTFL:
5dab1207
NIS
1276 m = STRING(c);
1277 ln = STR_LEN(c);
1278 lnc = (I32) ln;
d8093b23 1279 c1 = *(U8*)m;
6eb5f6b9
JH
1280 c2 = PL_fold_locale[c1];
1281 do_exactf:
db12adc6 1282 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1283
3b0527fe 1284 if (!reginfo && e < s)
6eb5f6b9 1285 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1286
60a8b682
JH
1287 /* The idea in the EXACTF* cases is to first find the
1288 * first character of the EXACTF* node and then, if
1289 * necessary, case-insensitively compare the full
1290 * text of the node. The c1 and c2 are the first
1291 * characters (though in Unicode it gets a bit
1292 * more complicated because there are more cases
7f16dd3d
JH
1293 * than just upper and lower: one needs to use
1294 * the so-called folding case for case-insensitive
1295 * matching (called "loose matching" in Unicode).
1296 * ibcmp_utf8() will do just that. */
60a8b682 1297
a0a388a1 1298 if (do_utf8 || UTF) {
575cac57 1299 UV c, f;
89ebb4a3 1300 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1301 STRLEN len = 1;
1302 STRLEN foldlen;
4ad0818d 1303 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1304 if (c1 == c2) {
5dab1207
NIS
1305 /* Upper and lower of 1st char are equal -
1306 * probably not a "letter". */
1aa99e6b 1307 while (s <= e) {
a0a388a1
YO
1308 if (do_utf8) {
1309 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1310 uniflags);
a0a388a1
YO
1311 } else {
1312 c = *((U8*)s);
1313 }
4cadc6a9 1314 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1315 }
09091399
JH
1316 }
1317 else {
1aa99e6b 1318 while (s <= e) {
a0a388a1
YO
1319 if (do_utf8) {
1320 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1321 uniflags);
a0a388a1
YO
1322 } else {
1323 c = *((U8*)s);
1324 }
80aecb99 1325
60a8b682 1326 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1327 * Note that not all the possible combinations
1328 * are handled here: some of them are handled
1329 * by the standard folding rules, and some of
1330 * them (the character class or ANYOF cases)
1331 * are handled during compiletime in
1332 * regexec.c:S_regclass(). */
880bd946
JH
1333 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1334 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1335 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1336
4cadc6a9 1337 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1338 }
09091399 1339 }
1aa99e6b
IH
1340 }
1341 else {
a0a388a1 1342 /* Neither pattern nor string are UTF8 */
1aa99e6b 1343 if (c1 == c2)
4cadc6a9 1344 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1345 else
4cadc6a9 1346 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1347 }
1348 break;
bbce6d69 1349 case BOUNDL:
3280af22 1350 PL_reg_flags |= RF_tainted;
bbce6d69 1351 /* FALL THROUGH */
a0d0e21e 1352 case BOUND:
ffc61ed2 1353 if (do_utf8) {
12d33761 1354 if (s == PL_bostr)
ffc61ed2
JH
1355 tmp = '\n';
1356 else {
6136c704 1357 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1358 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1359 }
1360 tmp = ((OP(c) == BOUND ?
9041c2e3 1361 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1362 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1363 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1364 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1365 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1366 isALNUM_LC_utf8((U8*)s)))
1367 {
1368 tmp = !tmp;
4cadc6a9 1369 REXEC_FBC_TRYIT;
a687059c 1370 }
4cadc6a9 1371 );
a0d0e21e 1372 }
667bb95a 1373 else {
12d33761 1374 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1375 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1376 REXEC_FBC_SCAN(
ffc61ed2
JH
1377 if (tmp ==
1378 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1379 tmp = !tmp;
4cadc6a9 1380 REXEC_FBC_TRYIT;
a0ed51b3 1381 }
4cadc6a9 1382 );
a0ed51b3 1383 }
24b23f37 1384 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1385 goto got_it;
1386 break;
bbce6d69 1387 case NBOUNDL:
3280af22 1388 PL_reg_flags |= RF_tainted;
bbce6d69 1389 /* FALL THROUGH */
a0d0e21e 1390 case NBOUND:
ffc61ed2 1391 if (do_utf8) {
12d33761 1392 if (s == PL_bostr)
ffc61ed2
JH
1393 tmp = '\n';
1394 else {
6136c704 1395 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1396 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1397 }
1398 tmp = ((OP(c) == NBOUND ?
9041c2e3 1399 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1400 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1401 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1402 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1403 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1404 isALNUM_LC_utf8((U8*)s)))
1405 tmp = !tmp;
4cadc6a9
YO
1406 else REXEC_FBC_TRYIT;
1407 );
a0d0e21e 1408 }
667bb95a 1409 else {
12d33761 1410 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1411 tmp = ((OP(c) == NBOUND ?
1412 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1413 REXEC_FBC_SCAN(
ffc61ed2
JH
1414 if (tmp ==
1415 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1416 tmp = !tmp;
4cadc6a9
YO
1417 else REXEC_FBC_TRYIT;
1418 );
a0ed51b3 1419 }
24b23f37 1420 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1421 goto got_it;
1422 break;
a0d0e21e 1423 case ALNUM:
4cadc6a9
YO
1424 REXEC_FBC_CSCAN_PRELOAD(
1425 LOAD_UTF8_CHARCLASS_ALNUM(),
1426 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1427 isALNUM(*s)
1428 );
bbce6d69 1429 case ALNUML:
4cadc6a9
YO
1430 REXEC_FBC_CSCAN_TAINT(
1431 isALNUM_LC_utf8((U8*)s),
1432 isALNUM_LC(*s)
1433 );
a0d0e21e 1434 case NALNUM:
4cadc6a9
YO
1435 REXEC_FBC_CSCAN_PRELOAD(
1436 LOAD_UTF8_CHARCLASS_ALNUM(),
1437 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1438 !isALNUM(*s)
1439 );
bbce6d69 1440 case NALNUML:
4cadc6a9
YO
1441 REXEC_FBC_CSCAN_TAINT(
1442 !isALNUM_LC_utf8((U8*)s),
1443 !isALNUM_LC(*s)
1444 );
a0d0e21e 1445 case SPACE:
4cadc6a9
YO
1446 REXEC_FBC_CSCAN_PRELOAD(
1447 LOAD_UTF8_CHARCLASS_SPACE(),
1448 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1449 isSPACE(*s)
1450 );
bbce6d69 1451 case SPACEL:
4cadc6a9
YO
1452 REXEC_FBC_CSCAN_TAINT(
1453 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1454 isSPACE_LC(*s)
1455 );
a0d0e21e 1456 case NSPACE:
4cadc6a9
YO
1457 REXEC_FBC_CSCAN_PRELOAD(
1458 LOAD_UTF8_CHARCLASS_SPACE(),
1459 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1460 !isSPACE(*s)
1461 );
bbce6d69 1462 case NSPACEL:
4cadc6a9
YO
1463 REXEC_FBC_CSCAN_TAINT(
1464 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1465 !isSPACE_LC(*s)
1466 );
a0d0e21e 1467 case DIGIT:
4cadc6a9
YO
1468 REXEC_FBC_CSCAN_PRELOAD(
1469 LOAD_UTF8_CHARCLASS_DIGIT(),
1470 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1471 isDIGIT(*s)
1472 );
b8c5462f 1473 case DIGITL:
4cadc6a9
YO
1474 REXEC_FBC_CSCAN_TAINT(
1475 isDIGIT_LC_utf8((U8*)s),
1476 isDIGIT_LC(*s)
1477 );
a0d0e21e 1478 case NDIGIT:
4cadc6a9
YO
1479 REXEC_FBC_CSCAN_PRELOAD(
1480 LOAD_UTF8_CHARCLASS_DIGIT(),
1481 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1482 !isDIGIT(*s)
1483 );
b8c5462f 1484 case NDIGITL:
4cadc6a9
YO
1485 REXEC_FBC_CSCAN_TAINT(
1486 !isDIGIT_LC_utf8((U8*)s),
1487 !isDIGIT_LC(*s)
1488 );
e1d1eefb
YO
1489 case LNBREAK:
1490 REXEC_FBC_CSCAN(
1491 is_LNBREAK_utf8(s),
1492 is_LNBREAK_latin1(s)
1493 );
1494 case VERTWS:
1495 REXEC_FBC_CSCAN(
1496 is_VERTWS_utf8(s),
1497 is_VERTWS_latin1(s)
1498 );
1499 case NVERTWS:
1500 REXEC_FBC_CSCAN(
1501 !is_VERTWS_utf8(s),
1502 !is_VERTWS_latin1(s)
1503 );
1504 case HORIZWS:
1505 REXEC_FBC_CSCAN(
1506 is_HORIZWS_utf8(s),
1507 is_HORIZWS_latin1(s)
1508 );
1509 case NHORIZWS:
1510 REXEC_FBC_CSCAN(
1511 !is_HORIZWS_utf8(s),
1512 !is_HORIZWS_latin1(s)
1513 );
1de06328
YO
1514 case AHOCORASICKC:
1515 case AHOCORASICK:
07be1b83 1516 {
a0a388a1 1517 DECL_TRIE_TYPE(c);
07be1b83
YO
1518 /* what trie are we using right now */
1519 reg_ac_data *aho
f8fc2ecf 1520 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1521 reg_trie_data *trie
1522 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1523 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1524
1525 const char *last_start = strend - trie->minlen;
6148ee25 1526#ifdef DEBUGGING
07be1b83 1527 const char *real_start = s;
6148ee25 1528#endif
07be1b83 1529 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1530 SV *sv_points;
1531 U8 **points; /* map of where we were in the input string
786e8c11 1532 when reading a given char. For ASCII this
be8e71aa 1533 is unnecessary overhead as the relationship
38a44b82
NC
1534 is always 1:1, but for Unicode, especially
1535 case folded Unicode this is not true. */
f9e705e8 1536 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1537 U8 *bitmap=NULL;
1538
07be1b83
YO
1539
1540 GET_RE_DEBUG_FLAGS_DECL;
1541
be8e71aa
YO
1542 /* We can't just allocate points here. We need to wrap it in
1543 * an SV so it gets freed properly if there is a croak while
1544 * running the match */
1545 ENTER;
1546 SAVETMPS;
1547 sv_points=newSV(maxlen * sizeof(U8 *));
1548 SvCUR_set(sv_points,
1549 maxlen * sizeof(U8 *));
1550 SvPOK_on(sv_points);
1551 sv_2mortal(sv_points);
1552 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1553 if ( trie_type != trie_utf8_fold
1554 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1555 {
786e8c11
YO
1556 if (trie->bitmap)
1557 bitmap=(U8*)trie->bitmap;
1558 else
1559 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1560 }
786e8c11
YO
1561 /* this is the Aho-Corasick algorithm modified a touch
1562 to include special handling for long "unknown char"
1563 sequences. The basic idea being that we use AC as long
1564 as we are dealing with a possible matching char, when
1565 we encounter an unknown char (and we have not encountered
1566 an accepting state) we scan forward until we find a legal
1567 starting char.
1568 AC matching is basically that of trie matching, except
1569 that when we encounter a failing transition, we fall back
1570 to the current states "fail state", and try the current char
1571 again, a process we repeat until we reach the root state,
1572 state 1, or a legal transition. If we fail on the root state
1573 then we can either terminate if we have reached an accepting
1574 state previously, or restart the entire process from the beginning
1575 if we have not.
1576
1577 */
07be1b83
YO
1578 while (s <= last_start) {
1579 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1580 U8 *uc = (U8*)s;
1581 U16 charid = 0;
1582 U32 base = 1;
1583 U32 state = 1;
1584 UV uvc = 0;
1585 STRLEN len = 0;
1586 STRLEN foldlen = 0;
1587 U8 *uscan = (U8*)NULL;
1588 U8 *leftmost = NULL;
786e8c11
YO
1589#ifdef DEBUGGING
1590 U32 accepted_word= 0;
1591#endif
07be1b83
YO
1592 U32 pointpos = 0;
1593
1594 while ( state && uc <= (U8*)strend ) {
1595 int failed=0;
786e8c11
YO
1596 U32 word = aho->states[ state ].wordnum;
1597
1de06328
YO
1598 if( state==1 ) {
1599 if ( bitmap ) {
1600 DEBUG_TRIE_EXECUTE_r(
1601 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1602 dump_exec_pos( (char *)uc, c, strend, real_start,
1603 (char *)uc, do_utf8 );
1604 PerlIO_printf( Perl_debug_log,
1605 " Scanning for legal start char...\n");
1606 }
1607 );
1608 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1609 uc++;
786e8c11 1610 }
1de06328 1611 s= (char *)uc;
786e8c11 1612 }
786e8c11
YO
1613 if (uc >(U8*)last_start) break;
1614 }
1615
1616 if ( word ) {
1617 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1618 if (!leftmost || lpos < leftmost) {
1619 DEBUG_r(accepted_word=word);
07be1b83 1620 leftmost= lpos;
786e8c11 1621 }
07be1b83 1622 if (base==0) break;
786e8c11 1623
07be1b83
YO
1624 }
1625 points[pointpos++ % maxlen]= uc;
55eed653
NC
1626 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1627 uscan, len, uvc, charid, foldlen,
1628 foldbuf, uniflags);
786e8c11
YO
1629 DEBUG_TRIE_EXECUTE_r({
1630 dump_exec_pos( (char *)uc, c, strend, real_start,
1631 s, do_utf8 );
07be1b83 1632 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1633 " Charid:%3u CP:%4"UVxf" ",
1634 charid, uvc);
1635 });
07be1b83
YO
1636
1637 do {
6148ee25 1638#ifdef DEBUGGING
786e8c11 1639 word = aho->states[ state ].wordnum;
6148ee25 1640#endif
07be1b83
YO
1641 base = aho->states[ state ].trans.base;
1642
786e8c11
YO
1643 DEBUG_TRIE_EXECUTE_r({
1644 if (failed)
1645 dump_exec_pos( (char *)uc, c, strend, real_start,
1646 s, do_utf8 );
07be1b83 1647 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1648 "%sState: %4"UVxf", word=%"UVxf,
1649 failed ? " Fail transition to " : "",
1650 (UV)state, (UV)word);
1651 });
07be1b83
YO
1652 if ( base ) {
1653 U32 tmp;
1654 if (charid &&
1655 (base + charid > trie->uniquecharcount )
1656 && (base + charid - 1 - trie->uniquecharcount
1657 < trie->lasttrans)
1658 && trie->trans[base + charid - 1 -
1659 trie->uniquecharcount].check == state
1660 && (tmp=trie->trans[base + charid - 1 -
1661 trie->uniquecharcount ].next))
1662 {
786e8c11
YO
1663 DEBUG_TRIE_EXECUTE_r(
1664 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1665 state = tmp;
1666 break;
1667 }
1668 else {
786e8c11
YO
1669 DEBUG_TRIE_EXECUTE_r(
1670 PerlIO_printf( Perl_debug_log," - fail\n"));
1671 failed = 1;
1672 state = aho->fail[state];
07be1b83
YO
1673 }
1674 }
1675 else {
1676 /* we must be accepting here */
786e8c11
YO
1677 DEBUG_TRIE_EXECUTE_r(
1678 PerlIO_printf( Perl_debug_log," - accepting\n"));
1679 failed = 1;
07be1b83
YO
1680 break;
1681 }
1682 } while(state);
786e8c11 1683 uc += len;
07be1b83
YO
1684 if (failed) {
1685 if (leftmost)
1686 break;
786e8c11 1687 if (!state) state = 1;
07be1b83
YO
1688 }
1689 }
1690 if ( aho->states[ state ].wordnum ) {
1691 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1692 if (!leftmost || lpos < leftmost) {
1693 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1694 leftmost = lpos;
786e8c11 1695 }
07be1b83 1696 }
07be1b83
YO
1697 if (leftmost) {
1698 s = (char*)leftmost;
786e8c11
YO
1699 DEBUG_TRIE_EXECUTE_r({
1700 PerlIO_printf(
70685ca0
JH
1701 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1702 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1703 );
1704 });
24b23f37 1705 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1706 FREETMPS;
1707 LEAVE;
07be1b83 1708 goto got_it;
be8e71aa 1709 }
07be1b83 1710 s = HOPc(s,1);
786e8c11
YO
1711 DEBUG_TRIE_EXECUTE_r({
1712 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1713 });
07be1b83 1714 } else {
786e8c11
YO
1715 DEBUG_TRIE_EXECUTE_r(
1716 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1717 break;
1718 }
1719 }
be8e71aa
YO
1720 FREETMPS;
1721 LEAVE;
07be1b83
YO
1722 }
1723 break;
b3c9acc1 1724 default:
3c3eec57
GS
1725 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1726 break;
d6a28714 1727 }
6eb5f6b9
JH
1728 return 0;
1729 got_it:
1730 return s;
1731}
1732
e6aad7ab 1733static void
7918f24d
NC
1734S_swap_match_buff (pTHX_ regexp *prog)
1735{
f0ab9afb 1736 regexp_paren_pair *t;
fae667d5 1737
7918f24d
NC
1738 PERL_ARGS_ASSERT_SWAP_MATCH_BUFF;
1739
28d8d7f4 1740 if (!prog->swap) {
fae667d5
YO
1741 /* We have to be careful. If the previous successful match
1742 was from this regex we don't want a subsequent paritally
1743 successful match to clobber the old results.
1744 So when we detect this possibility we add a swap buffer
1745 to the re, and switch the buffer each match. If we fail
1746 we switch it back, otherwise we leave it swapped.
1747 */
f0ab9afb 1748 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
fae667d5 1749 }
f0ab9afb
NC
1750 t = prog->swap;
1751 prog->swap = prog->offs;
1752 prog->offs = t;
fae667d5
YO
1753}
1754
1755
6eb5f6b9
JH
1756/*
1757 - regexec_flags - match a regexp against a string
1758 */
1759I32
288b8c02 1760Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1761 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1762/* strend: pointer to null at end of string */
1763/* strbeg: real beginning of string */
1764/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1765/* data: May be used for some additional optimizations.
1766 Currently its only used, with a U32 cast, for transmitting
1767 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1768/* nosave: For optimizations. */
1769{
97aff369 1770 dVAR;
288b8c02 1771 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1772 /*register*/ char *s;
6eb5f6b9 1773 register regnode *c;
24b23f37 1774 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1775 I32 minlen; /* must match at least this many chars */
1776 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1777 I32 end_shift = 0; /* Same for the end. */ /* CC */
1778 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1779 char *scream_olds = NULL;
f9f4320a 1780 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1781 I32 multiline;
f8fc2ecf 1782 RXi_GET_DECL(prog,progi);
3b0527fe 1783 regmatch_info reginfo; /* create some info to pass to regtry etc */
fae667d5 1784 bool swap_on_fail = 0;
a3621e74
YO
1785 GET_RE_DEBUG_FLAGS_DECL;
1786
7918f24d 1787 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1788 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1789
1790 /* Be paranoid... */
1791 if (prog == NULL || startpos == NULL) {
1792 Perl_croak(aTHX_ "NULL regexp parameter");
1793 return 0;
1794 }
1795
bbe252da 1796 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1797 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1798
288b8c02 1799 RX_MATCH_UTF8_set(rx, do_utf8);
1de06328 1800 DEBUG_EXECUTE_r(
efd26800 1801 debug_start_match(rx, do_utf8, startpos, strend,
1de06328
YO
1802 "Matching");
1803 );
bac06658 1804
6eb5f6b9 1805 minlen = prog->minlen;
1de06328
YO
1806
1807 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1808 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1809 "String too short [regexec_flags]...\n"));
1810 goto phooey;
1aa99e6b 1811 }
6eb5f6b9 1812
1de06328 1813
6eb5f6b9 1814 /* Check validity of program. */
f8fc2ecf 1815 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1816 Perl_croak(aTHX_ "corrupted regexp program");
1817 }
1818
1819 PL_reg_flags = 0;
1820 PL_reg_eval_set = 0;
1821 PL_reg_maxiter = 0;
1822
3c8556c3 1823 if (RX_UTF8(rx))
6eb5f6b9
JH
1824 PL_reg_flags |= RF_utf8;
1825
1826 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1827 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1828 PL_bostr = strbeg;
3b0527fe 1829 reginfo.sv = sv;
6eb5f6b9
JH
1830
1831 /* Mark end of line for $ (and such) */
1832 PL_regeol = strend;
1833
1834 /* see how far we have to get to not match where we matched before */
3b0527fe 1835 reginfo.till = startpos+minend;
6eb5f6b9 1836
6eb5f6b9
JH
1837 /* If there is a "must appear" string, look for it. */
1838 s = startpos;
1839
bbe252da 1840 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1841 MAGIC *mg;
1842
1843 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
58e23c8d 1844 reginfo.ganch = startpos + prog->gofs;
6eb5f6b9
JH
1845 else if (sv && SvTYPE(sv) >= SVt_PVMG
1846 && SvMAGIC(sv)
14befaf4
DM
1847 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1848 && mg->mg_len >= 0) {
3b0527fe 1849 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
bbe252da 1850 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1851 if (s > reginfo.ganch)
6eb5f6b9 1852 goto phooey;
58e23c8d 1853 s = reginfo.ganch - prog->gofs;
6eb5f6b9
JH
1854 }
1855 }
58e23c8d 1856 else if (data) {
70685ca0 1857 reginfo.ganch = strbeg + PTR2UV(data);
58e23c8d 1858 } else /* pos() not defined */
3b0527fe 1859 reginfo.ganch = strbeg;
6eb5f6b9 1860 }
288b8c02 1861 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
fae667d5
YO
1862 swap_on_fail = 1;
1863 swap_match_buff(prog); /* do we need a save destructor here for
1864 eval dies? */
c74340f9 1865 }
a0714e2c 1866 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1867 re_scream_pos_data d;
1868
1869 d.scream_olds = &scream_olds;
1870 d.scream_pos = &scream_pos;
288b8c02 1871 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 1872 if (!s) {
a3621e74 1873 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1874 goto phooey; /* not present */
3fa9c3d7 1875 }
6eb5f6b9
JH
1876 }
1877
1de06328 1878
6eb5f6b9
JH
1879
1880 /* Simplest case: anchored match need be tried only once. */
1881 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1882 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1883 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1884 goto got_it;
bbe252da
YO
1885 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1886 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1887 {
1888 char *end;
1889
1890 if (minlen)
1891 dontbother = minlen - 1;
1aa99e6b 1892 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1893 /* for multiline we only have to try after newlines */
33b8afdf 1894 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1895 if (s == startpos)
1896 goto after_try;
1897 while (1) {
24b23f37 1898 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1899 goto got_it;
1900 after_try:
5339e136 1901 if (s > end)
6eb5f6b9 1902 goto phooey;
bbe252da 1903 if (prog->extflags & RXf_USE_INTUIT) {
288b8c02 1904 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
6eb5f6b9
JH
1905 if (!s)
1906 goto phooey;
1907 }
1908 else
1909 s++;
1910 }
1911 } else {
1912 if (s > startpos)
1913 s--;
1914 while (s < end) {
1915 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 1916 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1917 goto got_it;
1918 }
1919 }
1920 }
1921 }
1922 goto phooey;
bbe252da 1923 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
1924 {
1925 /* the warning about reginfo.ganch being used without intialization
bbe252da 1926 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 1927 and we only enter this block when the same bit is set. */
58e23c8d
YO
1928 char *tmp_s = reginfo.ganch - prog->gofs;
1929 if (regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
1930 goto got_it;
1931 goto phooey;
1932 }
1933
1934 /* Messy cases: unanchored match. */
bbe252da 1935 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
1936 /* we have /x+whatever/ */
1937 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1938 char ch;
bf93d4cc
GS
1939#ifdef DEBUGGING
1940 int did_match = 0;
1941#endif
33b8afdf
JH
1942 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1943 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1944 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1945
1aa99e6b 1946 if (do_utf8) {
4cadc6a9 1947 REXEC_FBC_SCAN(
6eb5f6b9 1948 if (*s == ch) {
a3621e74 1949 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1950 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1951 s += UTF8SKIP(s);
1952 while (s < strend && *s == ch)
1953 s += UTF8SKIP(s);
1954 }
4cadc6a9 1955 );
6eb5f6b9
JH
1956 }
1957 else {
4cadc6a9 1958 REXEC_FBC_SCAN(
6eb5f6b9 1959 if (*s == ch) {
a3621e74 1960 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1961 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1962 s++;
1963 while (s < strend && *s == ch)
1964 s++;
1965 }
4cadc6a9 1966 );
6eb5f6b9 1967 }
a3621e74 1968 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1969 PerlIO_printf(Perl_debug_log,
b7953727
JH
1970 "Did not find anchored character...\n")
1971 );
6eb5f6b9 1972 }
a0714e2c
SS
1973 else if (prog->anchored_substr != NULL
1974 || prog->anchored_utf8 != NULL
1975 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1976 && prog->float_max_offset < strend - s)) {
1977 SV *must;
1978 I32 back_max;
1979 I32 back_min;
1980 char *last;
6eb5f6b9 1981 char *last1; /* Last position checked before */
bf93d4cc
GS
1982#ifdef DEBUGGING
1983 int did_match = 0;
1984#endif
33b8afdf
JH
1985 if (prog->anchored_substr || prog->anchored_utf8) {
1986 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1987 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1988 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1989 back_max = back_min = prog->anchored_offset;
1990 } else {
1991 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1992 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1993 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1994 back_max = prog->float_max_offset;
1995 back_min = prog->float_min_offset;
1996 }
1de06328
YO
1997
1998
33b8afdf
JH
1999 if (must == &PL_sv_undef)
2000 /* could not downgrade utf8 check substring, so must fail */
2001 goto phooey;
2002
1de06328
YO
2003 if (back_min<0) {
2004 last = strend;
2005 } else {
2006 last = HOP3c(strend, /* Cannot start after this */
2007 -(I32)(CHR_SVLEN(must)
2008 - (SvTAIL(must) != 0) + back_min), strbeg);
2009 }
6eb5f6b9
JH
2010 if (s > PL_bostr)
2011 last1 = HOPc(s, -1);
2012 else
2013 last1 = s - 1; /* bogus */
2014
a0288114 2015 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2016 check_substr==must. */
2017 scream_pos = -1;
2018 dontbother = end_shift;
2019 strend = HOPc(strend, -dontbother);
2020 while ( (s <= last) &&
9041c2e3 2021 ((flags & REXEC_SCREAM)
1de06328 2022 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2023 end_shift, &scream_pos, 0))
1de06328 2024 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2025 (unsigned char*)strend, must,
7fba1cd6 2026 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2027 /* we may be pointing at the wrong string */
07bc277f 2028 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2029 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2030 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2031 if (HOPc(s, -back_max) > last1) {
2032 last1 = HOPc(s, -back_min);
2033 s = HOPc(s, -back_max);
2034 }
2035 else {
52657f30 2036 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2037
2038 last1 = HOPc(s, -back_min);
52657f30 2039 s = t;
6eb5f6b9 2040 }
1aa99e6b 2041 if (do_utf8) {
6eb5f6b9 2042 while (s <= last1) {
24b23f37 2043 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2044 goto got_it;
2045 s += UTF8SKIP(s);
2046 }
2047 }
2048 else {
2049 while (s <= last1) {
24b23f37 2050 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2051 goto got_it;
2052 s++;
2053 }
2054 }
2055 }
ab3bbdeb
YO
2056 DEBUG_EXECUTE_r(if (!did_match) {
2057 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2058 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2059 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2060 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2061 ? "anchored" : "floating"),
ab3bbdeb
YO
2062 quoted, RE_SV_TAIL(must));
2063 });
6eb5f6b9
JH
2064 goto phooey;
2065 }
f8fc2ecf 2066 else if ( (c = progi->regstclass) ) {
f14c76ed 2067 if (minlen) {
f8fc2ecf 2068 const OPCODE op = OP(progi->regstclass);
66e933ab 2069 /* don't bother with what can't match */
786e8c11 2070 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2071 strend = HOPc(strend, -(minlen - 1));
2072 }
a3621e74 2073 DEBUG_EXECUTE_r({
be8e71aa 2074 SV * const prop = sv_newmortal();
32fc9b6a 2075 regprop(prog, prop, c);
0df25f3d 2076 {
02daf0ab 2077 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2078 s,strend-s,60);
0df25f3d 2079 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2080 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2081 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2082 quoted, (int)(strend - s));
0df25f3d 2083 }
ffc61ed2 2084 });
3b0527fe 2085 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2086 goto got_it;
07be1b83 2087 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2088 }
2089 else {
2090 dontbother = 0;
a0714e2c 2091 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2092 /* Trim the end. */
d6a28714 2093 char *last;
33b8afdf
JH
2094 SV* float_real;
2095
2096 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2097 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2098 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2099
2100 if (flags & REXEC_SCREAM) {
33b8afdf 2101 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2102 end_shift, &scream_pos, 1); /* last one */
2103 if (!last)
ffc61ed2 2104 last = scream_olds; /* Only one occurrence. */
4addbd3b 2105 /* we may be pointing at the wrong string */
07bc277f 2106 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2107 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2108 }
d6a28714
JH
2109 else {
2110 STRLEN len;
cfd0369c 2111 const char * const little = SvPV_const(float_real, len);
d6a28714 2112
33b8afdf 2113 if (SvTAIL(float_real)) {
d6a28714
JH
2114 if (memEQ(strend - len + 1, little, len - 1))
2115 last = strend - len + 1;
7fba1cd6 2116 else if (!multiline)
9041c2e3 2117 last = memEQ(strend - len, little, len)
bd61b366 2118 ? strend - len : NULL;
b8c5462f 2119 else
d6a28714
JH
2120 goto find_last;
2121 } else {
2122 find_last:
9041c2e3 2123 if (len)
d6a28714 2124 last = rninstr(s, strend, little, little + len);
b8c5462f 2125 else
a0288114 2126 last = strend; /* matching "$" */
b8c5462f 2127 }
b8c5462f 2128 }
bf93d4cc 2129 if (last == NULL) {
6bda09f9
YO
2130 DEBUG_EXECUTE_r(
2131 PerlIO_printf(Perl_debug_log,
2132 "%sCan't trim the tail, match fails (should not happen)%s\n",
2133 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2134 goto phooey; /* Should not happen! */
2135 }
d6a28714
JH
2136 dontbother = strend - last + prog->float_min_offset;
2137 }
2138 if (minlen && (dontbother < minlen))
2139 dontbother = minlen - 1;
2140 strend -= dontbother; /* this one's always in bytes! */
2141 /* We don't know much -- general case. */
1aa99e6b 2142 if (do_utf8) {
d6a28714 2143 for (;;) {
24b23f37 2144 if (regtry(&reginfo, &s))
d6a28714
JH
2145 goto got_it;
2146 if (s >= strend)
2147 break;
b8c5462f 2148 s += UTF8SKIP(s);
d6a28714
JH
2149 };
2150 }
2151 else {
2152 do {
24b23f37 2153 if (regtry(&reginfo, &s))
d6a28714
JH
2154 goto got_it;
2155 } while (s++ < strend);
2156 }
2157 }
2158
2159 /* Failure. */
2160 goto phooey;
2161
2162got_it:
288b8c02 2163 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2164
19b95bf0 2165 if (PL_reg_eval_set)
4f639d21 2166 restore_pos(aTHX_ prog);
5daac39c
NC
2167 if (RXp_PAREN_NAMES(prog))
2168 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2169
2170 /* make sure $`, $&, $', and $digit will work later */
2171 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2172 RX_MATCH_COPY_FREE(rx);
d6a28714 2173 if (flags & REXEC_COPY_STR) {
be8e71aa 2174 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2175#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2176 if ((SvIsCOW(sv)
2177 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2178 if (DEBUG_C_TEST) {
2179 PerlIO_printf(Perl_debug_log,
2180 "Copy on write: regexp capture, type %d\n",
2181 (int) SvTYPE(sv));
2182 }
2183 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2184 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2185 assert (SvPOKp(prog->saved_copy));
2186 } else
2187#endif
2188 {
288b8c02 2189 RX_MATCH_COPIED_on(rx);
ed252734
NC
2190 s = savepvn(strbeg, i);
2191 prog->subbeg = s;
2192 }
d6a28714 2193 prog->sublen = i;
d6a28714
JH
2194 }
2195 else {
2196 prog->subbeg = strbeg;
2197 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2198 }
2199 }
9041c2e3 2200
d6a28714
JH
2201 return 1;
2202
2203phooey:
a3621e74 2204 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2205 PL_colors[4], PL_colors[5]));
d6a28714 2206 if (PL_reg_eval_set)
4f639d21 2207 restore_pos(aTHX_ prog);
fae667d5 2208 if (swap_on_fail)
c74340f9 2209 /* we failed :-( roll it back */
fae667d5
YO
2210 swap_match_buff(prog);
2211
d6a28714
JH
2212 return 0;
2213}
2214
6bda09f9 2215
d6a28714
JH
2216/*
2217 - regtry - try match at specific point
2218 */
2219STATIC I32 /* 0 failure, 1 success */
24b23f37 2220S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2221{
97aff369 2222 dVAR;
d6a28714 2223 CHECKPOINT lastcp;
288b8c02
NC
2224 REGEXP *const rx = reginfo->prog;
2225 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2226 RXi_GET_DECL(prog,progi);
a3621e74 2227 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2228
2229 PERL_ARGS_ASSERT_REGTRY;
2230
24b23f37 2231 reginfo->cutpoint=NULL;
d6a28714 2232
bbe252da 2233 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2234 MAGIC *mg;
2235
2236 PL_reg_eval_set = RS_init;
a3621e74 2237 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2238 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2239 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2240 ));
ea8d6ae1 2241 SAVESTACK_CXPOS();
d6a28714
JH
2242 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2243 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2244 SAVETMPS;
2245 /* Apparently this is not needed, judging by wantarray. */
e8347627 2246 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2247 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2248
3b0527fe 2249 if (reginfo->sv) {
d6a28714 2250 /* Make $_ available to executed code. */
3b0527fe 2251 if (reginfo->sv != DEFSV) {
59f00321 2252 SAVE_DEFSV;
3b0527fe 2253 DEFSV = reginfo->sv;
b8c5462f 2254 }
d6a28714 2255
3b0527fe
DM
2256 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2257 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2258 /* prepare for quick setting of pos */
d300d9fa 2259#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2260 if (SvIsCOW(reginfo->sv))
2261 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2262#endif
3dab1dad 2263 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2264 &PL_vtbl_mglob, NULL, 0);
d6a28714 2265 mg->mg_len = -1;
b8c5462f 2266 }
d6a28714
JH
2267 PL_reg_magic = mg;
2268 PL_reg_oldpos = mg->mg_len;
4f639d21 2269 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2270 }
09687e5a 2271 if (!PL_reg_curpm) {
a02a5408 2272 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2273#ifdef USE_ITHREADS
2274 {
14a49a24 2275 SV* const repointer = &PL_sv_undef;
92313705
NC
2276 /* this regexp is also owned by the new PL_reg_curpm, which
2277 will try to free it. */
d2ece331 2278 av_push(PL_regex_padav, repointer);
09687e5a
AB
2279 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2280 PL_regex_pad = AvARRAY(PL_regex_padav);
2281 }
2282#endif
2283 }
86c29d75
NC
2284#ifdef USE_ITHREADS
2285 /* It seems that non-ithreads works both with and without this code.
2286 So for efficiency reasons it seems best not to have the code
2287 compiled when it is not needed. */
92313705
NC
2288 /* This is safe against NULLs: */
2289 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2290 /* PM_reg_curpm owns a reference to this regexp. */
2291 ReREFCNT_inc(rx);
86c29d75 2292#endif
288b8c02 2293 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2294 PL_reg_oldcurpm = PL_curpm;
2295 PL_curpm = PL_reg_curpm;
07bc277f 2296 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2297 /* Here is a serious problem: we cannot rewrite subbeg,
2298 since it may be needed if this match fails. Thus
2299 $` inside (?{}) could fail... */
2300 PL_reg_oldsaved = prog->subbeg;
2301 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2302#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2303 PL_nrs = prog->saved_copy;
2304#endif
07bc277f 2305 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2306 }
2307 else
bd61b366 2308 PL_reg_oldsaved = NULL;
d6a28714
JH
2309 prog->subbeg = PL_bostr;
2310 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2311 }
24b23f37 2312 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2313 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2314 PL_reginput = *startpos;
d6a28714 2315 PL_reglastparen = &prog->lastparen;
a01268b5 2316 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2317 prog->lastparen = 0;
03994de8 2318 prog->lastcloseparen = 0;
d6a28714 2319 PL_regsize = 0;
f0ab9afb 2320 PL_regoffs = prog->offs;
d6a28714
JH
2321 if (PL_reg_start_tmpl <= prog->nparens) {
2322 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2323 if(PL_reg_start_tmp)
2324 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2325 else
a02a5408 2326 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2327 }
2328
2329 /* XXXX What this code is doing here?!!! There should be no need
2330 to do this again and again, PL_reglastparen should take care of
3dd2943c 2331 this! --ilya*/
dafc8851
JH
2332
2333 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2334 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2335 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2336 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2337 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2338 * Meanwhile, this code *is* needed for the
daf18116
JH
2339 * above-mentioned test suite tests to succeed. The common theme
2340 * on those tests seems to be returning null fields from matches.
225593e1 2341 * --jhi updated by dapm */
dafc8851 2342#if 1
d6a28714 2343 if (prog->nparens) {
f0ab9afb 2344 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2345 register I32 i;
eb160463 2346 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2347 ++pp;
2348 pp->start = -1;
2349 pp->end = -1;
d6a28714
JH
2350 }
2351 }
dafc8851 2352#endif
02db2b7b 2353 REGCP_SET(lastcp);
f8fc2ecf 2354 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2355 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2356 return 1;
2357 }
24b23f37
YO
2358 if (reginfo->cutpoint)
2359 *startpos= reginfo->cutpoint;
02db2b7b 2360 REGCP_UNWIND(lastcp);
d6a28714
JH
2361 return 0;
2362}
2363
02db2b7b 2364
8ba1375e
MJD
2365#define sayYES goto yes
2366#define sayNO goto no
262b90c4 2367#define sayNO_SILENT goto no_silent
8ba1375e 2368
f9f4320a
YO
2369/* we dont use STMT_START/END here because it leads to
2370 "unreachable code" warnings, which are bogus, but distracting. */
2371#define CACHEsayNO \
c476f425
DM
2372 if (ST.cache_mask) \
2373 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2374 sayNO
3298f257 2375
a3621e74 2376/* this is used to determine how far from the left messages like
265c4333
YO
2377 'failed...' are printed. It should be set such that messages
2378 are inline with the regop output that created them.
a3621e74 2379*/
265c4333 2380#define REPORT_CODE_OFF 32
a3621e74
YO
2381
2382
2383/* Make sure there is a test for this +1 options in re_tests */
2384#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2385
40a82448
DM
2386#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2387#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2388
86545054
DM
2389#define SLAB_FIRST(s) (&(s)->states[0])
2390#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2391
5d9a96ca
DM
2392/* grab a new slab and return the first slot in it */
2393
2394STATIC regmatch_state *
2395S_push_slab(pTHX)
2396{
a35a87e7 2397#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2398 dMY_CXT;
2399#endif
5d9a96ca
DM
2400 regmatch_slab *s = PL_regmatch_slab->next;
2401 if (!s) {
2402 Newx(s, 1, regmatch_slab);
2403 s->prev = PL_regmatch_slab;
2404 s->next = NULL;
2405 PL_regmatch_slab->next = s;
2406 }
2407 PL_regmatch_slab = s;
86545054 2408 return SLAB_FIRST(s);
5d9a96ca 2409}
5b47454d 2410
95b24440 2411
40a82448
DM
2412/* push a new state then goto it */
2413
2414#define PUSH_STATE_GOTO(state, node) \
2415 scan = node; \
2416 st->resume_state = state; \
2417 goto push_state;
2418
2419/* push a new state with success backtracking, then goto it */
2420
2421#define PUSH_YES_STATE_GOTO(state, node) \
2422 scan = node; \
2423 st->resume_state = state; \
2424 goto push_yes_state;
2425
aa283a38 2426
aa283a38 2427
d6a28714 2428/*
95b24440 2429
bf1f174e
DM
2430regmatch() - main matching routine
2431
2432This is basically one big switch statement in a loop. We execute an op,
2433set 'next' to point the next op, and continue. If we come to a point which
2434we may need to backtrack to on failure such as (A|B|C), we push a
2435backtrack state onto the backtrack stack. On failure, we pop the top
2436state, and re-enter the loop at the state indicated. If there are no more
2437states to pop, we return failure.
2438
2439Sometimes we also need to backtrack on success; for example /A+/, where
2440after successfully matching one A, we need to go back and try to
2441match another one; similarly for lookahead assertions: if the assertion
2442completes successfully, we backtrack to the state just before the assertion
2443and then carry on. In these cases, the pushed state is marked as
2444'backtrack on success too'. This marking is in fact done by a chain of
2445pointers, each pointing to the previous 'yes' state. On success, we pop to
2446the nearest yes state, discarding any intermediate failure-only states.
2447Sometimes a yes state is pushed just to force some cleanup code to be
2448called at the end of a successful match or submatch; e.g. (??{$re}) uses
2449it to free the inner regex.
2450
2451Note that failure backtracking rewinds the cursor position, while
2452success backtracking leaves it alone.
2453
2454A pattern is complete when the END op is executed, while a subpattern
2455such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2456ops trigger the "pop to last yes state if any, otherwise return true"
2457behaviour.
2458
2459A common convention in this function is to use A and B to refer to the two
2460subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2461the subpattern to be matched possibly multiple times, while B is the entire
2462rest of the pattern. Variable and state names reflect this convention.
2463
2464The states in the main switch are the union of ops and failure/success of
2465substates associated with with that op. For example, IFMATCH is the op
2466that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2467'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2468successfully matched A and IFMATCH_A_fail is a state saying that we have
2469just failed to match A. Resume states always come in pairs. The backtrack
2470state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2471at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2472on success or failure.
2473
2474The struct that holds a backtracking state is actually a big union, with
2475one variant for each major type of op. The variable st points to the
2476top-most backtrack struct. To make the code clearer, within each
2477block of code we #define ST to alias the relevant union.
2478
2479Here's a concrete example of a (vastly oversimplified) IFMATCH
2480implementation:
2481
2482 switch (state) {
2483 ....
2484
2485#define ST st->u.ifmatch
2486
2487 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2488 ST.foo = ...; // some state we wish to save
95b24440 2489 ...
bf1f174e
DM
2490 // push a yes backtrack state with a resume value of
2491 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2492 // first node of A:
2493 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2494 // NOTREACHED
2495
2496 case IFMATCH_A: // we have successfully executed A; now continue with B
2497 next = B;
2498 bar = ST.foo; // do something with the preserved value
2499 break;
2500
2501 case IFMATCH_A_fail: // A failed, so the assertion failed
2502 ...; // do some housekeeping, then ...
2503 sayNO; // propagate the failure
2504
2505#undef ST
95b24440 2506
bf1f174e
DM
2507 ...
2508 }
95b24440 2509
bf1f174e
DM
2510For any old-timers reading this who are familiar with the old recursive
2511approach, the code above is equivalent to:
95b24440 2512
bf1f174e
DM
2513 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2514 {
2515 int foo = ...
95b24440 2516 ...
bf1f174e
DM
2517 if (regmatch(A)) {
2518 next = B;
2519 bar = foo;
2520 break;
95b24440 2521 }
bf1f174e
DM
2522 ...; // do some housekeeping, then ...
2523 sayNO; // propagate the failure
95b24440 2524 }
bf1f174e
DM
2525
2526The topmost backtrack state, pointed to by st, is usually free. If you
2527want to claim it, populate any ST.foo fields in it with values you wish to
2528save, then do one of
2529
2530 PUSH_STATE_GOTO(resume_state, node);
2531 PUSH_YES_STATE_GOTO(resume_state, node);
2532
2533which sets that backtrack state's resume value to 'resume_state', pushes a
2534new free entry to the top of the backtrack stack, then goes to 'node'.
2535On backtracking, the free slot is popped, and the saved state becomes the
2536new free state. An ST.foo field in this new top state can be temporarily
2537accessed to retrieve values, but once the main loop is re-entered, it
2538becomes available for reuse.
2539
2540Note that the depth of the backtrack stack constantly increases during the
2541left-to-right execution of the pattern, rather than going up and down with
2542the pattern nesting. For example the stack is at its maximum at Z at the
2543end of the pattern, rather than at X in the following:
2544
2545 /(((X)+)+)+....(Y)+....Z/
2546
2547The only exceptions to this are lookahead/behind assertions and the cut,
2548(?>A), which pop all the backtrack states associated with A before
2549continuing.
2550
2551Bascktrack state structs are allocated in slabs of about 4K in size.
2552PL_regmatch_state and st always point to the currently active state,
2553and PL_regmatch_slab points to the slab currently containing
2554PL_regmatch_state. The first time regmatch() is called, the first slab is
2555allocated, and is never freed until interpreter destruction. When the slab
2556is full, a new one is allocated and chained to the end. At exit from
2557regmatch(), slabs allocated since entry are freed.
2558
2559*/
95b24440 2560
40a82448 2561
5bc10b2c 2562#define DEBUG_STATE_pp(pp) \
265c4333 2563 DEBUG_STATE_r({ \
5bc10b2c
DM
2564 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2565 PerlIO_printf(Perl_debug_log, \
5d458dd8 2566 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2567 depth*2, "", \
13d6edb4 2568 PL_reg_name[st->resume_state], \
5d458dd8
YO
2569 ((st==yes_state||st==mark_state) ? "[" : ""), \
2570 ((st==yes_state) ? "Y" : ""), \
2571 ((st==mark_state) ? "M" : ""), \
2572 ((st==yes_state||st==mark_state) ? "]" : "") \
2573 ); \
265c4333 2574 });
5bc10b2c 2575
40a82448 2576
3dab1dad 2577#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2578
3df15adc 2579#ifdef DEBUGGING
5bc10b2c 2580
ab3bbdeb 2581STATIC void
efd26800 2582S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
ab3bbdeb
YO
2583 const char *start, const char *end, const char *blurb)
2584{
efd26800 2585 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2586
2587 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2588
ab3bbdeb
YO
2589 if (!PL_colorset)
2590 reginitcolors();
2591 {
2592 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2593 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb
YO
2594
2595 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2596 start, end - start, 60);
2597
2598 PerlIO_printf(Perl_debug_log,
2599 "%s%s REx%s %s against %s\n",
2600 PL_colors[4], blurb, PL_colors[5], s0, s1);
2601
2602 if (do_utf8||utf8_pat)
1de06328
YO
2603 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2604 utf8_pat ? "pattern" : "",
2605 utf8_pat && do_utf8 ? " and " : "",
2606 do_utf8 ? "string" : ""
ab3bbdeb
YO
2607 );
2608 }
2609}
3df15adc
YO
2610
2611STATIC void
786e8c11
YO
2612S_dump_exec_pos(pTHX_ const char *locinput,
2613 const regnode *scan,
2614 const char *loc_regeol,
2615 const char *loc_bostr,
2616 const char *loc_reg_starttry,
2617 const bool do_utf8)
07be1b83 2618{
786e8c11 2619 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2620 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2621 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2622 /* The part of the string before starttry has one color
2623 (pref0_len chars), between starttry and current
2624 position another one (pref_len - pref0_len chars),
2625 after the current position the third one.
2626 We assume that pref0_len <= pref_len, otherwise we
2627 decrease pref0_len. */
786e8c11
YO
2628 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2629 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2630 int pref0_len;
2631
7918f24d
NC
2632 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2633
07be1b83
YO
2634 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2635 pref_len++;
786e8c11
YO
2636 pref0_len = pref_len - (locinput - loc_reg_starttry);
2637 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2638 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2639 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2640 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2641 l--;
2642 if (pref0_len < 0)
2643 pref0_len = 0;
2644 if (pref0_len > pref_len)
2645 pref0_len = pref_len;
2646 {
3df15adc 2647 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2648
ab3bbdeb 2649 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2650 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2651
ab3bbdeb 2652 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2653 (locinput - pref_len + pref0_len),
1de06328 2654 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2655
ab3bbdeb 2656 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2657 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2658
1de06328 2659 const STRLEN tlen=len0+len1+len2;
3df15adc 2660 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2661 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2662 (IV)(locinput - loc_bostr),
07be1b83 2663 len0, s0,
07be1b83 2664 len1, s1,
07be1b83 2665 (docolor ? "" : "> <"),
07be1b83 2666 len2, s2,
f9f4320a 2667 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2668 "");
2669 }
2670}
3df15adc 2671
07be1b83
YO
2672#endif
2673
0a4db386
YO
2674/* reg_check_named_buff_matched()
2675 * Checks to see if a named buffer has matched. The data array of
2676 * buffer numbers corresponding to the buffer is expected to reside
2677 * in the regexp->data->data array in the slot stored in the ARG() of
2678 * node involved. Note that this routine doesn't actually care about the
2679 * name, that information is not preserved from compilation to execution.
2680 * Returns the index of the leftmost defined buffer with the given name
2681 * or 0 if non of the buffers matched.
2682 */
2683STATIC I32
7918f24d
NC
2684S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2685{
0a4db386 2686 I32 n;
f8fc2ecf 2687 RXi_GET_DECL(rex,rexi);
ad64d0ec 2688 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2689 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2690
2691 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2692
0a4db386
YO
2693 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2694 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2695 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2696 {
2697 return nums[n];
2698 }
2699 }
2700 return 0;
2701}
2702
2f554ef7
DM
2703
2704/* free all slabs above current one - called during LEAVE_SCOPE */
2705
2706STATIC void
2707S_clear_backtrack_stack(pTHX_ void *p)
2708{
2709 regmatch_slab *s = PL_regmatch_slab->next;
2710 PERL_UNUSED_ARG(p);
2711
2712 if (!s)
2713 return;
2714 PL_regmatch_slab->next = NULL;
2715 while (s) {
2716 regmatch_slab * const osl = s;
2717 s = s->next;
2718 Safefree(osl);
2719 }
2720}
2721
2722
28d8d7f4
YO
2723#define SETREX(Re1,Re2) \
2724 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2725 Re1 = (Re2)
2726
d6a28714 2727STATIC I32 /* 0 failure, 1 success */
24b23f37 2728S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2729{
a35a87e7 2730#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2731 dMY_CXT;
2732#endif
27da23d5 2733 dVAR;
95b24440 2734 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2735 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2736 REGEXP *rex_sv = reginfo->prog;
2737 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2738 RXi_GET_DECL(rex,rexi);
2f554ef7 2739 I32 oldsave;
5d9a96ca
DM
2740 /* the current state. This is a cached copy of PL_regmatch_state */
2741 register regmatch_state *st;
5d9a96ca
DM
2742 /* cache heavy used fields of st in registers */
2743 register regnode *scan;
2744 register regnode *next;
438e9bae 2745 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2746 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2747 register char *locinput = PL_reginput;
5d9a96ca 2748 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2749
b69b0499 2750 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2751 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2752 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2753 const U32 max_nochange_depth =
2754 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2755 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
2756 regmatch_state *yes_state = NULL; /* state to pop to on success of
2757 subpattern */
e2e6a0f1
YO
2758 /* mark_state piggy backs on the yes_state logic so that when we unwind
2759 the stack on success we can update the mark_state as we go */
2760 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 2761 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2762 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2763 U32 state_num;
5d458dd8
YO
2764 bool no_final = 0; /* prevent failure from backtracking? */
2765 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2766 char *startpoint = PL_reginput;
5d458dd8
YO
2767 SV *popmark = NULL; /* are we looking for a mark? */
2768 SV *sv_commit = NULL; /* last mark name seen in failure */
2769 SV *sv_yes_mark = NULL; /* last mark name we have seen
2770 during a successfull match */
2771 U32 lastopen = 0; /* last open we saw */
2772 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 2773 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
2774 /* these three flags are set by various ops to signal information to
2775 * the very next op. They have a useful lifetime of exactly one loop
2776 * iteration, and are not preserved or restored by state pushes/pops
2777 */
2778 bool sw = 0; /* the condition value in (?(cond)a|b) */
2779 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2780 int logical = 0; /* the following EVAL is:
2781 0: (?{...})
2782 1: (?(?{...})X|Y)
2783 2: (??{...})
2784 or the following IFMATCH/UNLESSM is:
2785 false: plain (?=foo)
2786 true: used as a condition: (?(?=foo))
2787 */
95b24440 2788#ifdef DEBUGGING
e68ec53f 2789 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2790#endif
2791
7918f24d
NC
2792 PERL_ARGS_ASSERT_REGMATCH;
2793
3b57cd43 2794 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2795 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2796 }));
5d9a96ca
DM
2797 /* on first ever call to regmatch, allocate first slab */
2798 if (!PL_regmatch_slab) {
2799 Newx(PL_regmatch_slab, 1, regmatch_slab);
2800 PL_regmatch_slab->prev = NULL;
2801 PL_regmatch_slab->next = NULL;
86545054 2802 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2803 }
2804
2f554ef7
DM
2805 oldsave = PL_savestack_ix;
2806 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2807 SAVEVPTR(PL_regmatch_slab);
2808 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
2809
2810 /* grab next free state slot */
2811 st = ++PL_regmatch_state;
86545054 2812 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2813 st = PL_regmatch_state = S_push_slab(aTHX);
2814
d6a28714
JH
2815 /* Note that nextchr is a byte even in UTF */
2816 nextchr = UCHARAT(locinput);
2817 scan = prog;
2818 while (scan != NULL) {
8ba1375e 2819
a3621e74 2820 DEBUG_EXECUTE_r( {
6136c704 2821 SV * const prop = sv_newmortal();
1de06328 2822 regnode *rnext=regnext(scan);
786e8c11 2823 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2824 regprop(rex, prop, scan);
07be1b83
YO
2825
2826 PerlIO_printf(Perl_debug_log,
2827 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2828 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2829 SvPVX_const(prop),
1de06328 2830 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2831 0 : (IV)(rnext - rexi->program));
2a782b5b 2832 });
d6a28714
JH
2833
2834 next = scan + NEXT_OFF(scan);
2835 if (next == scan)
2836 next = NULL;
40a82448 2837 state_num = OP(scan);
d6a28714 2838
40a82448
DM
2839 reenter_switch:
2840 switch (state_num) {
d6a28714 2841 case BOL:
7fba1cd6 2842 if (locinput == PL_bostr)
d6a28714 2843 {
3b0527fe 2844 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2845 break;
2846 }
d6a28714
JH
2847 sayNO;
2848 case MBOL:
12d33761
HS
2849 if (locinput == PL_bostr ||
2850 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2851 {
b8c5462f
JH
2852 break;
2853 }
d6a28714
JH
2854 sayNO;
2855 case SBOL:
c2a73568 2856 if (locinput == PL_bostr)
b8c5462f 2857 break;
d6a28714
JH
2858 sayNO;
2859 case GPOS:
3b0527fe 2860 if (locinput == reginfo->ganch)
d6a28714
JH
2861 break;
2862 sayNO;
ee9b8eae
YO
2863
2864 case KEEPS:
2865 /* update the startpoint */
f0ab9afb 2866 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 2867 PL_reginput = locinput;
f0ab9afb 2868 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
2869 PUSH_STATE_GOTO(KEEPS_next, next);
2870 /*NOT-REACHED*/
2871 case KEEPS_next_fail:
2872 /* rollback the start point change */
f0ab9afb 2873 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
2874 sayNO_SILENT;
2875 /*NOT-REACHED*/
d6a28714 2876 case EOL:
d6a28714
JH
2877 goto seol;
2878 case MEOL:
d6a28714 2879 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2880 sayNO;
b8c5462f 2881 break;
d6a28714
JH
2882 case SEOL:
2883 seol:
2884 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2885 sayNO;
d6a28714 2886 if (PL_regeol - locinput > 1)
b8c5462f 2887 sayNO;
b8c5462f 2888 break;
d6a28714
JH
2889 case EOS:
2890 if (PL_regeol != locinput)
b8c5462f 2891 sayNO;
d6a28714 2892 break;
ffc61ed2 2893 case SANY:
d6a28714 2894 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2895 sayNO;
f33976b4
DB
2896 if (do_utf8) {
2897 locinput += PL_utf8skip[nextchr];
2898 if (locinput > PL_regeol)
2899 sayNO;
2900 nextchr = UCHARAT(locinput);
2901 }
2902 else
2903 nextchr = UCHARAT(++locinput);
2904 break;
2905 case CANY:
2906 if (!nextchr && locinput >= PL_regeol)
2907 sayNO;
b8c5462f 2908 nextchr = UCHARAT(++locinput);
a0d0e21e 2909 break;
ffc61ed2 2910 case REG_ANY:
1aa99e6b
IH
2911 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2912 sayNO;
2913 if (do_utf8) {
b8c5462f 2914 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2915 if (locinput > PL_regeol)
2916 sayNO;
a0ed51b3 2917 nextchr = UCHARAT(locinput);
a0ed51b3 2918 }
1aa99e6b
IH
2919 else
2920 nextchr = UCHARAT(++locinput);
a0ed51b3 2921 break;
166ba7cd
DM
2922
2923#undef ST
2924#define ST st->u.trie
786e8c11
YO
2925 case TRIEC:
2926 /* In this case the charclass data is available inline so
2927 we can fail fast without a lot of extra overhead.
2928 */
2929 if (scan->flags == EXACT || !do_utf8) {
2930 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2931 DEBUG_EXECUTE_r(
2932 PerlIO_printf(Perl_debug_log,
2933 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2934 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2935 );
2936 sayNO_SILENT;
2937 /* NOTREACHED */
2938 }
2939 }
2940 /* FALL THROUGH */
5b47454d 2941 case TRIE:
3dab1dad 2942 {
07be1b83 2943 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 2944 DECL_TRIE_TYPE(scan);
3dab1dad
YO
2945
2946 /* what trie are we using right now */
be8e71aa 2947 reg_trie_data * const trie
f8fc2ecf 2948 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 2949 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 2950 U32 state = trie->startstate;
166ba7cd 2951
3dab1dad
YO
2952 if (trie->bitmap && trie_type != trie_utf8_fold &&
2953 !TRIE_BITMAP_TEST(trie,*locinput)
2954 ) {
2955 if (trie->states[ state ].wordnum) {
2956 DEBUG_EXECUTE_r(
2957 PerlIO_printf(Perl_debug_log,
2958 "%*s %smatched empty string...%s\n",
5bc10b2c 2959 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2960 );
2961 break;
2962 } else {
2963 DEBUG_EXECUTE_r(
2964 PerlIO_printf(Perl_debug_log,
786e8c11 2965 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2966 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2967 );
2968 sayNO_SILENT;
2969 }
2970 }
166ba7cd 2971
786e8c11
YO
2972 {
2973 U8 *uc = ( U8* )locinput;
2974
2975 STRLEN len = 0;
2976 STRLEN foldlen = 0;
2977 U8 *uscan = (U8*)NULL;
2978 STRLEN bufflen=0;
2979 SV *sv_accept_buff = NULL;
2980 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2981
2982 ST.accepted = 0; /* how many accepting states we have seen */
2983 ST.B = next;
2984 ST.jump = trie->jump;
786e8c11 2985 ST.me = scan;
07be1b83
YO
2986 /*
2987 traverse the TRIE keeping track of all accepting states
2988 we transition through until we get to a failing node.
2989 */
2990
a3621e74 2991 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2992 U32 base = trie->states[ state ].trans.base;
f9f4320a 2993 UV uvc = 0;
786e8c11
YO
2994 U16 charid;
2995 /* We use charid to hold the wordnum as we don't use it
2996 for charid until after we have done the wordnum logic.
2997 We define an alias just so that the wordnum logic reads
2998 more naturally. */
2999
3000#define got_wordnum charid
3001 got_wordnum = trie->states[ state ].wordnum;
3002
3003 if ( got_wordnum ) {
3004 if ( ! ST.accepted ) {
5b47454d 3005 ENTER;
6b173516 3006 /* SAVETMPS; */ /* XXX is this necessary? dmq */
5b47454d
DM
3007 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3008 sv_accept_buff=newSV(bufflen *
3009 sizeof(reg_trie_accepted) - 1);
786e8c11 3010 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
3011 SvPOK_on(sv_accept_buff);
3012 sv_2mortal(sv_accept_buff);
166ba7cd
DM
3013 SAVETMPS;
3014 ST.accept_buff =
5b47454d
DM
3015 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3016 }
786e8c11 3017 do {
166ba7cd 3018 if (ST.accepted >= bufflen) {
5b47454d 3019 bufflen *= 2;
166ba7cd 3020 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
3021 SvGROW(sv_accept_buff,
3022 bufflen * sizeof(reg_trie_accepted));
3023 }
3024 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3025 + sizeof(reg_trie_accepted));
a3621e74 3026
786e8c11
YO
3027
3028 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3029 ST.accept_buff[ST.accepted].endpos = uc;
3030 ++ST.accepted;
3031 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3032 }
3033#undef got_wordnum
a3621e74 3034
07be1b83 3035 DEBUG_TRIE_EXECUTE_r({
786e8c11 3036 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 3037 PerlIO_printf( Perl_debug_log,
786e8c11 3038 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 3039 2+depth * 2, "", PL_colors[4],
786e8c11 3040 (UV)state, (UV)ST.accepted );
07be1b83 3041 });
a3621e74
YO
3042
3043 if ( base ) {
55eed653
NC
3044 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3045 uscan, len, uvc, charid, foldlen,
3046 foldbuf, uniflags);
a3621e74 3047
5b47454d
DM
3048 if (charid &&
3049 (base + charid > trie->uniquecharcount )
3050 && (base + charid - 1 - trie->uniquecharcount
3051 < trie->lasttrans)
3052 && trie->trans[base + charid - 1 -
3053 trie->uniquecharcount].check == state)
3054 {
3055 state = trie->trans[base + charid - 1 -
3056 trie->uniquecharcount ].next;
3057 }
3058 else {
3059 state = 0;
3060 }
3061 uc += len;
3062
3063 }
3064 else {
a3621e74
YO
3065 state = 0;
3066 }
3067 DEBUG_TRIE_EXECUTE_r(
e4584336 3068 PerlIO_printf( Perl_debug_log,
786e8c11 3069 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3070 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3071 );
3072 }
166ba7cd 3073 if (!ST.accepted )
a3621e74 3074 sayNO;
a3621e74 3075
166ba7cd
DM
3076 DEBUG_EXECUTE_r(
3077 PerlIO_printf( Perl_debug_log,
3078 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3079 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3080 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3081 );
786e8c11 3082 }}
fae667d5
YO
3083 goto trie_first_try; /* jump into the fail handler */
3084 /* NOTREACHED */
166ba7cd 3085 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
3086 if ( ST.jump) {
3087 REGCP_UNWIND(ST.cp);
3088 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3089 PL_regoffs[n].end = -1;
fae667d5
YO
3090 *PL_reglastparen = n;
3091 }
3092 trie_first_try:
5d458dd8
YO
3093 if (do_cutgroup) {
3094 do_cutgroup = 0;
3095 no_final = 0;
3096 }
fae667d5
YO
3097
3098 if ( ST.jump) {
3099 ST.lastparen = *PL_reglastparen;
3100 REGCP_SET(ST.cp);
3101 }
166ba7cd
DM
3102 if ( ST.accepted == 1 ) {
3103 /* only one choice left - just continue */
3104 DEBUG_EXECUTE_r({
2b8b4781 3105 AV *const trie_words
502c6561 3106 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3107 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3108 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3109 SV *sv= tmp ? sv_newmortal() : NULL;
3110
166ba7cd
DM
3111 PerlIO_printf( Perl_debug_log,
3112 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3113 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3114 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3115 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3116 PL_colors[0], PL_colors[1],
3117 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3118 )
3119 : "not compiled under -Dr",
166ba7cd
DM
3120 PL_colors[5] );
3121 });
3122 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3123 /* in this case we free tmps/leave before we call regmatch
3124 as we wont be using accept_buff again. */
5d458dd8 3125
166ba7cd
DM
3126 locinput = PL_reginput;
3127 nextchr = UCHARAT(locinput);
5d458dd8
YO
3128 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3129 scan = ST.B;
3130 else
3131 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3132 if (!has_cutgroup) {
3133 FREETMPS;
3134 LEAVE;
3135 } else {
3136 ST.accepted--;
3137 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3138 }
786e8c11 3139
166ba7cd
DM
3140 continue; /* execute rest of RE */
3141 }
fae667d5
YO
3142
3143 if ( !ST.accepted-- ) {
5d458dd8
YO
3144 DEBUG_EXECUTE_r({
3145 PerlIO_printf( Perl_debug_log,
3146 "%*s %sTRIE failed...%s\n",
3147 REPORT_CODE_OFF+depth*2, "",
3148 PL_colors[4],
3149 PL_colors[5] );
3150 });
166ba7cd
DM
3151 FREETMPS;
3152 LEAVE;
5d458dd8 3153 sayNO_SILENT;
fae667d5
YO
3154 /*NOTREACHED*/
3155 }
166ba7cd 3156
a3621e74 3157 /*
166ba7cd
DM
3158 There are at least two accepting states left. Presumably
3159 the number of accepting states is going to be low,
3160 typically two. So we simply scan through to find the one
3161 with lowest wordnum. Once we find it, we swap the last
3162 state into its place and decrement the size. We then try to
3163 match the rest of the pattern at the point where the word
3164 ends. If we succeed, control just continues along the
3165 regex; if we fail we return here to try the next accepting
3166 state
3167 */
a3621e74 3168
166ba7cd
DM
3169 {
3170 U32 best = 0;
3171 U32 cur;
3172 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3173 DEBUG_TRIE_EXECUTE_r(
f2278c82 3174 PerlIO_printf( Perl_debug_log,
166ba7cd 3175 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3176 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3177 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3178 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3179 );
3180
3181 if (ST.accept_buff[cur].wordnum <
3182 ST.accept_buff[best].wordnum)
3183 best = cur;
a3621e74 3184 }
166ba7cd
DM
3185
3186 DEBUG_EXECUTE_r({
2b8b4781 3187 AV *const trie_words
502c6561 3188 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3189 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3190 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3191 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3192 ST.B :
7f69552c 3193 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3194 SV *sv= tmp ? sv_newmortal() : NULL;
3195
265c4333
YO
3196 PerlIO_printf( Perl_debug_log,
3197 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3198 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3199 ST.accept_buff[best].wordnum,
de734bd5
A
3200 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3201 PL_colors[0], PL_colors[1],
3202 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3203 ) : "not compiled under -Dr",
265c4333 3204 REG_NODE_NUM(nextop),
166ba7cd
DM
3205 PL_colors[5] );
3206 });
3207
3208 if ( best<ST.accepted ) {
3209 reg_trie_accepted tmp = ST.accept_buff[ best ];
3210 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3211 ST.accept_buff[ ST.accepted ] = tmp;
3212 best = ST.accepted;
a3621e74 3213 }
166ba7cd 3214 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3215 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3216 scan = ST.B;
786e8c11 3217 } else {
5d458dd8 3218 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
5d458dd8 3219 }
6b173516 3220 PUSH_YES_STATE_GOTO(TRIE_next, scan);
786e8c11 3221 /* NOTREACHED */
166ba7cd 3222 }
166ba7cd 3223 /* NOTREACHED */
5d458dd8
YO
3224 case TRIE_next:
3225 FREETMPS;
3226 LEAVE;
3227 sayYES;
166ba7cd
DM
3228#undef ST
3229
95b24440
DM
3230 case EXACT: {
3231 char *s = STRING(scan);
24d3c4a9 3232 ln = STR_LEN(scan);
eb160463 3233 if (do_utf8 != UTF) {
bc517b45 3234 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3235 char *l = locinput;
24d3c4a9 3236 const char * const e = s + ln;
a72c7584 3237
5ff6fc6d
JH
3238 if (do_utf8) {
3239 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3240 while (s < e) {
a3b680e6 3241 STRLEN ulen;
1aa99e6b 3242 if (l >= PL_regeol)
5ff6fc6d
JH
3243 sayNO;
3244 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3245 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3246 uniflags))
5ff6fc6d 3247 sayNO;
bc517b45 3248 l += ulen;
5ff6fc6d 3249 s ++;
1aa99e6b 3250 }
5ff6fc6d
JH
3251 }
3252 else {
3253 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3254 while (s < e) {
a3b680e6 3255 STRLEN ulen;
1aa99e6b
IH
3256 if (l >= PL_regeol)
3257 sayNO;
5ff6fc6d 3258 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3259 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3260 uniflags))
1aa99e6b 3261 sayNO;
bc517b45 3262 s += ulen;
a72c7584 3263 l ++;
1aa99e6b 3264 }
5ff6fc6d 3265 }
1aa99e6b
IH
3266 locinput = l;
3267 nextchr = UCHARAT(locinput);
3268 break;
3269 }
bc517b45 3270 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3271 /* Inline the first character, for speed. */
3272 if (UCHARAT(s) != nextchr)
3273 sayNO;
24d3c4a9 3274 if (PL_regeol - locinput < ln)
d6a28714 3275 sayNO;
24d3c4a9 3276 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3277 sayNO;
24d3c4a9 3278 locinput += ln;
d6a28714
JH
3279 nextchr = UCHARAT(locinput);
3280 break;
95b24440 3281 }
d6a28714 3282 case EXACTFL:
b8c5462f
JH
3283 PL_reg_flags |= RF_tainted;
3284 /* FALL THROUGH */
95b24440 3285 case EXACTF: {
be8e71aa 3286 char * const s = STRING(scan);
24d3c4a9 3287 ln = STR_LEN(scan);
d6a28714 3288
d07ddd77
JH
3289 if (do_utf8 || UTF) {
3290 /* Either target or the pattern are utf8. */
be8e71aa 3291 const char * const l = locinput;
d07ddd77 3292 char *e = PL_regeol;
bc517b45 3293
24d3c4a9 3294 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3295 l, &e, 0, do_utf8)) {
5486206c
JH
3296 /* One more case for the sharp s:
3297 * pack("U0U*", 0xDF) =~ /ss/i,
3298 * the 0xC3 0x9F are the UTF-8
3299 * byte sequence for the U+00DF. */
e1d1eefb 3300
5486206c 3301 if (!(do_utf8 &&
e1d1eefb 3302 toLOWER(s[0]) == 's' &&
24d3c4a9 3303 ln >= 2 &&
5486206c
JH
3304 toLOWER(s[1]) == 's' &&
3305 (U8)l[0] == 0xC3 &&
3306 e - l >= 2 &&
3307 (U8)l[1] == 0x9F))
3308 sayNO;
3309 }
d07ddd77
JH
3310 locinput = e;
3311 nextchr = UCHARAT(locinput);
3312 break;
a0ed51b3 3313 }
d6a28714 3314
bc517b45
JH
3315 /* Neither the target and the pattern are utf8. */
3316
d6a28714
JH
3317 /* Inline the first character, for speed. */
3318 if (UCHARAT(s) != nextchr &&
3319 UCHARAT(s) != ((OP(scan) == EXACTF)
3320 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3321 sayNO;
24d3c4a9 3322 if (PL_regeol - locinput < ln)
b8c5462f 3323 sayNO;
24d3c4a9
DM
3324 if (ln > 1 && (OP(scan) == EXACTF
3325 ? ibcmp(s, locinput, ln)
3326 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3327 sayNO;
24d3c4a9 3328 locinput += ln;
d6a28714 3329 nextchr = UCHARAT(locinput);
a0d0e21e 3330 break;
95b24440 3331 }
d6a28714 3332 case ANYOF:
ffc61ed2 3333 if (do_utf8) {
9e55ce06
JH
3334 STRLEN inclasslen = PL_regeol - locinput;
3335
32fc9b6a 3336 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3337 goto anyof_fail;
ffc61ed2
JH
3338 if (locinput >= PL_regeol)
3339 sayNO;
0f0076b4 3340 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3341 nextchr = UCHARAT(locinput);
e0f9d4a8 3342 break;
ffc61ed2
JH
3343 }
3344 else {
3345 if (nextchr < 0)
3346 nextchr = UCHARAT(locinput);
32fc9b6a 3347 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3348 goto anyof_fail;
ffc61ed2
JH
3349 if (!nextchr && locinput >= PL_regeol)
3350 sayNO;
3351 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3352 break;
3353 }
262b90c4 3354 anyof_fail:
e0f9d4a8
JH
3355 /* If we might have the case of the German sharp s
3356 * in a casefolding Unicode character class. */
3357
ebc501f0
JH
3358 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3359 locinput += SHARP_S_SKIP;
e0f9d4a8 3360 nextchr = UCHARAT(locinput);
ffc61ed2 3361 }
e0f9d4a8
JH
3362 else
3363 sayNO;
b8c5462f 3364 break;
d6a28714 3365 case ALNUML:
b8c5462f
JH
3366 PL_reg_flags |= RF_tainted;
3367 /* FALL THROUGH */
d6a28714 3368 case ALNUM:
b8c5462f 3369 if (!nextchr)
4633a7c4 3370 sayNO;
ffc61ed2 3371 if (do_utf8) {
1a4fad37 3372 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3373 if (!(OP(scan) == ALNUM
bb7a0f54 3374 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3375 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3376 {
3377 sayNO;
a0ed51b3 3378 }
b8c5462f 3379 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3380 nextchr = UCHARAT(locinput);
3381 break;
3382 }
ffc61ed2 3383 if (!(OP(scan) == ALNUM
d6a28714 3384 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3385 sayNO;
b8c5462f 3386 nextchr = UCHARAT(++locinput);
a0d0e21e 3387 break;
d6a28714 3388 case NALNUML:
b8c5462f
JH
3389 PL_reg_flags |= RF_tainted;
3390 /* FALL THROUGH */
d6a28714
JH
3391 case NALNUM:
3392 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3393 sayNO;
ffc61ed2 3394 if (do_utf8) {
1a4fad37 3395 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3396 if (OP(scan) == NALNUM
bb7a0f54 3397 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3398 : isALNUM_LC_utf8((U8*)locinput))
3399 {
b8c5462f 3400 sayNO;
d6a28714 3401 }
b8c5462f
JH
3402 locinput += PL_utf8skip[nextchr];
3403 nextchr = UCHARAT(locinput);
3404 break;
3405 }
ffc61ed2 3406 if (OP(scan) == NALNUM
d6a28714 3407 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3408 sayNO;
76e3520e 3409 nextchr = UCHARAT(++locinput);
a0d0e21e 3410 break;
d6a28714
JH
3411 case BOUNDL:
3412 case NBOUNDL:
3280af22 3413 PL_reg_flags |= RF_tainted;
bbce6d69 3414 /* FALL THROUGH */
d6a28714
JH
3415 case BOUND:
3416 case NBOUND:
3417 /* was last char in word? */
ffc61ed2 3418 if (do_utf8) {
12d33761 3419 if (locinput == PL_bostr)
24d3c4a9 3420 ln = '\n';
ffc61ed2 3421 else {
a3b680e6 3422 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3423
24d3c4a9 3424 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3425 }
3426 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3427 ln = isALNUM_uni(ln);
1a4fad37 3428 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3429 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3430 }
3431 else {
24d3c4a9 3432 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3433 n = isALNUM_LC_utf8((U8*)locinput);
3434 }
a0ed51b3 3435 }
d6a28714 3436 else {
24d3c4a9 3437 ln = (locinput != PL_bostr) ?
12d33761 3438 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3439 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3440 ln = isALNUM(ln);
ffc61ed2
JH
3441 n = isALNUM(nextchr);
3442 }
3443 else {
24d3c4a9 3444 ln = isALNUM_LC(ln);
ffc61ed2
JH
3445 n = isALNUM_LC(nextchr);
3446 }
d6a28714 3447 }
24d3c4a9 3448 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3449 OP(scan) == BOUNDL))
3450 sayNO;
a0ed51b3 3451 break;
d6a28714 3452 case SPACEL:
3280af22 3453 PL_reg_flags |= RF_tainted;
bbce6d69 3454 /* FALL THROUGH */
d6a28714 3455 case SPACE:
9442cb0e 3456 if (!nextchr)
4633a7c4 3457 sayNO;
1aa99e6b 3458 if (do_utf8) {
fd400ab9 3459 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3460 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3461 if (!(OP(scan) == SPACE
bb7a0f54 3462 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3463 : isSPACE_LC_utf8((U8*)locinput)))
3464 {
3465 sayNO;
3466 }
3467 locinput += PL_utf8skip[nextchr];
3468 nextchr = UCHARAT(locinput);
3469 break;
d6a28714 3470 }
ffc61ed2
JH
3471 if (!(OP(scan) == SPACE
3472 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3473 sayNO;
3474 nextchr = UCHARAT(++locinput);
3475 }
3476 else {
3477 if (!(OP(scan) == SPACE
3478 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3479 sayNO;
3480 nextchr = UCHARAT(++locinput);
a0ed51b3 3481 }
a0ed51b3 3482 break;
d6a28714 3483 case NSPACEL:
3280af22 3484 PL_reg_flags |= RF_tainted;
bbce6d69 3485 /* FALL THROUGH */
d6a28714 3486 case NSPACE:
9442cb0e 3487 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3488 sayNO;
1aa99e6b 3489 if (do_utf8) {
1a4fad37 3490 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3491 if (OP(scan) == NSPACE
bb7a0f54 3492 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3493 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3494 {
3495 sayNO;
3496 }
3497 locinput += PL_utf8skip[nextchr];
3498 nextchr = UCHARAT(locinput);
3499 break;
a0ed51b3 3500 }
ffc61ed2 3501 if (OP(scan) == NSPACE
d6a28714 3502 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3503 sayNO;
76e3520e 3504 nextchr = UCHARAT(++locinput);
a0d0e21e 3505 break;
d6a28714 3506 case DIGITL:
a0ed51b3
LW
3507 PL_reg_flags |= RF_tainted;
3508 /* FALL THROUGH */
d6a28714 3509 case DIGIT:
9442cb0e 3510 if (!nextchr)
a0ed51b3 3511 sayNO;
1aa99e6b 3512 if (do_utf8) {
1a4fad37 3513 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3514 if (!(OP(scan) == DIGIT
bb7a0f54 3515 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3516 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3517 {
a0ed51b3 3518 sayNO;
dfe13c55 3519 }
6f06b55f 3520 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3521 nextchr = UCHARAT(locinput);
3522 break;
3523 }
ffc61ed2 3524 if (!(OP(scan) == DIGIT
9442cb0e 3525 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3526 sayNO;
3527 nextchr = UCHARAT(++locinput);
3528 break;
d6a28714 3529 case NDIGITL:
b8c5462f
JH
3530 PL_reg_flags |= RF_tainted;
3531 /* FALL THROUGH */
d6a28714 3532 case NDIGIT:
9442cb0e 3533 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3534 sayNO;
1aa99e6b 3535 if (do_utf8) {
1a4fad37 3536 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3537 if (OP(scan) == NDIGIT
bb7a0f54 3538 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3539 : isDIGIT_LC_utf8((U8*)locinput))
3540 {
a0ed51b3 3541 sayNO;
9442cb0e 3542 }
6f06b55f 3543 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3544 nextchr = UCHARAT(locinput);
3545 break;
3546 }
ffc61ed2 3547 if (OP(scan) == NDIGIT
9442cb0e 3548 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3549 sayNO;
3550 nextchr = UCHARAT(++locinput);
3551 break;
3552 case CLUMP:
b7c83a7e 3553 if (locinput >= PL_regeol)
a0ed51b3 3554 sayNO;
b7c83a7e 3555 if (do_utf8) {
1a4fad37 3556 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3557 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3558 sayNO;
3559 locinput += PL_utf8skip[nextchr];
3560 while (locinput < PL_regeol &&
3561 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3562 locinput += UTF8SKIP(locinput);
3563 if (locinput > PL_regeol)
3564 sayNO;
eb08e2da
JH
3565 }
3566 else
3567 locinput++;
a0ed51b3
LW
3568 nextchr = UCHARAT(locinput);
3569 break;
81714fb9
YO
3570
3571 case NREFFL:
3572 {
3573 char *s;
ff1157ca 3574 char type;
81714fb9
YO
3575 PL_reg_flags |= RF_tainted;
3576 /* FALL THROUGH */
3577 case NREF:
3578 case NREFF:
ff1157ca 3579 type = OP(scan);
0a4db386
YO
3580 n = reg_check_named_buff_matched(rex,scan);
3581
3582 if ( n ) {
3583 type = REF + ( type - NREF );
3584 goto do_ref;
3585 } else {
81714fb9 3586 sayNO;
0a4db386
YO
3587 }
3588 /* unreached */
c8756f30 3589 case REFFL:
3280af22 3590 PL_reg_flags |= RF_tainted;
c8756f30 3591 /* FALL THROUGH */
c277df42 3592 case REF:
81714fb9 3593 case REFF:
c277df42 3594 n = ARG(scan); /* which paren pair */
81714fb9
YO
3595 type = OP(scan);
3596 do_ref:
f0ab9afb 3597 ln = PL_regoffs[n].start;
2c2d71f5 3598 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 3599 if (*PL_reglastparen < n || ln == -1)
af3f8c16 3600 sayNO; /* Do not match unless seen CLOSEn. */
f0ab9afb 3601 if (ln == PL_regoffs[n].end)
a0d0e21e 3602 break;
a0ed51b3 3603
24d3c4a9 3604 s = PL_bostr + ln;
81714fb9 3605 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3606 char *l = locinput;
f0ab9afb 3607 const char *e = PL_bostr + PL_regoffs[n].end;
a0ed51b3
LW
3608 /*
3609 * Note that we can't do the "other character" lookup trick as
3610 * in the 8-bit case (no pun intended) because in Unicode we
3611 * have to map both upper and title case to lower case.
3612 */
81714fb9 3613 if (type == REFF) {
a0ed51b3 3614 while (s < e) {
a3b680e6
AL
3615 STRLEN ulen1, ulen2;
3616 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3617 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3618
a0ed51b3
LW
3619 if (l >= PL_regeol)
3620 sayNO;
a2a2844f
JH
3621 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3622 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3623 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3624 sayNO;
a2a2844f
JH
3625 s += ulen1;
3626 l += ulen2;
a0ed51b3
LW
3627 }
3628 }
3629 locinput = l;
3630 nextchr = UCHARAT(locinput);
3631 break;
3632 }
3633
a0d0e21e 3634 /* Inline the first character, for speed. */
76e3520e 3635 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3636 (type == REF ||
3637 (UCHARAT(s) != (type == REFF
3638 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3639 sayNO;
f0ab9afb 3640 ln = PL_regoffs[n].end - ln;
24d3c4a9 3641 if (locinput + ln > PL_regeol)
4633a7c4 3642 sayNO;
81714fb9 3643 if (ln > 1 && (type == REF
24d3c4a9 3644 ? memNE(s, locinput, ln)
81714fb9 3645 : (type == REFF
24d3c4a9
DM
3646 ? ibcmp(s, locinput, ln)
3647 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3648 sayNO;
24d3c4a9 3649 locinput += ln;
76e3520e 3650 nextchr = UCHARAT(locinput);
a0d0e21e 3651 break;
81714fb9 3652 }
a0d0e21e 3653 case NOTHING:
c277df42 3654 case TAIL:
a0d0e21e
LW
3655 break;
3656 case BACK:
3657 break;
40a82448
DM
3658
3659#undef ST
3660#define ST st->u.eval
c277df42 3661 {
c277df42 3662 SV *ret;
d2f13c59 3663 REGEXP *re_sv;
6bda09f9 3664 regexp *re;
f8fc2ecf 3665 regexp_internal *rei;
1a147d38
YO
3666 regnode *startpoint;
3667
3668 case GOSTART:
e7707071
YO
3669 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3670 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 3671 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 3672 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 3673 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
3674 Perl_croak(aTHX_
3675 "Pattern subroutine nesting without pos change"
3676 " exceeded limit in regex");
6bda09f9
YO
3677 } else {
3678 nochange_depth = 0;
1a147d38 3679 }
288b8c02 3680 re_sv = rex_sv;
6bda09f9 3681 re = rex;
f8fc2ecf 3682 rei = rexi;
288b8c02 3683 (void)ReREFCNT_inc(rex_sv);
1a147d38 3684 if (OP(scan)==GOSUB) {
6bda09f9
YO
3685 startpoint = scan + ARG2L(scan);
3686 ST.close_paren = ARG(scan);
3687 } else {
f8fc2ecf 3688 startpoint = rei->program+1;
6bda09f9
YO
3689 ST.close_paren = 0;
3690 }
3691 goto eval_recurse_doit;
3692 /* NOTREACHED */
3693 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3694 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 3695 if ( ++nochange_depth > max_nochange_depth )
1a147d38 3696 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
3697 } else {
3698 nochange_depth = 0;
3699 }
8e5e9ebe 3700 {
4aabdb9b
DM
3701 /* execute the code in the {...} */
3702 dSP;
6136c704 3703 SV ** const before = SP;
4aabdb9b
DM
3704 OP_4tree * const oop = PL_op;
3705 COP * const ocurcop = PL_curcop;
3706 PAD *old_comppad;
4aabdb9b
DM
3707
3708 n = ARG(scan);
f8fc2ecf 3709 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
3710 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3711 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f8fc2ecf 3712 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
f0ab9afb 3713 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 3714
2bf803e2
YO
3715 if (sv_yes_mark) {
3716 SV *sv_mrk = get_sv("REGMARK", 1);
3717 sv_setsv(sv_mrk, sv_yes_mark);
3718 }
3719
8e5e9ebe
RGS
3720 CALLRUNOPS(aTHX); /* Scalar context. */
3721 SPAGAIN;
3722 if (SP == before)
075aa684 3723 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3724 else {
3725 ret = POPs;
3726 PUTBACK;
3727 }
4aabdb9b
DM
3728
3729 PL_op = oop;
3730 PAD_RESTORE_LOCAL(old_comppad);
3731 PL_curcop = ocurcop;
24d3c4a9 3732 if (!logical) {
4aabdb9b
DM
3733 /* /(?{...})/ */
3734 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3735 break;
3736 }
8e5e9ebe 3737 }
24d3c4a9
DM
3738 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3739 logical = 0;
4aabdb9b 3740 {
4f639d21
DM
3741 /* extract RE object from returned value; compiling if
3742 * necessary */
6136c704 3743 MAGIC *mg = NULL;
288b8c02 3744 REGEXP *rx = NULL;
5c35adbb
NC
3745
3746 if (SvROK(ret)) {
288b8c02 3747 SV *const sv = SvRV(ret);
5c35adbb
NC
3748
3749 if (SvTYPE(sv) == SVt_REGEXP) {
d2f13c59 3750 rx = (REGEXP*) sv;
5c35adbb
NC
3751 } else if (SvSMAGICAL(sv)) {
3752 mg = mg_find(sv, PERL_MAGIC_qr);
3753 assert(mg);
3754 }
3755 } else if (SvTYPE(ret) == SVt_REGEXP) {
d2f13c59 3756 rx = (REGEXP*) ret;
5c35adbb 3757 } else if (SvSMAGICAL(ret)) {
124ee91a
NC
3758 if (SvGMAGICAL(ret)) {
3759 /* I don't believe that there is ever qr magic
3760 here. */
3761 assert(!mg_find(ret, PERL_MAGIC_qr));
faf82a0b 3762 sv_unmagic(ret, PERL_MAGIC_qr);
124ee91a
NC
3763 }
3764 else {
faf82a0b 3765 mg = mg_find(ret, PERL_MAGIC_qr);
124ee91a
NC
3766 /* testing suggests mg only ends up non-NULL for
3767 scalars who were upgraded and compiled in the
3768 else block below. In turn, this is only
3769 triggered in the "postponed utf8 string" tests
3770 in t/op/pat.t */
3771 }
0f5d15d6 3772 }
faf82a0b 3773
0f5d15d6 3774 if (mg) {
d2f13c59 3775 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
e2560c33 3776 assert(rx);
0f5d15d6 3777 }
288b8c02
NC
3778 if (rx) {
3779 rx = reg_temp_copy(rx);
3780 }
0f5d15d6 3781 else {
c737faaf 3782 U32 pm_flags = 0;
a3b680e6 3783 const I32 osize = PL_regsize;
0f5d15d6 3784
b9ad30b4
NC
3785 if (DO_UTF8(ret)) {
3786 assert (SvUTF8(ret));
3787 } else if (SvUTF8(ret)) {
3788 /* Not doing UTF-8, despite what the SV says. Is
3789 this only if we're trapped in use 'bytes'? */
3790 /* Make a copy of the octet sequence, but without
3791 the flag on, as the compiler now honours the
3792 SvUTF8 flag on ret. */
3793 STRLEN len;
3794 const char *const p = SvPV(ret, len);
3795 ret = newSVpvn_flags(p, len, SVs_TEMP);
3796 }
288b8c02 3797 rx = CALLREGCOMP(ret, pm_flags);
9041c2e3 3798 if (!(SvFLAGS(ret)
faf82a0b 3799 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 3800 | SVs_GMG))) {
a2794585
NC
3801 /* This isn't a first class regexp. Instead, it's
3802 caching a regexp onto an existing, Perl visible
3803 scalar. */
ad64d0ec 3804 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3ce3ed55 3805 }
0f5d15d6 3806 PL_regsize = osize;
0f5d15d6 3807 }
288b8c02
NC
3808 re_sv = rx;
3809 re = (struct regexp *)SvANY(rx);
4aabdb9b 3810 }
07bc277f 3811 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
3812 re->subbeg = rex->subbeg;
3813 re->sublen = rex->sublen;
f8fc2ecf 3814 rei = RXi_GET(re);
6bda09f9 3815 DEBUG_EXECUTE_r(
efd26800 3816 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
6bda09f9
YO
3817 "Matching embedded");
3818 );
f8fc2ecf 3819 startpoint = rei->program + 1;
1a147d38 3820 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
3821 /* borrowed from regtry */
3822 if (PL_reg_start_tmpl <= re->nparens) {
3823 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3824 if(PL_reg_start_tmp)
3825 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3826 else
3827 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 3828 }
aa283a38 3829
1a147d38 3830 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 3831 /* run the pattern returned from (??{...}) */
40a82448
DM
3832 ST.cp = regcppush(0); /* Save *all* the positions. */
3833 REGCP_SET(ST.lastcp);
6bda09f9 3834
f0ab9afb 3835 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
6bda09f9 3836
0357f1fd
ML
3837 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
3838 PL_reglastparen = &re->lastparen;
3839 PL_reglastcloseparen = &re->lastcloseparen;
3840 re->lastparen = 0;
3841 re->lastcloseparen = 0;
3842
4aabdb9b 3843 PL_reginput = locinput;
ae0beba1 3844 PL_regsize = 0;
4aabdb9b
DM
3845
3846 /* XXXX This is too dramatic a measure... */
3847 PL_reg_maxiter = 0;
3848
faec1544 3849 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 3850 if (RX_UTF8(re_sv))
faec1544
DM
3851 PL_reg_flags |= RF_utf8;
3852 else
3853 PL_reg_flags &= ~RF_utf8;
3854 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3855
288b8c02 3856 ST.prev_rex = rex_sv;
faec1544 3857 ST.prev_curlyx = cur_curlyx;
288b8c02
NC
3858 SETREX(rex_sv,re_sv);
3859 rex = re;
f8fc2ecf 3860 rexi = rei;
faec1544 3861 cur_curlyx = NULL;
40a82448 3862 ST.B = next;
faec1544
DM
3863 ST.prev_eval = cur_eval;
3864 cur_eval = st;
faec1544 3865 /* now continue from first node in postoned RE */
6bda09f9 3866 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3867 /* NOTREACHED */
a0ed51b3 3868 }
24d3c4a9
DM
3869 /* logical is 1, /(?(?{...})X|Y)/ */
3870 sw = (bool)SvTRUE(ret);
3871 logical = 0;
c277df42
IZ
3872 break;
3873 }
40a82448 3874
faec1544
DM
3875 case EVAL_AB: /* cleanup after a successful (??{A})B */
3876 /* note: this is called twice; first after popping B, then A */
3877 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
3878 ReREFCNT_dec(rex_sv);
3879 SETREX(rex_sv,ST.prev_rex);
3880 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3881 rexi = RXi_GET(rex);
faec1544
DM
3882 regcpblow(ST.cp);
3883 cur_eval = ST.prev_eval;
3884 cur_curlyx = ST.prev_curlyx;
0357f1fd
ML
3885
3886 PL_reglastparen = &rex->lastparen;
3887 PL_reglastcloseparen = &rex->lastcloseparen;
3888
40a82448
DM
3889 /* XXXX This is too dramatic a measure... */
3890 PL_reg_maxiter = 0;
e7707071 3891 if ( nochange_depth )
4b196cd4 3892 nochange_depth--;
262b90c4 3893 sayYES;
40a82448 3894
40a82448 3895
faec1544
DM
3896 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3897 /* note: this is called twice; first after popping B, then A */
3898 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
3899 ReREFCNT_dec(rex_sv);
3900 SETREX(rex_sv,ST.prev_rex);
3901 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3902 rexi = RXi_GET(rex);
0357f1fd
ML
3903 PL_reglastparen = &rex->lastparen;
3904 PL_reglastcloseparen = &rex->lastcloseparen;
3905
40a82448
DM
3906 PL_reginput = locinput;
3907 REGCP_UNWIND(ST.lastcp);
3908 regcppop(rex);
faec1544
DM
3909 cur_eval = ST.prev_eval;
3910 cur_curlyx = ST.prev_curlyx;
3911 /* XXXX This is too dramatic a measure... */
3912 PL_reg_maxiter = 0;
e7707071 3913 if ( nochange_depth )
4b196cd4 3914 nochange_depth--;
40a82448 3915 sayNO_SILENT;
40a82448
DM
3916#undef ST
3917
a0d0e21e 3918 case OPEN:
c277df42 3919 n = ARG(scan); /* which paren pair */
3280af22
NIS
3920 PL_reg_start_tmp[n] = locinput;
3921 if (n > PL_regsize)
3922 PL_regsize = n;
e2e6a0f1 3923 lastopen = n;
a0d0e21e
LW
3924 break;
3925 case CLOSE:
c277df42 3926 n = ARG(scan); /* which paren pair */
f0ab9afb
NC
3927 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3928 PL_regoffs[n].end = locinput - PL_bostr;
7f69552c
YO
3929 /*if (n > PL_regsize)
3930 PL_regsize = n;*/
3b6647e0 3931 if (n > *PL_reglastparen)
3280af22 3932 *PL_reglastparen = n;
a01268b5 3933 *PL_reglastcloseparen = n;
3b6647e0 3934 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
3935 goto fake_end;
3936 }
a0d0e21e 3937 break;
e2e6a0f1
YO
3938 case ACCEPT:
3939 if (ARG(scan)){
3940 regnode *cursor;
3941 for (cursor=scan;
3942 cursor && OP(cursor)!=END;
3943 cursor=regnext(cursor))
3944 {
3945 if ( OP(cursor)==CLOSE ){
3946 n = ARG(cursor);
3947 if ( n <= lastopen ) {
f0ab9afb
NC
3948 PL_regoffs[n].start
3949 = PL_reg_start_tmp[n] - PL_bostr;
3950 PL_regoffs[n].end = locinput - PL_bostr;
e2e6a0f1
YO
3951 /*if (n > PL_regsize)
3952 PL_regsize = n;*/
3b6647e0 3953 if (n > *PL_reglastparen)
e2e6a0f1
YO
3954 *PL_reglastparen = n;
3955 *PL_reglastcloseparen = n;
3b6647e0
RB
3956 if ( n == ARG(scan) || (cur_eval &&
3957 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
3958 break;
3959 }
3960 }
3961 }
3962 }
3963 goto fake_end;
3964 /*NOTREACHED*/
c277df42
IZ
3965 case GROUPP:
3966 n = ARG(scan); /* which paren pair */
f0ab9afb 3967 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
c277df42 3968 break;
0a4db386
YO
3969 case NGROUPP:
3970 /* reg_check_named_buff_matched returns 0 for no match */
3971 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3972 break;
1a147d38 3973 case INSUBP:
0a4db386 3974 n = ARG(scan);
3b6647e0 3975 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
3976 break;
3977 case DEFINEP:
3978 sw = 0;
3979 break;
c277df42 3980 case IFTHEN:
2c2d71f5 3981 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3982 if (sw)
c277df42
IZ
3983 next = NEXTOPER(NEXTOPER(scan));
3984 else {
3985 next = scan + ARG(scan);
3986 if (OP(next) == IFTHEN) /* Fake one. */
3987 next = NEXTOPER(NEXTOPER(next));
3988 }
3989 break;
3990 case LOGICAL:
24d3c4a9 3991 logical = scan->flags;
c277df42 3992 break;
c476f425 3993
2ab05381 3994/*******************************************************************
2ab05381 3995
c476f425
DM
3996The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3997pattern, where A and B are subpatterns. (For simple A, CURLYM or
3998STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3999
c476f425 4000A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4001
c476f425
DM
4002On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4003state, which contains the current count, initialised to -1. It also sets
4004cur_curlyx to point to this state, with any previous value saved in the
4005state block.
2ab05381 4006
c476f425
DM
4007CURLYX then jumps straight to the WHILEM op, rather than executing A,
4008since the pattern may possibly match zero times (i.e. it's a while {} loop
4009rather than a do {} while loop).
2ab05381 4010
c476f425
DM
4011Each entry to WHILEM represents a successful match of A. The count in the
4012CURLYX block is incremented, another WHILEM state is pushed, and execution
4013passes to A or B depending on greediness and the current count.
2ab05381 4014
c476f425
DM
4015For example, if matching against the string a1a2a3b (where the aN are
4016substrings that match /A/), then the match progresses as follows: (the
4017pushed states are interspersed with the bits of strings matched so far):
2ab05381 4018
c476f425
DM
4019 <CURLYX cnt=-1>
4020 <CURLYX cnt=0><WHILEM>
4021 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4022 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4023 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4024 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4025
c476f425
DM
4026(Contrast this with something like CURLYM, which maintains only a single
4027backtrack state:
2ab05381 4028
c476f425
DM
4029 <CURLYM cnt=0> a1
4030 a1 <CURLYM cnt=1> a2
4031 a1 a2 <CURLYM cnt=2> a3
4032 a1 a2 a3 <CURLYM cnt=3> b
4033)
2ab05381 4034
c476f425
DM
4035Each WHILEM state block marks a point to backtrack to upon partial failure
4036of A or B, and also contains some minor state data related to that
4037iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4038overall state, such as the count, and pointers to the A and B ops.
2ab05381 4039
c476f425
DM
4040This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4041must always point to the *current* CURLYX block, the rules are:
2ab05381 4042
c476f425
DM
4043When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4044and set cur_curlyx to point the new block.
2ab05381 4045
c476f425
DM
4046When popping the CURLYX block after a successful or unsuccessful match,
4047restore the previous cur_curlyx.
2ab05381 4048
c476f425
DM
4049When WHILEM is about to execute B, save the current cur_curlyx, and set it
4050to the outer one saved in the CURLYX block.
2ab05381 4051
c476f425
DM
4052When popping the WHILEM block after a successful or unsuccessful B match,
4053restore the previous cur_curlyx.
2ab05381 4054
c476f425
DM
4055Here's an example for the pattern (AI* BI)*BO
4056I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4057
c476f425
DM
4058cur_
4059curlyx backtrack stack
4060------ ---------------
4061NULL
4062CO <CO prev=NULL> <WO>
4063CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4064CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4065NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4066
c476f425
DM
4067At this point the pattern succeeds, and we work back down the stack to
4068clean up, restoring as we go:
95b24440 4069
c476f425
DM
4070CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4071CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4072CO <CO prev=NULL> <WO>
4073NULL
a0374537 4074
c476f425
DM
4075*******************************************************************/
4076
4077#define ST st->u.curlyx
4078
4079 case CURLYX: /* start of /A*B/ (for complex A) */
4080 {
4081 /* No need to save/restore up to this paren */
4082 I32 parenfloor = scan->flags;
4083
4084 assert(next); /* keep Coverity happy */
4085 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4086 next += ARG(next);
4087
4088 /* XXXX Probably it is better to teach regpush to support
4089 parenfloor > PL_regsize... */
4090 if (parenfloor > (I32)*PL_reglastparen)
4091 parenfloor = *PL_reglastparen; /* Pessimization... */
4092
4093 ST.prev_curlyx= cur_curlyx;
4094 cur_curlyx = st;
4095 ST.cp = PL_savestack_ix;
4096
4097 /* these fields contain the state of the current curly.
4098 * they are accessed by subsequent WHILEMs */
4099 ST.parenfloor = parenfloor;
4100 ST.min = ARG1(scan);
4101 ST.max = ARG2(scan);
4102 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4103 ST.B = next;
24d3c4a9
DM
4104 ST.minmod = minmod;
4105 minmod = 0;
c476f425
DM
4106 ST.count = -1; /* this will be updated by WHILEM */
4107 ST.lastloc = NULL; /* this will be updated by WHILEM */
4108
4109 PL_reginput = locinput;
4110 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 4111 /* NOTREACHED */
c476f425 4112 }
a0d0e21e 4113
c476f425 4114 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4115 cur_curlyx = ST.prev_curlyx;
4116 sayYES;
4117 /* NOTREACHED */
a0d0e21e 4118
c476f425
DM
4119 case CURLYX_end_fail: /* just failed to match all of A*B */
4120 regcpblow(ST.cp);
4121 cur_curlyx = ST.prev_curlyx;
4122 sayNO;
4123 /* NOTREACHED */
4633a7c4 4124
a0d0e21e 4125
c476f425
DM
4126#undef ST
4127#define ST st->u.whilem
4128
4129 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4130 {
4131 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
4132 I32 n;
4133 assert(cur_curlyx); /* keep Coverity happy */
4134 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4135 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4136 ST.cache_offset = 0;
4137 ST.cache_mask = 0;
4138
4139 PL_reginput = locinput;
4140
4141 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4142 "%*s whilem: matched %ld out of %ld..%ld\n",
4143 REPORT_CODE_OFF+depth*2, "", (long)n,
4144 (long)cur_curlyx->u.curlyx.min,
4145 (long)cur_curlyx->u.curlyx.max)
4146 );
a0d0e21e 4147
c476f425 4148 /* First just match a string of min A's. */
a0d0e21e 4149
c476f425
DM
4150 if (n < cur_curlyx->u.curlyx.min) {
4151 cur_curlyx->u.curlyx.lastloc = locinput;
4152 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4153 /* NOTREACHED */
4154 }
4155
4156 /* If degenerate A matches "", assume A done. */
4157
4158 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4159 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4160 "%*s whilem: empty match detected, trying continuation...\n",
4161 REPORT_CODE_OFF+depth*2, "")
4162 );
4163 goto do_whilem_B_max;
4164 }
4165
4166 /* super-linear cache processing */
4167
4168 if (scan->flags) {
a0d0e21e 4169
2c2d71f5 4170 if (!PL_reg_maxiter) {
c476f425
DM
4171 /* start the countdown: Postpone detection until we
4172 * know the match is not *that* much linear. */
2c2d71f5 4173 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4174 /* possible overflow for long strings and many CURLYX's */
4175 if (PL_reg_maxiter < 0)
4176 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4177 PL_reg_leftiter = PL_reg_maxiter;
4178 }
c476f425 4179
2c2d71f5 4180 if (PL_reg_leftiter-- == 0) {
c476f425 4181 /* initialise cache */
3298f257 4182 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4183 if (PL_reg_poscache) {
eb160463 4184 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4185 Renew(PL_reg_poscache, size, char);
4186 PL_reg_poscache_size = size;
4187 }
4188 Zero(PL_reg_poscache, size, char);
4189 }
4190 else {
4191 PL_reg_poscache_size = size;
a02a5408 4192 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4193 }
c476f425
DM
4194 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4195 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4196 PL_colors[4], PL_colors[5])
4197 );
2c2d71f5 4198 }
c476f425 4199
2c2d71f5 4200 if (PL_reg_leftiter < 0) {
c476f425
DM
4201 /* have we already failed at this position? */
4202 I32 offset, mask;
4203 offset = (scan->flags & 0xf) - 1
4204 + (locinput - PL_bostr) * (scan->flags>>4);
4205 mask = 1 << (offset % 8);
4206 offset /= 8;
4207 if (PL_reg_poscache[offset] & mask) {
4208 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4209 "%*s whilem: (cache) already tried at this position...\n",
4210 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4211 );
3298f257 4212 sayNO; /* cache records failure */
2c2d71f5 4213 }
c476f425
DM
4214 ST.cache_offset = offset;
4215 ST.cache_mask = mask;
2c2d71f5 4216 }
c476f425 4217 }
2c2d71f5 4218
c476f425 4219 /* Prefer B over A for minimal matching. */
a687059c 4220
c476f425
DM
4221 if (cur_curlyx->u.curlyx.minmod) {
4222 ST.save_curlyx = cur_curlyx;
4223 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4224 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4225 REGCP_SET(ST.lastcp);
4226 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4227 /* NOTREACHED */
4228 }
a0d0e21e 4229
c476f425
DM
4230 /* Prefer A over B for maximal matching. */
4231
4232 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4233 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4234 cur_curlyx->u.curlyx.lastloc = locinput;
4235 REGCP_SET(ST.lastcp);
4236 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4237 /* NOTREACHED */
4238 }
4239 goto do_whilem_B_max;
4240 }
4241 /* NOTREACHED */
4242
4243 case WHILEM_B_min: /* just matched B in a minimal match */
4244 case WHILEM_B_max: /* just matched B in a maximal match */
4245 cur_curlyx = ST.save_curlyx;
4246 sayYES;
4247 /* NOTREACHED */
4248
4249 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4250 cur_curlyx = ST.save_curlyx;
4251 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4252 cur_curlyx->u.curlyx.count--;
4253 CACHEsayNO;
4254 /* NOTREACHED */
4255
4256 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4257 REGCP_UNWIND(ST.lastcp);
4258 regcppop(rex);
4259 /* FALL THROUGH */
4260 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4261 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4262 cur_curlyx->u.curlyx.count--;
4263 CACHEsayNO;
4264 /* NOTREACHED */
4265
4266 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4267 REGCP_UNWIND(ST.lastcp);
4268 regcppop(rex); /* Restore some previous $<digit>s? */
4269 PL_reginput = locinput;
4270 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4271 "%*s whilem: failed, trying continuation...\n",
4272 REPORT_CODE_OFF+depth*2, "")
4273 );
4274 do_whilem_B_max:
4275 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4276 && ckWARN(WARN_REGEXP)
4277 && !(PL_reg_flags & RF_warned))
4278 {
4279 PL_reg_flags |= RF_warned;
4280 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4281 "Complex regular subexpression recursion",
4282 REG_INFTY - 1);
4283 }
4284
4285 /* now try B */
4286 ST.save_curlyx = cur_curlyx;
4287 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4288 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4289 /* NOTREACHED */
4290
4291 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4292 cur_curlyx = ST.save_curlyx;
4293 REGCP_UNWIND(ST.lastcp);
4294 regcppop(rex);
4295
4296 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4297 /* Maximum greed exceeded */
4298 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4299 && ckWARN(WARN_REGEXP)
4300 && !(PL_reg_flags & RF_warned))
4301 {
3280af22 4302 PL_reg_flags |= RF_warned;
c476f425
DM
4303 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4304 "%s limit (%d) exceeded",
4305 "Complex regular subexpression recursion",
4306 REG_INFTY - 1);
a0d0e21e 4307 }
c476f425 4308 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4309 CACHEsayNO;
a0d0e21e 4310 }
c476f425
DM
4311
4312 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4313 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4314 );
4315 /* Try grabbing another A and see if it helps. */
4316 PL_reginput = locinput;
4317 cur_curlyx->u.curlyx.lastloc = locinput;
4318 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4319 REGCP_SET(ST.lastcp);
4320 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 4321 /* NOTREACHED */
40a82448
DM
4322
4323#undef ST
4324#define ST st->u.branch
4325
4326 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4327 next = scan + ARG(scan);
4328 if (next == scan)
4329 next = NULL;
40a82448
DM
4330 scan = NEXTOPER(scan);
4331 /* FALL THROUGH */
c277df42 4332
40a82448
DM
4333 case BRANCH: /* /(...|A|...)/ */
4334 scan = NEXTOPER(scan); /* scan now points to inner node */
40a82448
DM
4335 ST.lastparen = *PL_reglastparen;
4336 ST.next_branch = next;
4337 REGCP_SET(ST.cp);
4338 PL_reginput = locinput;
02db2b7b 4339
40a82448 4340 /* Now go into the branch */
5d458dd8
YO
4341 if (has_cutgroup) {
4342 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4343 } else {
4344 PUSH_STATE_GOTO(BRANCH_next, scan);
4345 }
40a82448 4346 /* NOTREACHED */
5d458dd8
YO
4347 case CUTGROUP:
4348 PL_reginput = locinput;
4349 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 4350 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
4351 PUSH_STATE_GOTO(CUTGROUP_next,next);
4352 /* NOTREACHED */
4353 case CUTGROUP_next_fail:
4354 do_cutgroup = 1;
4355 no_final = 1;
4356 if (st->u.mark.mark_name)
4357 sv_commit = st->u.mark.mark_name;
4358 sayNO;
4359 /* NOTREACHED */
4360 case BRANCH_next:
4361 sayYES;
4362 /* NOTREACHED */
40a82448 4363 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4364 if (do_cutgroup) {
4365 do_cutgroup = 0;
4366 no_final = 0;
4367 }
40a82448
DM
4368 REGCP_UNWIND(ST.cp);
4369 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 4370 PL_regoffs[n].end = -1;
40a82448 4371 *PL_reglastparen = n;
0a4db386 4372 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4373 scan = ST.next_branch;
4374 /* no more branches? */
5d458dd8
YO
4375 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4376 DEBUG_EXECUTE_r({
4377 PerlIO_printf( Perl_debug_log,
4378 "%*s %sBRANCH failed...%s\n",
4379 REPORT_CODE_OFF+depth*2, "",
4380 PL_colors[4],
4381 PL_colors[5] );
4382 });
4383 sayNO_SILENT;
4384 }
40a82448
DM
4385 continue; /* execute next BRANCH[J] op */
4386 /* NOTREACHED */
4387
a0d0e21e 4388 case MINMOD:
24d3c4a9 4389 minmod = 1;
a0d0e21e 4390 break;
40a82448
DM
4391
4392#undef ST
4393#define ST st->u.curlym
4394
4395 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4396
4397 /* This is an optimisation of CURLYX that enables us to push
4398 * only a single backtracking state, no matter now many matches
4399 * there are in {m,n}. It relies on the pattern being constant
4400 * length, with no parens to influence future backrefs
4401 */
4402
4403 ST.me = scan;
dc45a647 4404 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4405
4406 /* if paren positive, emulate an OPEN/CLOSE around A */
4407 if (ST.me->flags) {
3b6647e0 4408 U32 paren = ST.me->flags;
40a82448
DM
4409 if (paren > PL_regsize)
4410 PL_regsize = paren;
3b6647e0 4411 if (paren > *PL_reglastparen)
40a82448 4412 *PL_reglastparen = paren;
c277df42 4413 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4414 }
40a82448
DM
4415 ST.A = scan;
4416 ST.B = next;
4417 ST.alen = 0;
4418 ST.count = 0;
24d3c4a9
DM
4419 ST.minmod = minmod;
4420 minmod = 0;
40a82448
DM
4421 ST.c1 = CHRTEST_UNINIT;
4422 REGCP_SET(ST.cp);
6407bf3b 4423
40a82448
DM
4424 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4425 goto curlym_do_B;
4426
4427 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4428 PL_reginput = locinput;
40a82448
DM
4429 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4430 /* NOTREACHED */
5f80c4cf 4431
40a82448
DM
4432 case CURLYM_A: /* we've just matched an A */
4433 locinput = st->locinput;
4434 nextchr = UCHARAT(locinput);
4435
4436 ST.count++;
4437 /* after first match, determine A's length: u.curlym.alen */
4438 if (ST.count == 1) {
4439 if (PL_reg_match_utf8) {
4440 char *s = locinput;
4441 while (s < PL_reginput) {
4442 ST.alen++;
4443 s += UTF8SKIP(s);
4444 }
4445 }
4446 else {
4447 ST.alen = PL_reginput - locinput;
4448 }
4449 if (ST.alen == 0)
4450 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4451 }
0cadcf80
DM
4452 DEBUG_EXECUTE_r(
4453 PerlIO_printf(Perl_debug_log,
40a82448 4454 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4455 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4456 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4457 );
4458
40a82448 4459 locinput = PL_reginput;
0a4db386
YO
4460
4461 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4462 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4463 goto fake_end;
4464
4465 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
40a82448
DM
4466 goto curlym_do_A; /* try to match another A */
4467 goto curlym_do_B; /* try to match B */
5f80c4cf 4468
40a82448
DM
4469 case CURLYM_A_fail: /* just failed to match an A */
4470 REGCP_UNWIND(ST.cp);
0a4db386
YO
4471
4472 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4473 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4474 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 4475 sayNO;
0cadcf80 4476
40a82448
DM
4477 curlym_do_B: /* execute the B in /A{m,n}B/ */
4478 PL_reginput = locinput;
4479 if (ST.c1 == CHRTEST_UNINIT) {
4480 /* calculate c1 and c2 for possible match of 1st char
4481 * following curly */
4482 ST.c1 = ST.c2 = CHRTEST_VOID;
4483 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4484 regnode *text_node = ST.B;
4485 if (! HAS_TEXT(text_node))
4486 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
4487 /* this used to be
4488
4489 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4490
4491 But the former is redundant in light of the latter.
4492
4493 if this changes back then the macro for
4494 IS_TEXT and friends need to change.
4495 */
4496 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 4497 {
ee9b8eae 4498
40a82448
DM
4499 ST.c1 = (U8)*STRING(text_node);
4500 ST.c2 =
ee9b8eae 4501 (IS_TEXTF(text_node))
40a82448 4502 ? PL_fold[ST.c1]
ee9b8eae 4503 : (IS_TEXTFL(text_node))
40a82448
DM
4504 ? PL_fold_locale[ST.c1]
4505 : ST.c1;
c277df42 4506 }
c277df42 4507 }
40a82448
DM
4508 }
4509
4510 DEBUG_EXECUTE_r(
4511 PerlIO_printf(Perl_debug_log,
4512 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4513 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4514 "", (IV)ST.count)
4515 );
4516 if (ST.c1 != CHRTEST_VOID
4517 && UCHARAT(PL_reginput) != ST.c1
4518 && UCHARAT(PL_reginput) != ST.c2)
4519 {
4520 /* simulate B failing */
3e901dc0
YO
4521 DEBUG_OPTIMISE_r(
4522 PerlIO_printf(Perl_debug_log,
4523 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4524 (int)(REPORT_CODE_OFF+(depth*2)),"",
4525 (IV)ST.c1,(IV)ST.c2
4526 ));
40a82448
DM
4527 state_num = CURLYM_B_fail;
4528 goto reenter_switch;
4529 }
4530
4531 if (ST.me->flags) {
4532 /* mark current A as captured */
4533 I32 paren = ST.me->flags;
4534 if (ST.count) {
f0ab9afb 4535 PL_regoffs[paren].start
40a82448 4536 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
f0ab9afb 4537 PL_regoffs[paren].end = PL_reginput - PL_bostr;
0a4db386 4538 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4539 }
40a82448 4540 else
f0ab9afb 4541 PL_regoffs[paren].end = -1;
0a4db386 4542 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4543 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4544 {
4545 if (ST.count)
4546 goto fake_end;
4547 else
4548 sayNO;
4549 }
c277df42 4550 }
0a4db386 4551
40a82448 4552 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4553 /* NOTREACHED */
40a82448
DM
4554
4555 case CURLYM_B_fail: /* just failed to match a B */
4556 REGCP_UNWIND(ST.cp);
4557 if (ST.minmod) {
4558 if (ST.count == ARG2(ST.me) /* max */)
4559 sayNO;
4560 goto curlym_do_A; /* try to match a further A */
4561 }
4562 /* backtrack one A */
4563 if (ST.count == ARG1(ST.me) /* min */)
4564 sayNO;
4565 ST.count--;
4566 locinput = HOPc(locinput, -ST.alen);
4567 goto curlym_do_B; /* try to match B */
4568
c255a977
DM
4569#undef ST
4570#define ST st->u.curly
40a82448 4571
c255a977
DM
4572#define CURLY_SETPAREN(paren, success) \
4573 if (paren) { \
4574 if (success) { \
f0ab9afb
NC
4575 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4576 PL_regoffs[paren].end = locinput - PL_bostr; \
0a4db386 4577 *PL_reglastcloseparen = paren; \
c255a977
DM
4578 } \
4579 else \
f0ab9afb 4580 PL_regoffs[paren].end = -1; \
c255a977
DM
4581 }
4582
4583 case STAR: /* /A*B/ where A is width 1 */
4584 ST.paren = 0;
4585 ST.min = 0;
4586 ST.max = REG_INFTY;
a0d0e21e
LW
4587 scan = NEXTOPER(scan);
4588 goto repeat;
c255a977
DM
4589 case PLUS: /* /A+B/ where A is width 1 */
4590 ST.paren = 0;
4591 ST.min = 1;
4592 ST.max = REG_INFTY;
c277df42 4593 scan = NEXTOPER(scan);
c255a977
DM
4594 goto repeat;
4595 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4596 ST.paren = scan->flags; /* Which paren to set */
4597 if (ST.paren > PL_regsize)
4598 PL_regsize = ST.paren;
3b6647e0 4599 if (ST.paren > *PL_reglastparen)
c255a977
DM
4600 *PL_reglastparen = ST.paren;
4601 ST.min = ARG1(scan); /* min to match */
4602 ST.max = ARG2(scan); /* max to match */
0a4db386 4603 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4604 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4605 ST.min=1;
4606 ST.max=1;
4607 }
c255a977
DM
4608 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4609 goto repeat;
4610 case CURLY: /* /A{m,n}B/ where A is width 1 */
4611 ST.paren = 0;
4612 ST.min = ARG1(scan); /* min to match */
4613 ST.max = ARG2(scan); /* max to match */
4614 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4615 repeat:
a0d0e21e
LW
4616 /*
4617 * Lookahead to avoid useless match attempts
4618 * when we know what character comes next.
c255a977 4619 *
5f80c4cf
JP
4620 * Used to only do .*x and .*?x, but now it allows
4621 * for )'s, ('s and (?{ ... })'s to be in the way
4622 * of the quantifier and the EXACT-like node. -- japhy
4623 */
4624
c255a977
DM
4625 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4626 sayNO;
cca55fe3 4627 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4628 U8 *s;
4629 regnode *text_node = next;
4630
3dab1dad
YO
4631 if (! HAS_TEXT(text_node))
4632 FIND_NEXT_IMPT(text_node);
5f80c4cf 4633
9e137952 4634 if (! HAS_TEXT(text_node))
c255a977 4635 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4636 else {
ee9b8eae 4637 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 4638 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4639 goto assume_ok_easy;
cca55fe3 4640 }
be8e71aa
YO
4641 else
4642 s = (U8*)STRING(text_node);
ee9b8eae
YO
4643
4644 /* Currently we only get here when
4645
4646 PL_rekind[OP(text_node)] == EXACT
4647
4648 if this changes back then the macro for IS_TEXT and
4649 friends need to change. */
5f80c4cf 4650 if (!UTF) {
c255a977 4651 ST.c2 = ST.c1 = *s;
ee9b8eae 4652 if (IS_TEXTF(text_node))
c255a977 4653 ST.c2 = PL_fold[ST.c1];
ee9b8eae 4654 else if (IS_TEXTFL(text_node))
c255a977 4655 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4656 }
5f80c4cf 4657 else { /* UTF */
ee9b8eae 4658 if (IS_TEXTF(text_node)) {
a2a2844f 4659 STRLEN ulen1, ulen2;
89ebb4a3
JH
4660 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4661 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4662
4663 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4664 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4665#ifdef EBCDIC
4666 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4667 ckWARN(WARN_UTF8) ?
4668 0 : UTF8_ALLOW_ANY);
4669 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4670 ckWARN(WARN_UTF8) ?
4671 0 : UTF8_ALLOW_ANY);
4672#else
c255a977 4673 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4674 uniflags);
c255a977 4675 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4676 uniflags);
4677#endif
5f80c4cf
JP
4678 }
4679 else {
c255a977 4680 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4681 uniflags);
5f80c4cf 4682 }
1aa99e6b
IH
4683 }
4684 }
bbce6d69 4685 }
a0d0e21e 4686 else
c255a977 4687 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4688 assume_ok_easy:
c255a977
DM
4689
4690 ST.A = scan;
4691 ST.B = next;
3280af22 4692 PL_reginput = locinput;
24d3c4a9
DM
4693 if (minmod) {
4694 minmod = 0;
e2e6a0f1 4695 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 4696 sayNO;
c255a977 4697 ST.count = ST.min;
a0ed51b3 4698 locinput = PL_reginput;
c255a977
DM
4699 REGCP_SET(ST.cp);
4700 if (ST.c1 == CHRTEST_VOID)
4701 goto curly_try_B_min;
4702
4703 ST.oldloc = locinput;
4704
4705 /* set ST.maxpos to the furthest point along the
4706 * string that could possibly match */
4707 if (ST.max == REG_INFTY) {
4708 ST.maxpos = PL_regeol - 1;
4709 if (do_utf8)
4710 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4711 ST.maxpos--;
4712 }
4713 else if (do_utf8) {
4714 int m = ST.max - ST.min;
4715 for (ST.maxpos = locinput;
4716 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4717 ST.maxpos += UTF8SKIP(ST.maxpos);
4718 }
4719 else {
4720 ST.maxpos = locinput + ST.max - ST.min;
4721 if (ST.maxpos >= PL_regeol)
4722 ST.maxpos = PL_regeol - 1;
4723 }
4724 goto curly_try_B_min_known;
4725
4726 }
4727 else {
e2e6a0f1 4728 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
4729 locinput = PL_reginput;
4730 if (ST.count < ST.min)
4731 sayNO;
4732 if ((ST.count > ST.min)
4733 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4734 {
4735 /* A{m,n} must come at the end of the string, there's
4736 * no point in backing off ... */
4737 ST.min = ST.count;
4738 /* ...except that $ and \Z can match before *and* after
4739 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4740 We may back off by one in this case. */
4741 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4742 ST.min--;
4743 }
4744 REGCP_SET(ST.cp);
4745 goto curly_try_B_max;
4746 }
4747 /* NOTREACHED */
4748
4749
4750 case CURLY_B_min_known_fail:
4751 /* failed to find B in a non-greedy match where c1,c2 valid */
4752 if (ST.paren && ST.count)
f0ab9afb 4753 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4754
4755 PL_reginput = locinput; /* Could be reset... */
4756 REGCP_UNWIND(ST.cp);
4757 /* Couldn't or didn't -- move forward. */
4758 ST.oldloc = locinput;
4759 if (do_utf8)
4760 locinput += UTF8SKIP(locinput);
4761 else
4762 locinput++;
4763 ST.count++;
4764 curly_try_B_min_known:
4765 /* find the next place where 'B' could work, then call B */
4766 {
4767 int n;
4768 if (do_utf8) {
4769 n = (ST.oldloc == locinput) ? 0 : 1;
4770 if (ST.c1 == ST.c2) {
4771 STRLEN len;
4772 /* set n to utf8_distance(oldloc, locinput) */
4773 while (locinput <= ST.maxpos &&
4774 utf8n_to_uvchr((U8*)locinput,
4775 UTF8_MAXBYTES, &len,
4776 uniflags) != (UV)ST.c1) {
4777 locinput += len;
4778 n++;
4779 }
1aa99e6b
IH
4780 }
4781 else {
c255a977
DM
4782 /* set n to utf8_distance(oldloc, locinput) */
4783 while (locinput <= ST.maxpos) {
4784 STRLEN len;
4785 const UV c = utf8n_to_uvchr((U8*)locinput,
4786 UTF8_MAXBYTES, &len,
4787 uniflags);
4788 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4789 break;
4790 locinput += len;
4791 n++;
1aa99e6b 4792 }
0fe9bf95
IZ
4793 }
4794 }
c255a977
DM
4795 else {
4796 if (ST.c1 == ST.c2) {
4797 while (locinput <= ST.maxpos &&
4798 UCHARAT(locinput) != ST.c1)
4799 locinput++;
bbce6d69 4800 }
c255a977
DM
4801 else {
4802 while (locinput <= ST.maxpos
4803 && UCHARAT(locinput) != ST.c1
4804 && UCHARAT(locinput) != ST.c2)
4805 locinput++;
a0ed51b3 4806 }
c255a977
DM
4807 n = locinput - ST.oldloc;
4808 }
4809 if (locinput > ST.maxpos)
4810 sayNO;
4811 /* PL_reginput == oldloc now */
4812 if (n) {
4813 ST.count += n;
e2e6a0f1 4814 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 4815 sayNO;
a0d0e21e 4816 }
c255a977
DM
4817 PL_reginput = locinput;
4818 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4819 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4820 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4821 goto fake_end;
4822 }
c255a977 4823 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4824 }
c255a977
DM
4825 /* NOTREACHED */
4826
4827
4828 case CURLY_B_min_fail:
4829 /* failed to find B in a non-greedy match where c1,c2 invalid */
4830 if (ST.paren && ST.count)
f0ab9afb 4831 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4832
4833 REGCP_UNWIND(ST.cp);
4834 /* failed -- move forward one */
4835 PL_reginput = locinput;
e2e6a0f1 4836 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 4837 ST.count++;
a0ed51b3 4838 locinput = PL_reginput;
c255a977
DM
4839 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4840 ST.count > 0)) /* count overflow ? */
15272685 4841 {
c255a977
DM
4842 curly_try_B_min:
4843 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4844 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4845 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4846 goto fake_end;
4847 }
c255a977 4848 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4849 }
4850 }
4633a7c4 4851 sayNO;
c255a977
DM
4852 /* NOTREACHED */
4853
4854
4855 curly_try_B_max:
4856 /* a successful greedy match: now try to match B */
40d049e4 4857 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4858 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
4859 goto fake_end;
4860 }
c255a977
DM
4861 {
4862 UV c = 0;
4863 if (ST.c1 != CHRTEST_VOID)
4864 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4865 UTF8_MAXBYTES, 0, uniflags)
466787eb 4866 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4867 /* If it could work, try it. */
4868 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4869 CURLY_SETPAREN(ST.paren, ST.count);
4870 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4871 /* NOTREACHED */
4872 }
4873 }
4874 /* FALL THROUGH */
4875 case CURLY_B_max_fail:
4876 /* failed to find B in a greedy match */
4877 if (ST.paren && ST.count)
f0ab9afb 4878 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4879
4880 REGCP_UNWIND(ST.cp);
4881 /* back up. */
4882 if (--ST.count < ST.min)
4883 sayNO;
4884 PL_reginput = locinput = HOPc(locinput, -1);
4885 goto curly_try_B_max;
4886
4887#undef ST
4888
a0d0e21e 4889 case END:
6bda09f9 4890 fake_end:
faec1544
DM
4891 if (cur_eval) {
4892 /* we've just finished A in /(??{A})B/; now continue with B */
4893 I32 tmpix;
faec1544
DM
4894 st->u.eval.toggle_reg_flags
4895 = cur_eval->u.eval.toggle_reg_flags;
4896 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4897
288b8c02
NC
4898 st->u.eval.prev_rex = rex_sv; /* inner */
4899 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
4900 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4901 rexi = RXi_GET(rex);
faec1544 4902 cur_curlyx = cur_eval->u.eval.prev_curlyx;
288b8c02 4903 ReREFCNT_inc(rex_sv);
faec1544
DM
4904 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4905 REGCP_SET(st->u.eval.lastcp);
4906 PL_reginput = locinput;
4907
4908 /* Restore parens of the outer rex without popping the
4909 * savestack */
4910 tmpix = PL_savestack_ix;
4911 PL_savestack_ix = cur_eval->u.eval.lastcp;
4912 regcppop(rex);
4913 PL_savestack_ix = tmpix;
4914
4915 st->u.eval.prev_eval = cur_eval;
4916 cur_eval = cur_eval->u.eval.prev_eval;
4917 DEBUG_EXECUTE_r(
2a49f0f5
JH
4918 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4919 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
4920 if ( nochange_depth )
4921 nochange_depth--;
4922
4923 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
4924 st->u.eval.prev_eval->u.eval.B); /* match B */
4925 }
4926
3b0527fe 4927 if (locinput < reginfo->till) {
a3621e74 4928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4929 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4930 PL_colors[4],
4931 (long)(locinput - PL_reg_starttry),
3b0527fe 4932 (long)(reginfo->till - PL_reg_starttry),
7821416a 4933 PL_colors[5]));
58e23c8d 4934
262b90c4 4935 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4936 }
4937 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4938 sayYES; /* Success! */
dad79028
DM
4939
4940 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4941 DEBUG_EXECUTE_r(
4942 PerlIO_printf(Perl_debug_log,
4943 "%*s %ssubpattern success...%s\n",
5bc10b2c 4944 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4945 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4946 sayYES; /* Success! */
dad79028 4947
40a82448
DM
4948#undef ST
4949#define ST st->u.ifmatch
4950
4951 case SUSPEND: /* (?>A) */
4952 ST.wanted = 1;
9fe1d20c 4953 PL_reginput = locinput;
9041c2e3 4954 goto do_ifmatch;
dad79028 4955
40a82448
DM
4956 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4957 ST.wanted = 0;
dad79028
DM
4958 goto ifmatch_trivial_fail_test;
4959
40a82448
DM
4960 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4961 ST.wanted = 1;
dad79028 4962 ifmatch_trivial_fail_test:
a0ed51b3 4963 if (scan->flags) {
52657f30 4964 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4965 if (!s) {
4966 /* trivial fail */
24d3c4a9
DM
4967 if (logical) {
4968 logical = 0;
4969 sw = 1 - (bool)ST.wanted;
dad79028 4970 }
40a82448 4971 else if (ST.wanted)
dad79028
DM
4972 sayNO;
4973 next = scan + ARG(scan);
4974 if (next == scan)
4975 next = NULL;
4976 break;
4977 }
efb30f32 4978 PL_reginput = s;
a0ed51b3
LW
4979 }
4980 else
4981 PL_reginput = locinput;
4982
c277df42 4983 do_ifmatch:
40a82448 4984 ST.me = scan;
24d3c4a9 4985 ST.logical = logical;
24d786f4
YO
4986 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
4987
40a82448
DM
4988 /* execute body of (?...A) */
4989 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4990 /* NOTREACHED */
4991
4992 case IFMATCH_A_fail: /* body of (?...A) failed */
4993 ST.wanted = !ST.wanted;
4994 /* FALL THROUGH */
4995
4996 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
4997 if (ST.logical) {
4998 sw = (bool)ST.wanted;
40a82448
DM
4999 }
5000 else if (!ST.wanted)
5001 sayNO;
5002
5003 if (OP(ST.me) == SUSPEND)
5004 locinput = PL_reginput;
5005 else {
5006 locinput = PL_reginput = st->locinput;
5007 nextchr = UCHARAT(locinput);
5008 }
5009 scan = ST.me + ARG(ST.me);
5010 if (scan == ST.me)
5011 scan = NULL;
5012 continue; /* execute B */
5013
5014#undef ST
dad79028 5015
c277df42 5016 case LONGJMP:
c277df42
IZ
5017 next = scan + ARG(scan);
5018 if (next == scan)
5019 next = NULL;
a0d0e21e 5020 break;
54612592 5021 case COMMIT:
e2e6a0f1
YO
5022 reginfo->cutpoint = PL_regeol;
5023 /* FALLTHROUGH */
5d458dd8 5024 case PRUNE:
24b23f37 5025 PL_reginput = locinput;
e2e6a0f1 5026 if (!scan->flags)
ad64d0ec 5027 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592
YO
5028 PUSH_STATE_GOTO(COMMIT_next,next);
5029 /* NOTREACHED */
5030 case COMMIT_next_fail:
5031 no_final = 1;
5032 /* FALLTHROUGH */
7f69552c
YO
5033 case OPFAIL:
5034 sayNO;
e2e6a0f1
YO
5035 /* NOTREACHED */
5036
5037#define ST st->u.mark
5038 case MARKPOINT:
5039 ST.prev_mark = mark_state;
5d458dd8 5040 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5041 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5042 mark_state = st;
5043 ST.mark_loc = PL_reginput = locinput;
5044 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5045 /* NOTREACHED */
5046 case MARKPOINT_next:
5047 mark_state = ST.prev_mark;
5048 sayYES;
5049 /* NOTREACHED */
5050 case MARKPOINT_next_fail:
5d458dd8 5051 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5052 {
5053 if (ST.mark_loc > startpoint)
5054 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5055 popmark = NULL; /* we found our mark */
5056 sv_commit = ST.mark_name;
5057
5058 DEBUG_EXECUTE_r({
5d458dd8 5059 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5060 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5061 REPORT_CODE_OFF+depth*2, "",
be2597df 5062 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5063 });
5064 }
5065 mark_state = ST.prev_mark;
5d458dd8
YO
5066 sv_yes_mark = mark_state ?
5067 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
5068 sayNO;
5069 /* NOTREACHED */
5d458dd8
YO
5070 case SKIP:
5071 PL_reginput = locinput;
5072 if (scan->flags) {
2bf803e2 5073 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5074 ST.mark_name = NULL;
e2e6a0f1 5075 ST.mark_loc = locinput;
5d458dd8
YO
5076 PUSH_STATE_GOTO(SKIP_next,next);
5077 } else {
2bf803e2 5078 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5079 otherwise do nothing. Meaning we need to scan
5080 */
5081 regmatch_state *cur = mark_state;
ad64d0ec 5082 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5083
5084 while (cur) {
5085 if ( sv_eq( cur->u.mark.mark_name,
5086 find ) )
5087 {
5088 ST.mark_name = find;
5089 PUSH_STATE_GOTO( SKIP_next, next );
5090 }
5091 cur = cur->u.mark.prev_mark;
5092 }
e2e6a0f1 5093 }
2bf803e2 5094 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5095 break;
5096 case SKIP_next_fail:
5097 if (ST.mark_name) {
5098 /* (*CUT:NAME) - Set up to search for the name as we
5099 collapse the stack*/
5100 popmark = ST.mark_name;
5101 } else {
5102 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5103 if (ST.mark_loc > startpoint)
5104 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5105 /* but we set sv_commit to latest mark_name if there
5106 is one so they can test to see how things lead to this
5107 cut */
5108 if (mark_state)
5109 sv_commit=mark_state->u.mark.mark_name;
5110 }
e2e6a0f1
YO
5111 no_final = 1;
5112 sayNO;
5113 /* NOTREACHED */
5114#undef ST
32e6a07c
YO
5115 case FOLDCHAR:
5116 n = ARG(scan);
81d4fa0f 5117 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
e64b1bd1
YO
5118 locinput += ln;
5119 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5120 sayNO;
5121 } else {
5122 U8 folded[UTF8_MAXBYTES_CASE+1];
5123 STRLEN foldlen;
5124 const char * const l = locinput;
5125 char *e = PL_regeol;
5126 to_uni_fold(n, folded, &foldlen);
5127
59fe32ea 5128 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
e64b1bd1 5129 l, &e, 0, do_utf8)) {
32e6a07c 5130 sayNO;
e64b1bd1
YO
5131 }
5132 locinput = e;
32e6a07c
YO
5133 }
5134 nextchr = UCHARAT(locinput);
5135 break;
e1d1eefb
YO
5136 case LNBREAK:
5137 if ((n=is_LNBREAK(locinput,do_utf8))) {
5138 locinput += n;
5139 nextchr = UCHARAT(locinput);
5140 } else
5141 sayNO;
5142 break;
5143
5144#define CASE_CLASS(nAmE) \
5145 case nAmE: \
5146 if ((n=is_##nAmE(locinput,do_utf8))) { \
5147 locinput += n; \
5148 nextchr = UCHARAT(locinput); \
5149 } else \
5150 sayNO; \
5151 break; \
5152 case N##nAmE: \
5153 if ((n=is_##nAmE(locinput,do_utf8))) { \
5154 sayNO; \
5155 } else { \
5156 locinput += UTF8SKIP(locinput); \
5157 nextchr = UCHARAT(locinput); \
5158 } \
5159 break
5160
5161 CASE_CLASS(VERTWS);
5162 CASE_CLASS(HORIZWS);
5163#undef CASE_CLASS
5164
a0d0e21e 5165 default:
b900a521 5166 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5167 PTR2UV(scan), OP(scan));
cea2e8a9 5168 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5169
5170 } /* end switch */
95b24440 5171
5d458dd8
YO
5172 /* switch break jumps here */
5173 scan = next; /* prepare to execute the next op and ... */
5174 continue; /* ... jump back to the top, reusing st */
95b24440
DM
5175 /* NOTREACHED */
5176
40a82448
DM
5177 push_yes_state:
5178 /* push a state that backtracks on success */
5179 st->u.yes.prev_yes_state = yes_state;
5180 yes_state = st;
5181 /* FALL THROUGH */
5182 push_state:
5183 /* push a new regex state, then continue at scan */
5184 {
5185 regmatch_state *newst;
5186
24b23f37
YO
5187 DEBUG_STACK_r({
5188 regmatch_state *cur = st;
5189 regmatch_state *curyes = yes_state;
5190 int curd = depth;
5191 regmatch_slab *slab = PL_regmatch_slab;
5192 for (;curd > -1;cur--,curd--) {
5193 if (cur < SLAB_FIRST(slab)) {
5194 slab = slab->prev;
5195 cur = SLAB_LAST(slab);
5196 }
5197 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5198 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5199 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5200 (curyes == cur) ? "yes" : ""
5201 );
5202 if (curyes == cur)
5203 curyes = cur->u.yes.prev_yes_state;
5204 }
5205 } else
5206 DEBUG_STATE_pp("push")
5207 );
40a82448 5208 depth++;
40a82448
DM
5209 st->locinput = locinput;
5210 newst = st+1;
5211 if (newst > SLAB_LAST(PL_regmatch_slab))
5212 newst = S_push_slab(aTHX);
5213 PL_regmatch_state = newst;
786e8c11 5214
40a82448
DM
5215 locinput = PL_reginput;
5216 nextchr = UCHARAT(locinput);
5217 st = newst;
5218 continue;
5219 /* NOTREACHED */
5220 }
a0d0e21e 5221 }
a687059c 5222
a0d0e21e
LW
5223 /*
5224 * We get here only if there's trouble -- normally "case END" is
5225 * the terminating point.
5226 */
cea2e8a9 5227 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5228 /*NOTREACHED*/
4633a7c4
LW
5229 sayNO;
5230
262b90c4 5231yes:
77cb431f
DM
5232 if (yes_state) {
5233 /* we have successfully completed a subexpression, but we must now
5234 * pop to the state marked by yes_state and continue from there */
77cb431f 5235 assert(st != yes_state);
5bc10b2c
DM
5236#ifdef DEBUGGING
5237 while (st != yes_state) {
5238 st--;
5239 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5240 PL_regmatch_slab = PL_regmatch_slab->prev;
5241 st = SLAB_LAST(PL_regmatch_slab);
5242 }
e2e6a0f1 5243 DEBUG_STATE_r({
54612592
YO
5244 if (no_final) {
5245 DEBUG_STATE_pp("pop (no final)");
5246 } else {
5247 DEBUG_STATE_pp("pop (yes)");
5248 }
e2e6a0f1 5249 });
5bc10b2c
DM
5250 depth--;
5251 }
5252#else
77cb431f
DM
5253 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5254 || yes_state > SLAB_LAST(PL_regmatch_slab))
5255 {
5256 /* not in this slab, pop slab */
5257 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5258 PL_regmatch_slab = PL_regmatch_slab->prev;
5259 st = SLAB_LAST(PL_regmatch_slab);
5260 }
5261 depth -= (st - yes_state);
5bc10b2c 5262#endif
77cb431f
DM
5263 st = yes_state;
5264 yes_state = st->u.yes.prev_yes_state;
5265 PL_regmatch_state = st;
24b23f37 5266
5d458dd8
YO
5267 if (no_final) {
5268 locinput= st->locinput;
5269 nextchr = UCHARAT(locinput);
5270 }
54612592 5271 state_num = st->resume_state + no_final;
24d3c4a9 5272 goto reenter_switch;
77cb431f
DM
5273 }
5274
a3621e74 5275 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5276 PL_colors[4], PL_colors[5]));
02db2b7b 5277
19b95bf0
DM
5278 if (PL_reg_eval_set) {
5279 /* each successfully executed (?{...}) block does the equivalent of
5280 * local $^R = do {...}
5281 * When popping the save stack, all these locals would be undone;
5282 * bypass this by setting the outermost saved $^R to the latest
5283 * value */
5284 if (oreplsv != GvSV(PL_replgv))
5285 sv_setsv(oreplsv, GvSV(PL_replgv));
5286 }
95b24440 5287 result = 1;
aa283a38 5288 goto final_exit;
4633a7c4
LW
5289
5290no:
a3621e74 5291 DEBUG_EXECUTE_r(
7821416a 5292 PerlIO_printf(Perl_debug_log,
786e8c11 5293 "%*s %sfailed...%s\n",
5bc10b2c 5294 REPORT_CODE_OFF+depth*2, "",
786e8c11 5295 PL_colors[4], PL_colors[5])
7821416a 5296 );
aa283a38 5297
262b90c4 5298no_silent:
54612592
YO
5299 if (no_final) {
5300 if (yes_state) {
5301 goto yes;
5302 } else {
5303 goto final_exit;
5304 }
5305 }
aa283a38
DM
5306 if (depth) {
5307 /* there's a previous state to backtrack to */
40a82448
DM
5308 st--;
5309 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5310 PL_regmatch_slab = PL_regmatch_slab->prev;
5311 st = SLAB_LAST(PL_regmatch_slab);
5312 }
5313 PL_regmatch_state = st;
40a82448
DM
5314 locinput= st->locinput;
5315 nextchr = UCHARAT(locinput);
5316
5bc10b2c
DM
5317 DEBUG_STATE_pp("pop");
5318 depth--;
262b90c4
DM
5319 if (yes_state == st)
5320 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5321
24d3c4a9
DM
5322 state_num = st->resume_state + 1; /* failure = success + 1 */
5323 goto reenter_switch;
95b24440 5324 }
24d3c4a9 5325 result = 0;
aa283a38 5326
262b90c4 5327 final_exit:
bbe252da 5328 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5329 SV *sv_err = get_sv("REGERROR", 1);
5330 SV *sv_mrk = get_sv("REGMARK", 1);
5331 if (result) {
e2e6a0f1 5332 sv_commit = &PL_sv_no;
5d458dd8
YO
5333 if (!sv_yes_mark)
5334 sv_yes_mark = &PL_sv_yes;
5335 } else {
5336 if (!sv_commit)
5337 sv_commit = &PL_sv_yes;
5338 sv_yes_mark = &PL_sv_no;
5339 }
5340 sv_setsv(sv_err, sv_commit);
5341 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5342 }
19b95bf0 5343
2f554ef7
DM
5344 /* clean up; in particular, free all slabs above current one */
5345 LEAVE_SCOPE(oldsave);
5d9a96ca 5346
95b24440 5347 return result;
a687059c
LW
5348}
5349
5350/*
5351 - regrepeat - repeatedly match something simple, report how many
5352 */
5353/*
5354 * [This routine now assumes that it will only match on things of length 1.
5355 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5356 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5357 */
76e3520e 5358STATIC I32
e2e6a0f1 5359S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5360{
27da23d5 5361 dVAR;
a0d0e21e 5362 register char *scan;
a0d0e21e 5363 register I32 c;
3280af22 5364 register char *loceol = PL_regeol;
a0ed51b3 5365 register I32 hardcount = 0;
53c4c00c 5366 register bool do_utf8 = PL_reg_match_utf8;
4f55667c
SP
5367#ifndef DEBUGGING
5368 PERL_UNUSED_ARG(depth);
5369#endif
a0d0e21e 5370
7918f24d
NC
5371 PERL_ARGS_ASSERT_REGREPEAT;
5372
3280af22 5373 scan = PL_reginput;
faf11cac
HS
5374 if (max == REG_INFTY)
5375 max = I32_MAX;
5376 else if (max < loceol - scan)
7f596f4c 5377 loceol = scan + max;
a0d0e21e 5378 switch (OP(p)) {
22c35a8c 5379 case REG_ANY:
1aa99e6b 5380 if (do_utf8) {
ffc61ed2 5381 loceol = PL_regeol;
1aa99e6b 5382 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5383 scan += UTF8SKIP(scan);
5384 hardcount++;
5385 }
5386 } else {
5387 while (scan < loceol && *scan != '\n')
5388 scan++;
a0ed51b3
LW
5389 }
5390 break;
ffc61ed2 5391 case SANY:
def8e4ea
JH
5392 if (do_utf8) {
5393 loceol = PL_regeol;
a0804c9e 5394 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5395 scan += UTF8SKIP(scan);
5396 hardcount++;
5397 }
5398 }
5399 else
5400 scan = loceol;
a0ed51b3 5401 break;
f33976b4
DB
5402 case CANY:
5403 scan = loceol;
5404 break;
090f7165
JH
5405 case EXACT: /* length of string is 1 */
5406 c = (U8)*STRING(p);
5407 while (scan < loceol && UCHARAT(scan) == c)
5408 scan++;
bbce6d69 5409 break;
5410 case EXACTF: /* length of string is 1 */
cd439c50 5411 c = (U8)*STRING(p);
bbce6d69 5412 while (scan < loceol &&
22c35a8c 5413 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 5414 scan++;
5415 break;
5416 case EXACTFL: /* length of string is 1 */
3280af22 5417 PL_reg_flags |= RF_tainted;
cd439c50 5418 c = (U8)*STRING(p);
bbce6d69 5419 while (scan < loceol &&
22c35a8c 5420 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
5421 scan++;
5422 break;
5423 case ANYOF:
ffc61ed2
JH
5424 if (do_utf8) {
5425 loceol = PL_regeol;
cfc92286 5426 while (hardcount < max && scan < loceol &&
32fc9b6a 5427 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
5428 scan += UTF8SKIP(scan);
5429 hardcount++;
5430 }
5431 } else {
32fc9b6a 5432 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
5433 scan++;
5434 }
a0d0e21e
LW
5435 break;
5436 case ALNUM:
1aa99e6b 5437 if (do_utf8) {
ffc61ed2 5438 loceol = PL_regeol;
1a4fad37 5439 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5440 while (hardcount < max && scan < loceol &&
3568d838 5441 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5442 scan += UTF8SKIP(scan);
5443 hardcount++;
5444 }
5445 } else {
5446 while (scan < loceol && isALNUM(*scan))
5447 scan++;
a0ed51b3
LW
5448 }
5449 break;
bbce6d69 5450 case ALNUML:
3280af22 5451 PL_reg_flags |= RF_tainted;
1aa99e6b 5452 if (do_utf8) {
ffc61ed2 5453 loceol = PL_regeol;
1aa99e6b
IH
5454 while (hardcount < max && scan < loceol &&
5455 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5456 scan += UTF8SKIP(scan);
5457 hardcount++;
5458 }
5459 } else {
5460 while (scan < loceol && isALNUM_LC(*scan))
5461 scan++;
a0ed51b3
LW
5462 }
5463 break;
a0d0e21e 5464 case NALNUM:
1aa99e6b 5465 if (do_utf8) {
ffc61ed2 5466 loceol = PL_regeol;
1a4fad37 5467 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5468 while (hardcount < max && scan < loceol &&
3568d838 5469 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5470 scan += UTF8SKIP(scan);
5471 hardcount++;
5472 }
5473 } else {
5474 while (scan < loceol && !isALNUM(*scan))
5475 scan++;
a0ed51b3
LW
5476 }
5477 break;
bbce6d69 5478 case NALNUML:
3280af22 5479 PL_reg_flags |= RF_tainted;
1aa99e6b 5480 if (do_utf8) {
ffc61ed2 5481 loceol = PL_regeol;
1aa99e6b
IH
5482 while (hardcount < max && scan < loceol &&
5483 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5484 scan += UTF8SKIP(scan);
5485 hardcount++;
5486 }
5487 } else {
5488 while (scan < loceol && !isALNUM_LC(*scan))
5489 scan++;
a0ed51b3
LW
5490 }
5491 break;
a0d0e21e 5492 case SPACE:
1aa99e6b 5493 if (do_utf8) {
ffc61ed2 5494 loceol = PL_regeol;
1a4fad37 5495 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5496 while (hardcount < max && scan < loceol &&
3568d838
JH
5497 (*scan == ' ' ||
5498 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5499 scan += UTF8SKIP(scan);
5500 hardcount++;
5501 }
5502 } else {
5503 while (scan < loceol && isSPACE(*scan))
5504 scan++;
a0ed51b3
LW
5505 }
5506 break;
bbce6d69 5507 case SPACEL:
3280af22 5508 PL_reg_flags |= RF_tainted;
1aa99e6b 5509 if (do_utf8) {
ffc61ed2 5510 loceol = PL_regeol;
1aa99e6b 5511 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5512 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5513 scan += UTF8SKIP(scan);
5514 hardcount++;
5515 }
5516 } else {
5517 while (scan < loceol && isSPACE_LC(*scan))
5518 scan++;
a0ed51b3
LW
5519 }
5520 break;
a0d0e21e 5521 case NSPACE:
1aa99e6b 5522 if (do_utf8) {
ffc61ed2 5523 loceol = PL_regeol;
1a4fad37 5524 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5525 while (hardcount < max && scan < loceol &&
3568d838
JH
5526 !(*scan == ' ' ||
5527 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5528 scan += UTF8SKIP(scan);
5529 hardcount++;
5530 }
5531 } else {
5532 while (scan < loceol && !isSPACE(*scan))
5533 scan++;
a0ed51b3 5534 }
0008a298 5535 break;
bbce6d69 5536 case NSPACEL:
3280af22 5537 PL_reg_flags |= RF_tainted;
1aa99e6b 5538 if (do_utf8) {
ffc61ed2 5539 loceol = PL_regeol;
1aa99e6b 5540 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5541 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5542 scan += UTF8SKIP(scan);
5543 hardcount++;
5544 }
5545 } else {
5546 while (scan < loceol && !isSPACE_LC(*scan))
5547 scan++;
a0ed51b3
LW
5548 }
5549 break;
a0d0e21e 5550 case DIGIT:
1aa99e6b 5551 if (do_utf8) {
ffc61ed2 5552 loceol = PL_regeol;
1a4fad37 5553 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5554 while (hardcount < max && scan < loceol &&
3568d838 5555 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5556 scan += UTF8SKIP(scan);
5557 hardcount++;
5558 }
5559 } else {
5560 while (scan < loceol && isDIGIT(*scan))
5561 scan++;
a0ed51b3
LW
5562 }
5563 break;
a0d0e21e 5564 case NDIGIT:
1aa99e6b 5565 if (do_utf8) {
ffc61ed2 5566 loceol = PL_regeol;
1a4fad37 5567 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5568 while (hardcount < max && scan < loceol &&
3568d838 5569 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5570 scan += UTF8SKIP(scan);
5571 hardcount++;
5572 }
5573 } else {
5574 while (scan < loceol && !isDIGIT(*scan))
5575 scan++;
a0ed51b3 5576 }
e1d1eefb
YO
5577 case LNBREAK:
5578 if (do_utf8) {
5579 loceol = PL_regeol;
5580 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5581 scan += c;
5582 hardcount++;
5583 }
5584 } else {
5585 /*
5586 LNBREAK can match two latin chars, which is ok,
5587 because we have a null terminated string, but we
5588 have to use hardcount in this situation
5589 */
5590 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5591 scan+=c;
5592 hardcount++;
5593 }
5594 }
5595 break;
5596 case HORIZWS:
5597 if (do_utf8) {
5598 loceol = PL_regeol;
5599 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5600 scan += c;
5601 hardcount++;
5602 }
5603 } else {
5604 while (scan < loceol && is_HORIZWS_latin1(scan))
5605 scan++;
5606 }
a0ed51b3 5607 break;
e1d1eefb
YO
5608 case NHORIZWS:
5609 if (do_utf8) {
5610 loceol = PL_regeol;
5611 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5612 scan += UTF8SKIP(scan);
5613 hardcount++;
5614 }
5615 } else {
5616 while (scan < loceol && !is_HORIZWS_latin1(scan))
5617 scan++;
5618
5619 }
5620 break;
5621 case VERTWS:
5622 if (do_utf8) {
5623 loceol = PL_regeol;
5624 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5625 scan += c;
5626 hardcount++;
5627 }
5628 } else {
5629 while (scan < loceol && is_VERTWS_latin1(scan))
5630 scan++;
5631
5632 }
5633 break;
5634 case NVERTWS:
5635 if (do_utf8) {
5636 loceol = PL_regeol;
5637 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5638 scan += UTF8SKIP(scan);
5639 hardcount++;
5640 }
5641 } else {
5642 while (scan < loceol && !is_VERTWS_latin1(scan))
5643 scan++;
5644
5645 }
5646 break;
5647
a0d0e21e
LW
5648 default: /* Called on something of 0 width. */
5649 break; /* So match right here or not at all. */
5650 }
a687059c 5651
a0ed51b3
LW
5652 if (hardcount)
5653 c = hardcount;
5654 else
5655 c = scan - PL_reginput;
3280af22 5656 PL_reginput = scan;
a687059c 5657
a3621e74 5658 DEBUG_r({
e68ec53f 5659 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5660 DEBUG_EXECUTE_r({
e68ec53f
YO
5661 SV * const prop = sv_newmortal();
5662 regprop(prog, prop, p);
5663 PerlIO_printf(Perl_debug_log,
be8e71aa 5664 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 5665 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5666 });
be8e71aa 5667 });
9041c2e3 5668
a0d0e21e 5669 return(c);
a687059c
LW
5670}
5671
c277df42 5672
be8e71aa 5673#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5674/*
ffc61ed2
JH
5675- regclass_swash - prepare the utf8 swash
5676*/
5677
5678SV *
32fc9b6a 5679Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5680{
97aff369 5681 dVAR;
9e55ce06
JH
5682 SV *sw = NULL;
5683 SV *si = NULL;
5684 SV *alt = NULL;
f8fc2ecf
YO
5685 RXi_GET_DECL(prog,progi);
5686 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 5687
7918f24d
NC
5688 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5689
4f639d21 5690 if (data && data->count) {
a3b680e6 5691 const U32 n = ARG(node);
ffc61ed2 5692
4f639d21 5693 if (data->what[n] == 's') {
ad64d0ec
NC
5694 SV * const rv = MUTABLE_SV(data->data[n]);
5695 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 5696 SV **const ary = AvARRAY(av);
9e55ce06 5697 SV **a, **b;
9041c2e3 5698
711a919c 5699 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5700 * documentation of these array elements. */
5701
b11f357e 5702 si = *ary;
fe5bfecd
JH
5703 a = SvROK(ary[1]) ? &ary[1] : NULL;
5704 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
b11f357e 5705
ffc61ed2
JH
5706 if (a)
5707 sw = *a;
5708 else if (si && doinit) {
5709 sw = swash_init("utf8", "", si, 1, 0);
5710 (void)av_store(av, 1, sw);
5711 }
9e55ce06
JH
5712 if (b)
5713 alt = *b;
ffc61ed2
JH
5714 }
5715 }
5716
9e55ce06
JH
5717 if (listsvp)
5718 *listsvp = si;
5719 if (altsvp)
5720 *altsvp = alt;
ffc61ed2
JH
5721
5722 return sw;
5723}
76234dfb 5724#endif
ffc61ed2
JH
5725
5726/*
ba7b4546 5727 - reginclass - determine if a character falls into a character class
832705d4
JH
5728
5729 The n is the ANYOF regnode, the p is the target string, lenp
5730 is pointer to the maximum length of how far to go in the p
5731 (if the lenp is zero, UTF8SKIP(p) is used),
5732 do_utf8 tells whether the target string is in UTF-8.
5733
bbce6d69 5734 */
5735
76e3520e 5736STATIC bool
32fc9b6a 5737S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5738{
27da23d5 5739 dVAR;
a3b680e6 5740 const char flags = ANYOF_FLAGS(n);
bbce6d69 5741 bool match = FALSE;
cc07378b 5742 UV c = *p;
ae9ddab8 5743 STRLEN len = 0;
9e55ce06 5744 STRLEN plen;
1aa99e6b 5745
7918f24d
NC
5746 PERL_ARGS_ASSERT_REGINCLASS;
5747
19f67299
TS
5748 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5749 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5750 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5751 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5752 if (len == (STRLEN)-1)
5753 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5754 }
bbce6d69 5755
0f0076b4 5756 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5757 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5758 if (lenp)
5759 *lenp = 0;
ffc61ed2 5760 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5761 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5762 match = TRUE;
bbce6d69 5763 }
3568d838 5764 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5765 match = TRUE;
ffc61ed2 5766 if (!match) {
9e55ce06 5767 AV *av;
32fc9b6a 5768 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5769
5770 if (sw) {
3568d838 5771 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5772 match = TRUE;
5773 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5774 if (!match && lenp && av) {
5775 I32 i;
9e55ce06 5776 for (i = 0; i <= av_len(av); i++) {
890ce7af 5777 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5778 STRLEN len;
890ce7af 5779 const char * const s = SvPV_const(sv, len);
9e55ce06 5780
061b10df 5781 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5782 *lenp = len;
5783 match = TRUE;
5784 break;
5785 }
5786 }
5787 }
5788 if (!match) {
89ebb4a3 5789 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5790 STRLEN tmplen;
5791
9e55ce06
JH
5792 to_utf8_fold(p, tmpbuf, &tmplen);
5793 if (swash_fetch(sw, tmpbuf, do_utf8))
5794 match = TRUE;
5795 }
ffc61ed2
JH
5796 }
5797 }
bbce6d69 5798 }
9e55ce06 5799 if (match && lenp && *lenp == 0)
0f0076b4 5800 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5801 }
1aa99e6b 5802 if (!match && c < 256) {
ffc61ed2
JH
5803 if (ANYOF_BITMAP_TEST(n, c))
5804 match = TRUE;
5805 else if (flags & ANYOF_FOLD) {
eb160463 5806 U8 f;
a0ed51b3 5807
ffc61ed2
JH
5808 if (flags & ANYOF_LOCALE) {
5809 PL_reg_flags |= RF_tainted;
5810 f = PL_fold_locale[c];
5811 }
5812 else
5813 f = PL_fold[c];
5814 if (f != c && ANYOF_BITMAP_TEST(n, f))
5815 match = TRUE;
5816 }
5817
5818 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5819 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5820 if (
5821 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5822 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5823 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5824 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5825 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5826 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5827 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5828 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5829 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5830 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5831 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5832 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5833 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5834 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5835 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5836 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5837 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5838 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5839 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5840 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5841 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5842 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5843 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5844 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5845 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5846 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5847 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5848 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5849 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5850 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5851 ) /* How's that for a conditional? */
5852 {
5853 match = TRUE;
5854 }
a0ed51b3 5855 }
a0ed51b3
LW
5856 }
5857
a0ed51b3
LW
5858 return (flags & ANYOF_INVERT) ? !match : match;
5859}
161b471a 5860
dfe13c55 5861STATIC U8 *
0ce71af7 5862S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5863{
97aff369 5864 dVAR;
7918f24d
NC
5865
5866 PERL_ARGS_ASSERT_REGHOP3;
5867
a0ed51b3 5868 if (off >= 0) {
1aa99e6b 5869 while (off-- && s < lim) {
ffc61ed2 5870 /* XXX could check well-formedness here */
a0ed51b3 5871 s += UTF8SKIP(s);
ffc61ed2 5872 }
a0ed51b3
LW
5873 }
5874 else {
1de06328
YO
5875 while (off++ && s > lim) {
5876 s--;
5877 if (UTF8_IS_CONTINUED(*s)) {
5878 while (s > lim && UTF8_IS_CONTINUATION(*s))
5879 s--;
a0ed51b3 5880 }
1de06328 5881 /* XXX could check well-formedness here */
a0ed51b3
LW
5882 }
5883 }
5884 return s;
5885}
161b471a 5886
f9f4320a
YO
5887#ifdef XXX_dmq
5888/* there are a bunch of places where we use two reghop3's that should
5889 be replaced with this routine. but since thats not done yet
5890 we ifdef it out - dmq
5891*/
dfe13c55 5892STATIC U8 *
1de06328
YO
5893S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5894{
5895 dVAR;
7918f24d
NC
5896
5897 PERL_ARGS_ASSERT_REGHOP4;
5898
1de06328
YO
5899 if (off >= 0) {
5900 while (off-- && s < rlim) {
5901 /* XXX could check well-formedness here */
5902 s += UTF8SKIP(s);
5903 }
5904 }
5905 else {
5906 while (off++ && s > llim) {
5907 s--;
5908 if (UTF8_IS_CONTINUED(*s)) {
5909 while (s > llim && UTF8_IS_CONTINUATION(*s))
5910 s--;
5911 }
5912 /* XXX could check well-formedness here */
5913 }
5914 }
5915 return s;
5916}
f9f4320a 5917#endif
1de06328
YO
5918
5919STATIC U8 *
0ce71af7 5920S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5921{
97aff369 5922 dVAR;
7918f24d
NC
5923
5924 PERL_ARGS_ASSERT_REGHOPMAYBE3;
5925
a0ed51b3 5926 if (off >= 0) {
1aa99e6b 5927 while (off-- && s < lim) {
ffc61ed2 5928 /* XXX could check well-formedness here */
a0ed51b3 5929 s += UTF8SKIP(s);
ffc61ed2 5930 }
a0ed51b3 5931 if (off >= 0)
3dab1dad 5932 return NULL;
a0ed51b3
LW
5933 }
5934 else {
1de06328
YO
5935 while (off++ && s > lim) {
5936 s--;
5937 if (UTF8_IS_CONTINUED(*s)) {
5938 while (s > lim && UTF8_IS_CONTINUATION(*s))
5939 s--;
a0ed51b3 5940 }
1de06328 5941 /* XXX could check well-formedness here */
a0ed51b3
LW
5942 }
5943 if (off <= 0)
3dab1dad 5944 return NULL;
a0ed51b3
LW
5945 }
5946 return s;
5947}
51371543 5948
51371543 5949static void
acfe0abc 5950restore_pos(pTHX_ void *arg)
51371543 5951{
97aff369 5952 dVAR;
097eb12c 5953 regexp * const rex = (regexp *)arg;
51371543
GS
5954 if (PL_reg_eval_set) {
5955 if (PL_reg_oldsaved) {
4f639d21
DM
5956 rex->subbeg = PL_reg_oldsaved;
5957 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5958#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5959 rex->saved_copy = PL_nrs;
ed252734 5960#endif
07bc277f 5961 RXp_MATCH_COPIED_on(rex);
51371543
GS
5962 }
5963 PL_reg_magic->mg_len = PL_reg_oldpos;
5964 PL_reg_eval_set = 0;
5965 PL_curpm = PL_reg_oldcurpm;
5966 }
5967}
33b8afdf
JH
5968
5969STATIC void
5970S_to_utf8_substr(pTHX_ register regexp *prog)
5971{
a1cac82e 5972 int i = 1;
7918f24d
NC
5973
5974 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
5975
a1cac82e
NC
5976 do {
5977 if (prog->substrs->data[i].substr
5978 && !prog->substrs->data[i].utf8_substr) {
5979 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5980 prog->substrs->data[i].utf8_substr = sv;
5981 sv_utf8_upgrade(sv);
610460f9
NC
5982 if (SvVALID(prog->substrs->data[i].substr)) {
5983 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5984 if (flags & FBMcf_TAIL) {
5985 /* Trim the trailing \n that fbm_compile added last
5986 time. */
5987 SvCUR_set(sv, SvCUR(sv) - 1);
5988 /* Whilst this makes the SV technically "invalid" (as its
5989 buffer is no longer followed by "\0") when fbm_compile()
5990 adds the "\n" back, a "\0" is restored. */
5991 }
5992 fbm_compile(sv, flags);
5993 }
a1cac82e
NC
5994 if (prog->substrs->data[i].substr == prog->check_substr)
5995 prog->check_utf8 = sv;
5996 }
5997 } while (i--);
33b8afdf
JH
5998}
5999
6000STATIC void
6001S_to_byte_substr(pTHX_ register regexp *prog)
6002{
97aff369 6003 dVAR;
a1cac82e 6004 int i = 1;
7918f24d
NC
6005
6006 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6007
a1cac82e
NC
6008 do {
6009 if (prog->substrs->data[i].utf8_substr
6010 && !prog->substrs->data[i].substr) {
6011 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6012 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9
NC
6013 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6014 const U8 flags
6015 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6016 if (flags & FBMcf_TAIL) {
6017 /* Trim the trailing \n that fbm_compile added last
6018 time. */
6019 SvCUR_set(sv, SvCUR(sv) - 1);
6020 }
6021 fbm_compile(sv, flags);
6022 }
a1cac82e
NC
6023 } else {
6024 SvREFCNT_dec(sv);
6025 sv = &PL_sv_undef;
6026 }
6027 prog->substrs->data[i].substr = sv;
6028 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6029 prog->check_substr = sv;
33b8afdf 6030 }
a1cac82e 6031 } while (i--);
33b8afdf 6032}
66610fdd
RGS
6033
6034/*
6035 * Local variables:
6036 * c-indentation-style: bsd
6037 * c-basic-offset: 4
6038 * indent-tabs-mode: t
6039 * End:
6040 *
37442d52
RGS
6041 * ex: set ts=8 sts=4 sw=4 noet:
6042 */