This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add a regression test for RT #68182.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
c277df42
IZ
83#define RF_tainted 1 /* tainted information used? */
84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
eb160463 88#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
32fc9b6a 97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 98
c277df42
IZ
99/*
100 * Forwards.
101 */
102
33b8afdf 103#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 105
3dab1dad
YO
106#define HOPc(pos,off) \
107 (char *)(PL_reg_match_utf8 \
52657f30 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
109 : (U8*)(pos + off))
110#define HOPBACKc(pos, off) \
07be1b83
YO
111 (char*)(PL_reg_match_utf8\
112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
113 : (pos - off >= PL_bostr) \
8e11feef 114 ? (U8*)pos - off \
3dab1dad 115 : NULL)
efb30f32 116
e7409c1b 117#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 118#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 119
1a4fad37
AL
120#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
121 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
122#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
123#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
124#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
125#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 126
3dab1dad
YO
127/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
128
5f80c4cf 129/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
130/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
131#define JUMPABLE(rn) ( \
132 OP(rn) == OPEN || \
133 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
134 OP(rn) == EVAL || \
cca55fe3
JP
135 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
136 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 137 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 138 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 139)
ee9b8eae 140#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 141
ee9b8eae
YO
142#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
143
144#if 0
145/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
146 we don't need this definition. */
147#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
148#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
149#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
150
151#else
152/* ... so we use this as its faster. */
153#define IS_TEXT(rn) ( OP(rn)==EXACT )
154#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
155#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
156
157#endif
e2d8ce26 158
a84d97b6
HS
159/*
160 Search for mandatory following text node; for lookahead, the text must
161 follow but for lookbehind (rn->flags != 0) we skip to the next step.
162*/
cca55fe3 163#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
164 while (JUMPABLE(rn)) { \
165 const OPCODE type = OP(rn); \
166 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 167 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 168 else if (type == PLUS) \
cca55fe3 169 rn = NEXTOPER(rn); \
3dab1dad 170 else if (type == IFMATCH) \
a84d97b6 171 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 172 else rn += NEXT_OFF(rn); \
3dab1dad 173 } \
5f80c4cf 174} STMT_END
74750237 175
c476f425 176
acfe0abc 177static void restore_pos(pTHX_ void *arg);
51371543 178
76e3520e 179STATIC CHECKPOINT
cea2e8a9 180S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 181{
97aff369 182 dVAR;
a3b680e6 183 const int retval = PL_savestack_ix;
b1ce53c5 184#define REGCP_PAREN_ELEMS 4
a3b680e6 185 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 186 int p;
40a82448 187 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 188
e49a9654
IH
189 if (paren_elems_to_push < 0)
190 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
191
f0ab9afb 192#define REGCP_OTHER_ELEMS 7
4b3c1a47 193 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 194
3280af22 195 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 196/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
197 SSPUSHINT(PL_regoffs[p].end);
198 SSPUSHINT(PL_regoffs[p].start);
3280af22 199 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 200 SSPUSHINT(p);
e7707071 201 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 202 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 203 (UV)p, (IV)PL_regoffs[p].start,
40a82448 204 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 205 (IV)PL_regoffs[p].end
40a82448 206 ));
a0d0e21e 207 }
b1ce53c5 208/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 209 SSPUSHPTR(PL_regoffs);
3280af22
NIS
210 SSPUSHINT(PL_regsize);
211 SSPUSHINT(*PL_reglastparen);
a01268b5 212 SSPUSHINT(*PL_reglastcloseparen);
3280af22 213 SSPUSHPTR(PL_reginput);
41123dfd
JH
214#define REGCP_FRAME_ELEMS 2
215/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
216 * are needed for the regexp context stack bookkeeping. */
217 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 218 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 219
a0d0e21e
LW
220 return retval;
221}
222
c277df42 223/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
224#define REGCP_SET(cp) \
225 DEBUG_STATE_r( \
ab3bbdeb 226 PerlIO_printf(Perl_debug_log, \
e4f74956 227 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
228 (IV)PL_savestack_ix)); \
229 cp = PL_savestack_ix
c3464db5 230
ab3bbdeb 231#define REGCP_UNWIND(cp) \
e4f74956 232 DEBUG_STATE_r( \
ab3bbdeb 233 if (cp != PL_savestack_ix) \
e4f74956
YO
234 PerlIO_printf(Perl_debug_log, \
235 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
236 (IV)(cp), (IV)PL_savestack_ix)); \
237 regcpblow(cp)
c277df42 238
76e3520e 239STATIC char *
097eb12c 240S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 241{
97aff369 242 dVAR;
3b6647e0 243 U32 i;
a0d0e21e 244 char *input;
a3621e74
YO
245 GET_RE_DEBUG_FLAGS_DECL;
246
7918f24d
NC
247 PERL_ARGS_ASSERT_REGCPPOP;
248
b1ce53c5 249 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 250 i = SSPOPINT;
b1ce53c5
JH
251 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
252 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 253 input = (char *) SSPOPPTR;
a01268b5 254 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
255 *PL_reglastparen = SSPOPINT;
256 PL_regsize = SSPOPINT;
f0ab9afb 257 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 258
6bda09f9 259
b1ce53c5 260 /* Now restore the parentheses context. */
41123dfd
JH
261 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
262 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 263 I32 tmps;
097eb12c 264 U32 paren = (U32)SSPOPINT;
3280af22 265 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 266 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 267 tmps = SSPOPINT;
3280af22 268 if (paren <= *PL_reglastparen)
f0ab9afb 269 PL_regoffs[paren].end = tmps;
e7707071 270 DEBUG_BUFFERS_r(
c3464db5 271 PerlIO_printf(Perl_debug_log,
b900a521 272 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 273 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 274 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 275 (IV)PL_regoffs[paren].end,
3280af22 276 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 277 );
a0d0e21e 278 }
e7707071 279 DEBUG_BUFFERS_r(
bb7a0f54 280 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 281 PerlIO_printf(Perl_debug_log,
faccc32b 282 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 283 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
284 }
285 );
daf18116 286#if 1
dafc8851
JH
287 /* It would seem that the similar code in regtry()
288 * already takes care of this, and in fact it is in
289 * a better location to since this code can #if 0-ed out
290 * but the code in regtry() is needed or otherwise tests
291 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
292 * (as of patchlevel 7877) will fail. Then again,
293 * this code seems to be necessary or otherwise
225593e1
DM
294 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
295 * --jhi updated by dapm */
3b6647e0 296 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 297 if (i > PL_regsize)
f0ab9afb
NC
298 PL_regoffs[i].start = -1;
299 PL_regoffs[i].end = -1;
a0d0e21e 300 }
dafc8851 301#endif
a0d0e21e
LW
302 return input;
303}
304
02db2b7b 305#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 306
a687059c 307/*
e50aee73 308 * pregexec and friends
a687059c
LW
309 */
310
76234dfb 311#ifndef PERL_IN_XSUB_RE
a687059c 312/*
c277df42 313 - pregexec - match a regexp against a string
a687059c 314 */
c277df42 315I32
49d7dfbc 316Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 317 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
318/* strend: pointer to null at end of string */
319/* strbeg: real beginning of string */
320/* minend: end of match must be >=minend after stringarg. */
321/* nosave: For optimizations. */
322{
7918f24d
NC
323 PERL_ARGS_ASSERT_PREGEXEC;
324
c277df42 325 return
9041c2e3 326 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
327 nosave ? 0 : REXEC_COPY_STR);
328}
76234dfb 329#endif
22e551b9 330
9041c2e3 331/*
cad2e5aa
JH
332 * Need to implement the following flags for reg_anch:
333 *
334 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
335 * USE_INTUIT_ML
336 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
337 * INTUIT_AUTORITATIVE_ML
338 * INTUIT_ONCE_NOML - Intuit can match in one location only.
339 * INTUIT_ONCE_ML
340 *
341 * Another flag for this function: SECOND_TIME (so that float substrs
342 * with giant delta may be not rechecked).
343 */
344
345/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
346
3f7c398e 347/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
348 Otherwise, only SvCUR(sv) is used to get strbeg. */
349
350/* XXXX We assume that strpos is strbeg unless sv. */
351
6eb5f6b9
JH
352/* XXXX Some places assume that there is a fixed substring.
353 An update may be needed if optimizer marks as "INTUITable"
354 RExen without fixed substrings. Similarly, it is assumed that
355 lengths of all the strings are no more than minlen, thus they
356 cannot come from lookahead.
40d049e4
YO
357 (Or minlen should take into account lookahead.)
358 NOTE: Some of this comment is not correct. minlen does now take account
359 of lookahead/behind. Further research is required. -- demerphq
360
361*/
6eb5f6b9 362
2c2d71f5
JH
363/* A failure to find a constant substring means that there is no need to make
364 an expensive call to REx engine, thus we celebrate a failure. Similarly,
365 finding a substring too deep into the string means that less calls to
30944b6d
IZ
366 regtry() should be needed.
367
368 REx compiler's optimizer found 4 possible hints:
369 a) Anchored substring;
370 b) Fixed substring;
371 c) Whether we are anchored (beginning-of-line or \G);
372 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 373 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
374 string which does not contradict any of them.
375 */
2c2d71f5 376
6eb5f6b9
JH
377/* Most of decisions we do here should have been done at compile time.
378 The nodes of the REx which we used for the search should have been
379 deleted from the finite automaton. */
380
cad2e5aa 381char *
288b8c02 382Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 383 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 384{
97aff369 385 dVAR;
288b8c02 386 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 387 register I32 start_shift = 0;
cad2e5aa 388 /* Should be nonnegative! */
b7953727 389 register I32 end_shift = 0;
2c2d71f5
JH
390 register char *s;
391 register SV *check;
a1933d95 392 char *strbeg;
cad2e5aa 393 char *t;
1de06328 394 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 395 I32 ml_anch;
bd61b366
SS
396 register char *other_last = NULL; /* other substr checked before this */
397 char *check_at = NULL; /* check substr found at this pos */
bbe252da 398 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 399 RXi_GET_DECL(prog,progi);
30944b6d 400#ifdef DEBUGGING
890ce7af 401 const char * const i_strpos = strpos;
30944b6d 402#endif
a3621e74
YO
403 GET_RE_DEBUG_FLAGS_DECL;
404
7918f24d
NC
405 PERL_ARGS_ASSERT_RE_INTUIT_START;
406
288b8c02 407 RX_MATCH_UTF8_set(rx,do_utf8);
cad2e5aa 408
3c8556c3 409 if (RX_UTF8(rx)) {
b8d68ded
JH
410 PL_reg_flags |= RF_utf8;
411 }
ab3bbdeb 412 DEBUG_EXECUTE_r(
efd26800 413 debug_start_match(rx, do_utf8, strpos, strend,
1de06328
YO
414 sv ? "Guessing start of match in sv for"
415 : "Guessing start of match in string for");
2a782b5b 416 );
cad2e5aa 417
c344f387
JH
418 /* CHR_DIST() would be more correct here but it makes things slow. */
419 if (prog->minlen > strend - strpos) {
a3621e74 420 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 421 "String too short... [re_intuit_start]\n"));
cad2e5aa 422 goto fail;
2c2d71f5 423 }
1de06328 424
a1933d95 425 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 426 PL_regeol = strend;
33b8afdf
JH
427 if (do_utf8) {
428 if (!prog->check_utf8 && prog->check_substr)
429 to_utf8_substr(prog);
430 check = prog->check_utf8;
431 } else {
432 if (!prog->check_substr && prog->check_utf8)
433 to_byte_substr(prog);
434 check = prog->check_substr;
435 }
1de06328 436 if (check == &PL_sv_undef) {
a3621e74 437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 438 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
439 goto fail;
440 }
bbe252da
YO
441 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
442 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
443 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 444 && !multiline ) ); /* Check after \n? */
cad2e5aa 445
7e25d62c 446 if (!ml_anch) {
bbe252da
YO
447 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
448 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 449 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
450 && sv && !SvROK(sv)
451 && (strpos != strbeg)) {
a3621e74 452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
453 goto fail;
454 }
455 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 456 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 457 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
458 I32 slen;
459
1aa99e6b 460 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 461
653099ff
GS
462 if (SvTAIL(check)) {
463 slen = SvCUR(check); /* >= 1 */
cad2e5aa 464
9041c2e3 465 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 466 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 467 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 468 goto fail_finish;
cad2e5aa
JH
469 }
470 /* Now should match s[0..slen-2] */
471 slen--;
3f7c398e 472 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 473 || (slen > 1
3f7c398e 474 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 475 report_neq:
a3621e74 476 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
477 goto fail_finish;
478 }
cad2e5aa 479 }
3f7c398e 480 else if (*SvPVX_const(check) != *s
653099ff 481 || ((slen = SvCUR(check)) > 1
3f7c398e 482 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 483 goto report_neq;
c315bfe8 484 check_at = s;
2c2d71f5 485 goto success_at_start;
7e25d62c 486 }
cad2e5aa 487 }
2c2d71f5 488 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 489 s = strpos;
2c2d71f5 490 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
491 end_shift = prog->check_end_shift;
492
2c2d71f5 493 if (!ml_anch) {
a3b680e6 494 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 495 - (SvTAIL(check) != 0);
a3b680e6 496 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
497
498 if (end_shift < eshift)
499 end_shift = eshift;
500 }
cad2e5aa 501 }
2c2d71f5 502 else { /* Can match at random position */
cad2e5aa
JH
503 ml_anch = 0;
504 s = strpos;
1de06328
YO
505 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
506 end_shift = prog->check_end_shift;
507
508 /* end shift should be non negative here */
cad2e5aa
JH
509 }
510
bcdf7404 511#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 512 if (end_shift < 0)
1de06328 513 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 514 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
515#endif
516
2c2d71f5
JH
517 restart:
518 /* Find a possible match in the region s..strend by looking for
519 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
520
521 {
522 I32 srch_start_shift = start_shift;
523 I32 srch_end_shift = end_shift;
524 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
525 srch_end_shift -= ((strbeg - s) - srch_start_shift);
526 srch_start_shift = strbeg - s;
527 }
6bda09f9 528 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
529 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
530 (IV)prog->check_offset_min,
531 (IV)srch_start_shift,
532 (IV)srch_end_shift,
533 (IV)prog->check_end_shift);
534 });
535
cad2e5aa 536 if (flags & REXEC_SCREAM) {
cad2e5aa 537 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 538 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 539
2c2d71f5
JH
540 if (PL_screamfirst[BmRARE(check)] >= 0
541 || ( BmRARE(check) == '\n'
85c508c3 542 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 543 && SvTAIL(check) ))
9041c2e3 544 s = screaminstr(sv, check,
1de06328 545 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 546 else
2c2d71f5 547 goto fail_finish;
4addbd3b 548 /* we may be pointing at the wrong string */
07bc277f 549 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 550 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
551 if (data)
552 *data->scream_olds = s;
553 }
1de06328
YO
554 else {
555 U8* start_point;
556 U8* end_point;
bbe252da 557 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
558 start_point= (U8*)(s + srch_start_shift);
559 end_point= (U8*)(strend - srch_end_shift);
560 } else {
561 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
562 end_point= HOP3(strend, -srch_end_shift, strbeg);
563 }
6bda09f9 564 DEBUG_OPTIMISE_MORE_r({
56570a2c 565 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 566 (int)(end_point - start_point),
fc8cd66c 567 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
568 start_point);
569 });
570
571 s = fbm_instr( start_point, end_point,
7fba1cd6 572 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
573 }
574 }
cad2e5aa
JH
575 /* Update the count-of-usability, remove useless subpatterns,
576 unshift s. */
2c2d71f5 577
ab3bbdeb
YO
578 DEBUG_EXECUTE_r({
579 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
580 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
581 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 582 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
583 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
584 ? "anchored" : "floating"),
585 quoted,
586 RE_SV_TAIL(check),
587 (s ? " at offset " : "...\n") );
588 });
2c2d71f5
JH
589
590 if (!s)
591 goto fail_finish;
2c2d71f5 592 /* Finish the diagnostic message */
a3621e74 593 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 594
1de06328
YO
595 /* XXX dmq: first branch is for positive lookbehind...
596 Our check string is offset from the beginning of the pattern.
597 So we need to do any stclass tests offset forward from that
598 point. I think. :-(
599 */
600
601
602
603 check_at=s;
604
605
2c2d71f5
JH
606 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
607 Start with the other substr.
608 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 609 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
610 *always* match. Probably should be marked during compile...
611 Probably it is right to do no SCREAM here...
612 */
613
1de06328
YO
614 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
615 : (prog->float_substr && prog->anchored_substr))
616 {
30944b6d 617 /* Take into account the "other" substring. */
2c2d71f5
JH
618 /* XXXX May be hopelessly wrong for UTF... */
619 if (!other_last)
6eb5f6b9 620 other_last = strpos;
33b8afdf 621 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
622 do_other_anchored:
623 {
890ce7af
AL
624 char * const last = HOP3c(s, -start_shift, strbeg);
625 char *last1, *last2;
be8e71aa 626 char * const saved_s = s;
33b8afdf 627 SV* must;
2c2d71f5 628
2c2d71f5
JH
629 t = s - prog->check_offset_max;
630 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 631 && (!do_utf8
0ce71af7 632 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 633 && t > strpos)))
6f207bd3 634 NOOP;
2c2d71f5
JH
635 else
636 t = strpos;
1aa99e6b 637 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
638 if (t < other_last) /* These positions already checked */
639 t = other_last;
1aa99e6b 640 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
641 if (last < last1)
642 last1 = last;
1de06328
YO
643 /* XXXX It is not documented what units *_offsets are in.
644 We assume bytes, but this is clearly wrong.
645 Meaning this code needs to be carefully reviewed for errors.
646 dmq.
647 */
648
2c2d71f5 649 /* On end-of-str: see comment below. */
33b8afdf
JH
650 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
651 if (must == &PL_sv_undef) {
652 s = (char*)NULL;
1de06328 653 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
654 }
655 else
656 s = fbm_instr(
657 (unsigned char*)t,
658 HOP3(HOP3(last1, prog->anchored_offset, strend)
659 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
660 must,
7fba1cd6 661 multiline ? FBMrf_MULTILINE : 0
33b8afdf 662 );
ab3bbdeb
YO
663 DEBUG_EXECUTE_r({
664 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
665 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
666 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 667 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
668 quoted, RE_SV_TAIL(must));
669 });
670
671
2c2d71f5
JH
672 if (!s) {
673 if (last1 >= last2) {
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
675 ", giving up...\n"));
676 goto fail_finish;
677 }
a3621e74 678 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 679 ", trying floating at offset %ld...\n",
be8e71aa 680 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
681 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
682 s = HOP3c(last, 1, strend);
2c2d71f5
JH
683 goto restart;
684 }
685 else {
a3621e74 686 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 687 (long)(s - i_strpos)));
1aa99e6b
IH
688 t = HOP3c(s, -prog->anchored_offset, strbeg);
689 other_last = HOP3c(s, 1, strend);
be8e71aa 690 s = saved_s;
2c2d71f5
JH
691 if (t == strpos)
692 goto try_at_start;
2c2d71f5
JH
693 goto try_at_offset;
694 }
30944b6d 695 }
2c2d71f5
JH
696 }
697 else { /* Take into account the floating substring. */
33b8afdf 698 char *last, *last1;
be8e71aa 699 char * const saved_s = s;
33b8afdf
JH
700 SV* must;
701
702 t = HOP3c(s, -start_shift, strbeg);
703 last1 = last =
704 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
705 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
706 last = HOP3c(t, prog->float_max_offset, strend);
707 s = HOP3c(t, prog->float_min_offset, strend);
708 if (s < other_last)
709 s = other_last;
2c2d71f5 710 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
711 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
712 /* fbm_instr() takes into account exact value of end-of-str
713 if the check is SvTAIL(ed). Since false positives are OK,
714 and end-of-str is not later than strend we are OK. */
715 if (must == &PL_sv_undef) {
716 s = (char*)NULL;
1de06328 717 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
718 }
719 else
2c2d71f5 720 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
721 (unsigned char*)last + SvCUR(must)
722 - (SvTAIL(must)!=0),
7fba1cd6 723 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
724 DEBUG_EXECUTE_r({
725 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
726 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
727 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 728 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
729 quoted, RE_SV_TAIL(must));
730 });
33b8afdf
JH
731 if (!s) {
732 if (last1 == last) {
a3621e74 733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
734 ", giving up...\n"));
735 goto fail_finish;
2c2d71f5 736 }
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 738 ", trying anchored starting at offset %ld...\n",
be8e71aa 739 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
740 other_last = last;
741 s = HOP3c(t, 1, strend);
742 goto restart;
743 }
744 else {
a3621e74 745 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
746 (long)(s - i_strpos)));
747 other_last = s; /* Fix this later. --Hugo */
be8e71aa 748 s = saved_s;
33b8afdf
JH
749 if (t == strpos)
750 goto try_at_start;
751 goto try_at_offset;
752 }
2c2d71f5 753 }
cad2e5aa 754 }
2c2d71f5 755
1de06328 756
9ef43ace 757 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 758
6bda09f9 759 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
760 PerlIO_printf(Perl_debug_log,
761 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
762 (IV)prog->check_offset_min,
763 (IV)prog->check_offset_max,
764 (IV)(s-strpos),
765 (IV)(t-strpos),
766 (IV)(t-s),
767 (IV)(strend-strpos)
768 )
769 );
770
2c2d71f5 771 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 772 && (!do_utf8
9ef43ace 773 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
774 && t > strpos)))
775 {
2c2d71f5
JH
776 /* Fixed substring is found far enough so that the match
777 cannot start at strpos. */
778 try_at_offset:
cad2e5aa 779 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
780 /* Eventually fbm_*() should handle this, but often
781 anchored_offset is not 0, so this check will not be wasted. */
782 /* XXXX In the code below we prefer to look for "^" even in
783 presence of anchored substrings. And we search even
784 beyond the found float position. These pessimizations
785 are historical artefacts only. */
786 find_anchor:
2c2d71f5 787 while (t < strend - prog->minlen) {
cad2e5aa 788 if (*t == '\n') {
4ee3650e 789 if (t < check_at - prog->check_offset_min) {
33b8afdf 790 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
791 /* Since we moved from the found position,
792 we definitely contradict the found anchored
30944b6d
IZ
793 substr. Due to the above check we do not
794 contradict "check" substr.
795 Thus we can arrive here only if check substr
796 is float. Redo checking for "other"=="fixed".
797 */
9041c2e3 798 strpos = t + 1;
a3621e74 799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 800 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
801 goto do_other_anchored;
802 }
4ee3650e
GS
803 /* We don't contradict the found floating substring. */
804 /* XXXX Why not check for STCLASS? */
cad2e5aa 805 s = t + 1;
a3621e74 806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 807 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
808 goto set_useful;
809 }
4ee3650e
GS
810 /* Position contradicts check-string */
811 /* XXXX probably better to look for check-string
812 than for "\n", so one should lower the limit for t? */
a3621e74 813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 814 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 815 other_last = strpos = s = t + 1;
cad2e5aa
JH
816 goto restart;
817 }
818 t++;
819 }
a3621e74 820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 821 PL_colors[0], PL_colors[1]));
2c2d71f5 822 goto fail_finish;
cad2e5aa 823 }
f5952150 824 else {
a3621e74 825 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 826 PL_colors[0], PL_colors[1]));
f5952150 827 }
cad2e5aa
JH
828 s = t;
829 set_useful:
33b8afdf 830 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
831 }
832 else {
f5952150 833 /* The found string does not prohibit matching at strpos,
2c2d71f5 834 - no optimization of calling REx engine can be performed,
f5952150
GS
835 unless it was an MBOL and we are not after MBOL,
836 or a future STCLASS check will fail this. */
2c2d71f5
JH
837 try_at_start:
838 /* Even in this situation we may use MBOL flag if strpos is offset
839 wrt the start of the string. */
05b4157f 840 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 841 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 842 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 843 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 844 {
cad2e5aa
JH
845 t = strpos;
846 goto find_anchor;
847 }
a3621e74 848 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 849 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 850 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 851 );
2c2d71f5 852 success_at_start:
bbe252da 853 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
854 && (do_utf8 ? (
855 prog->check_utf8 /* Could be deleted already */
856 && --BmUSEFUL(prog->check_utf8) < 0
857 && (prog->check_utf8 == prog->float_utf8)
858 ) : (
859 prog->check_substr /* Could be deleted already */
860 && --BmUSEFUL(prog->check_substr) < 0
861 && (prog->check_substr == prog->float_substr)
862 )))
66e933ab 863 {
cad2e5aa 864 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 865 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
866 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
867 if (do_utf8 ? prog->check_substr : prog->check_utf8)
868 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
869 prog->check_substr = prog->check_utf8 = NULL; /* disable */
870 prog->float_substr = prog->float_utf8 = NULL; /* clear */
871 check = NULL; /* abort */
cad2e5aa 872 s = strpos;
3cf5c195
IZ
873 /* XXXX This is a remnant of the old implementation. It
874 looks wasteful, since now INTUIT can use many
6eb5f6b9 875 other heuristics. */
bbe252da 876 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
877 }
878 else
879 s = strpos;
880 }
881
6eb5f6b9
JH
882 /* Last resort... */
883 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
884 /* trie stclasses are too expensive to use here, we are better off to
885 leave it to regmatch itself */
f8fc2ecf 886 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
887 /* minlen == 0 is possible if regstclass is \b or \B,
888 and the fixed substr is ''$.
889 Since minlen is already taken into account, s+1 is before strend;
890 accidentally, minlen >= 1 guaranties no false positives at s + 1
891 even for \b or \B. But (minlen? 1 : 0) below assumes that
892 regstclass does not come from lookahead... */
893 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
894 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
895 const U8* const str = (U8*)STRING(progi->regstclass);
896 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
897 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 898 : 1);
1de06328
YO
899 char * endpos;
900 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
901 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
902 else if (prog->float_substr || prog->float_utf8)
903 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
904 else
905 endpos= strend;
906
70685ca0
JH
907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
908 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 909
6eb5f6b9 910 t = s;
f8fc2ecf 911 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
912 if (!s) {
913#ifdef DEBUGGING
cbbf8932 914 const char *what = NULL;
6eb5f6b9
JH
915#endif
916 if (endpos == strend) {
a3621e74 917 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
918 "Could not match STCLASS...\n") );
919 goto fail;
920 }
a3621e74 921 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 922 "This position contradicts STCLASS...\n") );
bbe252da 923 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 924 goto fail;
6eb5f6b9 925 /* Contradict one of substrings */
33b8afdf
JH
926 if (prog->anchored_substr || prog->anchored_utf8) {
927 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 928 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 929 hop_and_restart:
1aa99e6b 930 s = HOP3c(t, 1, strend);
66e933ab
GS
931 if (s + start_shift + end_shift > strend) {
932 /* XXXX Should be taken into account earlier? */
a3621e74 933 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
934 "Could not match STCLASS...\n") );
935 goto fail;
936 }
5e39e1e5
HS
937 if (!check)
938 goto giveup;
a3621e74 939 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 940 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
941 what, (long)(s + start_shift - i_strpos)) );
942 goto restart;
943 }
66e933ab 944 /* Have both, check_string is floating */
6eb5f6b9
JH
945 if (t + start_shift >= check_at) /* Contradicts floating=check */
946 goto retry_floating_check;
947 /* Recheck anchored substring, but not floating... */
9041c2e3 948 s = check_at;
5e39e1e5
HS
949 if (!check)
950 goto giveup;
a3621e74 951 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 952 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
953 (long)(other_last - i_strpos)) );
954 goto do_other_anchored;
955 }
60e71179
GS
956 /* Another way we could have checked stclass at the
957 current position only: */
958 if (ml_anch) {
959 s = t = t + 1;
5e39e1e5
HS
960 if (!check)
961 goto giveup;
a3621e74 962 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 963 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 964 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 965 goto try_at_offset;
66e933ab 966 }
33b8afdf 967 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 968 goto fail;
6eb5f6b9
JH
969 /* Check is floating subtring. */
970 retry_floating_check:
971 t = check_at - start_shift;
a3621e74 972 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
973 goto hop_and_restart;
974 }
b7953727 975 if (t != s) {
a3621e74 976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 977 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
978 (long)(t - i_strpos), (long)(s - i_strpos))
979 );
980 }
981 else {
a3621e74 982 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
983 "Does not contradict STCLASS...\n");
984 );
985 }
6eb5f6b9 986 }
5e39e1e5 987 giveup:
a3621e74 988 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
989 PL_colors[4], (check ? "Guessed" : "Giving up"),
990 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 991 return s;
2c2d71f5
JH
992
993 fail_finish: /* Substring not found */
33b8afdf
JH
994 if (prog->check_substr || prog->check_utf8) /* could be removed already */
995 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 996 fail:
a3621e74 997 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 998 PL_colors[4], PL_colors[5]));
bd61b366 999 return NULL;
cad2e5aa 1000}
9661b544 1001
a0a388a1
YO
1002#define DECL_TRIE_TYPE(scan) \
1003 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1004 trie_type = (scan->flags != EXACT) \
1005 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1006 : (do_utf8 ? trie_utf8 : trie_plain)
3b0527fe 1007
55eed653
NC
1008#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1009uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
c012444f 1010 UV uvc_unfolded = 0; \
4cadc6a9
YO
1011 switch (trie_type) { \
1012 case trie_utf8_fold: \
1013 if ( foldlen>0 ) { \
c012444f 1014 uvc_unfolded = uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1015 foldlen -= len; \
1016 uscan += len; \
1017 len=0; \
1018 } else { \
c012444f 1019 uvc_unfolded = uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1020 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1021 foldlen -= UNISKIP( uvc ); \
1022 uscan = foldbuf + UNISKIP( uvc ); \
1023 } \
1024 break; \
a0a388a1
YO
1025 case trie_latin_utf8_fold: \
1026 if ( foldlen>0 ) { \
1027 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1028 foldlen -= len; \
1029 uscan += len; \
1030 len=0; \
1031 } else { \
1032 len = 1; \
1033 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1034 foldlen -= UNISKIP( uvc ); \
1035 uscan = foldbuf + UNISKIP( uvc ); \
1036 } \
1037 break; \
4cadc6a9
YO
1038 case trie_utf8: \
1039 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1040 break; \
1041 case trie_plain: \
1042 uvc = (UV)*uc; \
1043 len = 1; \
1044 } \
1045 \
1046 if (uvc < 256) { \
1047 charid = trie->charmap[ uvc ]; \
1048 } \
1049 else { \
1050 charid = 0; \
55eed653
NC
1051 if (widecharmap) { \
1052 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1053 (char*)&uvc, sizeof(UV), 0); \
1054 if (svpp) \
1055 charid = (U16)SvIV(*svpp); \
1056 } \
1057 } \
c012444f
SR
1058 if (!charid && trie_type == trie_utf8_fold && !UTF) { \
1059 charid = trie->charmap[uvc_unfolded]; \
1060 } \
4cadc6a9
YO
1061} STMT_END
1062
a0a388a1
YO
1063#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1064{ \
1065 char *my_strend= (char *)strend; \
4cadc6a9
YO
1066 if ( (CoNd) \
1067 && (ln == len || \
a0a388a1 1068 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
4cadc6a9 1069 m, NULL, ln, (bool)UTF)) \
a0a388a1 1070 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1071 goto got_it; \
1072 else { \
1073 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1074 uvchr_to_utf8(tmpbuf, c); \
1075 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1076 if ( f != c \
1077 && (f == c1 || f == c2) \
a0a388a1
YO
1078 && (ln == len || \
1079 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1080 m, NULL, ln, (bool)UTF)) \
1081 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1082 goto got_it; \
1083 } \
a0a388a1
YO
1084} \
1085s += len
4cadc6a9
YO
1086
1087#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1088STMT_START { \
1089 while (s <= e) { \
1090 if ( (CoNd) \
1091 && (ln == 1 || !(OP(c) == EXACTF \
1092 ? ibcmp(s, m, ln) \
1093 : ibcmp_locale(s, m, ln))) \
24b23f37 1094 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1095 goto got_it; \
1096 s++; \
1097 } \
1098} STMT_END
1099
1100#define REXEC_FBC_UTF8_SCAN(CoDe) \
1101STMT_START { \
1102 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1103 CoDe \
1104 s += uskip; \
1105 } \
1106} STMT_END
1107
1108#define REXEC_FBC_SCAN(CoDe) \
1109STMT_START { \
1110 while (s < strend) { \
1111 CoDe \
1112 s++; \
1113 } \
1114} STMT_END
1115
1116#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1117REXEC_FBC_UTF8_SCAN( \
1118 if (CoNd) { \
24b23f37 1119 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1120 goto got_it; \
1121 else \
1122 tmp = doevery; \
1123 } \
1124 else \
1125 tmp = 1; \
1126)
1127
1128#define REXEC_FBC_CLASS_SCAN(CoNd) \
1129REXEC_FBC_SCAN( \
1130 if (CoNd) { \
24b23f37 1131 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1132 goto got_it; \
1133 else \
1134 tmp = doevery; \
1135 } \
1136 else \
1137 tmp = 1; \
1138)
1139
1140#define REXEC_FBC_TRYIT \
24b23f37 1141if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1142 goto got_it
1143
e1d1eefb
YO
1144#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1145 if (do_utf8) { \
1146 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1147 } \
1148 else { \
1149 REXEC_FBC_CLASS_SCAN(CoNd); \
1150 } \
1151 break
1152
4cadc6a9
YO
1153#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1154 if (do_utf8) { \
1155 UtFpReLoAd; \
1156 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1157 } \
1158 else { \
1159 REXEC_FBC_CLASS_SCAN(CoNd); \
1160 } \
1161 break
1162
1163#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1164 PL_reg_flags |= RF_tainted; \
1165 if (do_utf8) { \
1166 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1167 } \
1168 else { \
1169 REXEC_FBC_CLASS_SCAN(CoNd); \
1170 } \
1171 break
1172
786e8c11
YO
1173#define DUMP_EXEC_POS(li,s,doutf8) \
1174 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1175
1176/* We know what class REx starts with. Try to find this position... */
1177/* if reginfo is NULL, its a dryrun */
1178/* annoyingly all the vars in this routine have different names from their counterparts
1179 in regmatch. /grrr */
1180
3c3eec57 1181STATIC char *
07be1b83 1182S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1183 const char *strend, regmatch_info *reginfo)
a687059c 1184{
27da23d5 1185 dVAR;
bbe252da 1186 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1187 char *m;
d8093b23 1188 STRLEN ln;
5dab1207 1189 STRLEN lnc;
078c425b 1190 register STRLEN uskip;
d8093b23
G
1191 unsigned int c1;
1192 unsigned int c2;
6eb5f6b9
JH
1193 char *e;
1194 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1195 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf 1196 RXi_GET_DECL(prog,progi);
7918f24d
NC
1197
1198 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1199
6eb5f6b9
JH
1200 /* We know what class it must start with. */
1201 switch (OP(c)) {
6eb5f6b9 1202 case ANYOF:
388cc4de 1203 if (do_utf8) {
4cadc6a9 1204 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1205 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1206 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1207 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1208 }
1209 else {
1210 while (s < strend) {
1211 STRLEN skip = 1;
1212
32fc9b6a 1213 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1214 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1215 /* The assignment of 2 is intentional:
1216 * for the folded sharp s, the skip is 2. */
1217 (skip = SHARP_S_SKIP))) {
24b23f37 1218 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1219 goto got_it;
1220 else
1221 tmp = doevery;
1222 }
1223 else
1224 tmp = 1;
1225 s += skip;
1226 }
a0d0e21e 1227 }
6eb5f6b9 1228 break;
f33976b4 1229 case CANY:
4cadc6a9 1230 REXEC_FBC_SCAN(
24b23f37 1231 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1232 goto got_it;
1233 else
1234 tmp = doevery;
4cadc6a9 1235 );
f33976b4 1236 break;
6eb5f6b9 1237 case EXACTF:
5dab1207
NIS
1238 m = STRING(c);
1239 ln = STR_LEN(c); /* length to match in octets/bytes */
1240 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1241 if (UTF) {
a2a2844f 1242 STRLEN ulen1, ulen2;
5dab1207 1243 U8 *sm = (U8 *) m;
89ebb4a3
JH
1244 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1245 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1246 /* used by commented-out code below */
1247 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1248
1249 /* XXX: Since the node will be case folded at compile
1250 time this logic is a little odd, although im not
1251 sure that its actually wrong. --dmq */
1252
1253 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1254 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1255
1256 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1257 codepoint of the first character in the converted
1258 form, yet originally we did the extra step.
1259 No tests fail by commenting this code out however
1260 so Ive left it out. -- dmq.
1261
89ebb4a3 1262 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1263 0, uniflags);
89ebb4a3 1264 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1265 0, uniflags);
a0a388a1
YO
1266 */
1267
5dab1207
NIS
1268 lnc = 0;
1269 while (sm < ((U8 *) m + ln)) {
1270 lnc++;
1271 sm += UTF8SKIP(sm);
1272 }
1aa99e6b
IH
1273 }
1274 else {
1275 c1 = *(U8*)m;
1276 c2 = PL_fold[c1];
1277 }
6eb5f6b9
JH
1278 goto do_exactf;
1279 case EXACTFL:
5dab1207
NIS
1280 m = STRING(c);
1281 ln = STR_LEN(c);
1282 lnc = (I32) ln;
d8093b23 1283 c1 = *(U8*)m;
6eb5f6b9
JH
1284 c2 = PL_fold_locale[c1];
1285 do_exactf:
db12adc6 1286 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1287
3b0527fe 1288 if (!reginfo && e < s)
6eb5f6b9 1289 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1290
60a8b682
JH
1291 /* The idea in the EXACTF* cases is to first find the
1292 * first character of the EXACTF* node and then, if
1293 * necessary, case-insensitively compare the full
1294 * text of the node. The c1 and c2 are the first
1295 * characters (though in Unicode it gets a bit
1296 * more complicated because there are more cases
7f16dd3d
JH
1297 * than just upper and lower: one needs to use
1298 * the so-called folding case for case-insensitive
1299 * matching (called "loose matching" in Unicode).
1300 * ibcmp_utf8() will do just that. */
60a8b682 1301
a0a388a1 1302 if (do_utf8 || UTF) {
575cac57 1303 UV c, f;
89ebb4a3 1304 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1305 STRLEN len = 1;
1306 STRLEN foldlen;
4ad0818d 1307 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1308 if (c1 == c2) {
5dab1207
NIS
1309 /* Upper and lower of 1st char are equal -
1310 * probably not a "letter". */
1aa99e6b 1311 while (s <= e) {
a0a388a1
YO
1312 if (do_utf8) {
1313 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1314 uniflags);
a0a388a1
YO
1315 } else {
1316 c = *((U8*)s);
1317 }
4cadc6a9 1318 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1319 }
09091399
JH
1320 }
1321 else {
1aa99e6b 1322 while (s <= e) {
a0a388a1
YO
1323 if (do_utf8) {
1324 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1325 uniflags);
a0a388a1
YO
1326 } else {
1327 c = *((U8*)s);
1328 }
80aecb99 1329
60a8b682 1330 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1331 * Note that not all the possible combinations
1332 * are handled here: some of them are handled
1333 * by the standard folding rules, and some of
1334 * them (the character class or ANYOF cases)
1335 * are handled during compiletime in
1336 * regexec.c:S_regclass(). */
880bd946
JH
1337 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1338 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1339 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1340
4cadc6a9 1341 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1342 }
09091399 1343 }
1aa99e6b
IH
1344 }
1345 else {
a0a388a1 1346 /* Neither pattern nor string are UTF8 */
1aa99e6b 1347 if (c1 == c2)
4cadc6a9 1348 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1349 else
4cadc6a9 1350 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1351 }
1352 break;
bbce6d69 1353 case BOUNDL:
3280af22 1354 PL_reg_flags |= RF_tainted;
bbce6d69 1355 /* FALL THROUGH */
a0d0e21e 1356 case BOUND:
ffc61ed2 1357 if (do_utf8) {
12d33761 1358 if (s == PL_bostr)
ffc61ed2
JH
1359 tmp = '\n';
1360 else {
6136c704 1361 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1362 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1363 }
1364 tmp = ((OP(c) == BOUND ?
9041c2e3 1365 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1366 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1367 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1368 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1369 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1370 isALNUM_LC_utf8((U8*)s)))
1371 {
1372 tmp = !tmp;
4cadc6a9 1373 REXEC_FBC_TRYIT;
a687059c 1374 }
4cadc6a9 1375 );
a0d0e21e 1376 }
667bb95a 1377 else {
12d33761 1378 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1379 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1380 REXEC_FBC_SCAN(
ffc61ed2
JH
1381 if (tmp ==
1382 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1383 tmp = !tmp;
4cadc6a9 1384 REXEC_FBC_TRYIT;
a0ed51b3 1385 }
4cadc6a9 1386 );
a0ed51b3 1387 }
24b23f37 1388 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1389 goto got_it;
1390 break;
bbce6d69 1391 case NBOUNDL:
3280af22 1392 PL_reg_flags |= RF_tainted;
bbce6d69 1393 /* FALL THROUGH */
a0d0e21e 1394 case NBOUND:
ffc61ed2 1395 if (do_utf8) {
12d33761 1396 if (s == PL_bostr)
ffc61ed2
JH
1397 tmp = '\n';
1398 else {
6136c704 1399 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1400 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1401 }
1402 tmp = ((OP(c) == NBOUND ?
9041c2e3 1403 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1404 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1405 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1406 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1407 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1408 isALNUM_LC_utf8((U8*)s)))
1409 tmp = !tmp;
4cadc6a9
YO
1410 else REXEC_FBC_TRYIT;
1411 );
a0d0e21e 1412 }
667bb95a 1413 else {
12d33761 1414 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1415 tmp = ((OP(c) == NBOUND ?
1416 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1417 REXEC_FBC_SCAN(
ffc61ed2
JH
1418 if (tmp ==
1419 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1420 tmp = !tmp;
4cadc6a9
YO
1421 else REXEC_FBC_TRYIT;
1422 );
a0ed51b3 1423 }
24b23f37 1424 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1425 goto got_it;
1426 break;
a0d0e21e 1427 case ALNUM:
4cadc6a9
YO
1428 REXEC_FBC_CSCAN_PRELOAD(
1429 LOAD_UTF8_CHARCLASS_ALNUM(),
1430 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1431 isALNUM(*s)
1432 );
bbce6d69 1433 case ALNUML:
4cadc6a9
YO
1434 REXEC_FBC_CSCAN_TAINT(
1435 isALNUM_LC_utf8((U8*)s),
1436 isALNUM_LC(*s)
1437 );
a0d0e21e 1438 case NALNUM:
4cadc6a9
YO
1439 REXEC_FBC_CSCAN_PRELOAD(
1440 LOAD_UTF8_CHARCLASS_ALNUM(),
1441 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1442 !isALNUM(*s)
1443 );
bbce6d69 1444 case NALNUML:
4cadc6a9
YO
1445 REXEC_FBC_CSCAN_TAINT(
1446 !isALNUM_LC_utf8((U8*)s),
1447 !isALNUM_LC(*s)
1448 );
a0d0e21e 1449 case SPACE:
4cadc6a9
YO
1450 REXEC_FBC_CSCAN_PRELOAD(
1451 LOAD_UTF8_CHARCLASS_SPACE(),
1452 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1453 isSPACE(*s)
1454 );
bbce6d69 1455 case SPACEL:
4cadc6a9
YO
1456 REXEC_FBC_CSCAN_TAINT(
1457 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1458 isSPACE_LC(*s)
1459 );
a0d0e21e 1460 case NSPACE:
4cadc6a9
YO
1461 REXEC_FBC_CSCAN_PRELOAD(
1462 LOAD_UTF8_CHARCLASS_SPACE(),
1463 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1464 !isSPACE(*s)
1465 );
bbce6d69 1466 case NSPACEL:
4cadc6a9
YO
1467 REXEC_FBC_CSCAN_TAINT(
1468 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1469 !isSPACE_LC(*s)
1470 );
a0d0e21e 1471 case DIGIT:
4cadc6a9
YO
1472 REXEC_FBC_CSCAN_PRELOAD(
1473 LOAD_UTF8_CHARCLASS_DIGIT(),
1474 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1475 isDIGIT(*s)
1476 );
b8c5462f 1477 case DIGITL:
4cadc6a9
YO
1478 REXEC_FBC_CSCAN_TAINT(
1479 isDIGIT_LC_utf8((U8*)s),
1480 isDIGIT_LC(*s)
1481 );
a0d0e21e 1482 case NDIGIT:
4cadc6a9
YO
1483 REXEC_FBC_CSCAN_PRELOAD(
1484 LOAD_UTF8_CHARCLASS_DIGIT(),
1485 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1486 !isDIGIT(*s)
1487 );
b8c5462f 1488 case NDIGITL:
4cadc6a9
YO
1489 REXEC_FBC_CSCAN_TAINT(
1490 !isDIGIT_LC_utf8((U8*)s),
1491 !isDIGIT_LC(*s)
1492 );
e1d1eefb
YO
1493 case LNBREAK:
1494 REXEC_FBC_CSCAN(
1495 is_LNBREAK_utf8(s),
1496 is_LNBREAK_latin1(s)
1497 );
1498 case VERTWS:
1499 REXEC_FBC_CSCAN(
1500 is_VERTWS_utf8(s),
1501 is_VERTWS_latin1(s)
1502 );
1503 case NVERTWS:
1504 REXEC_FBC_CSCAN(
1505 !is_VERTWS_utf8(s),
1506 !is_VERTWS_latin1(s)
1507 );
1508 case HORIZWS:
1509 REXEC_FBC_CSCAN(
1510 is_HORIZWS_utf8(s),
1511 is_HORIZWS_latin1(s)
1512 );
1513 case NHORIZWS:
1514 REXEC_FBC_CSCAN(
1515 !is_HORIZWS_utf8(s),
1516 !is_HORIZWS_latin1(s)
1517 );
1de06328
YO
1518 case AHOCORASICKC:
1519 case AHOCORASICK:
07be1b83 1520 {
a0a388a1 1521 DECL_TRIE_TYPE(c);
07be1b83
YO
1522 /* what trie are we using right now */
1523 reg_ac_data *aho
f8fc2ecf 1524 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1525 reg_trie_data *trie
1526 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1527 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1528
1529 const char *last_start = strend - trie->minlen;
6148ee25 1530#ifdef DEBUGGING
07be1b83 1531 const char *real_start = s;
6148ee25 1532#endif
07be1b83 1533 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1534 SV *sv_points;
1535 U8 **points; /* map of where we were in the input string
786e8c11 1536 when reading a given char. For ASCII this
be8e71aa 1537 is unnecessary overhead as the relationship
38a44b82
NC
1538 is always 1:1, but for Unicode, especially
1539 case folded Unicode this is not true. */
f9e705e8 1540 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1541 U8 *bitmap=NULL;
1542
07be1b83
YO
1543
1544 GET_RE_DEBUG_FLAGS_DECL;
1545
be8e71aa
YO
1546 /* We can't just allocate points here. We need to wrap it in
1547 * an SV so it gets freed properly if there is a croak while
1548 * running the match */
1549 ENTER;
1550 SAVETMPS;
1551 sv_points=newSV(maxlen * sizeof(U8 *));
1552 SvCUR_set(sv_points,
1553 maxlen * sizeof(U8 *));
1554 SvPOK_on(sv_points);
1555 sv_2mortal(sv_points);
1556 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1557 if ( trie_type != trie_utf8_fold
1558 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1559 {
786e8c11
YO
1560 if (trie->bitmap)
1561 bitmap=(U8*)trie->bitmap;
1562 else
1563 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1564 }
786e8c11
YO
1565 /* this is the Aho-Corasick algorithm modified a touch
1566 to include special handling for long "unknown char"
1567 sequences. The basic idea being that we use AC as long
1568 as we are dealing with a possible matching char, when
1569 we encounter an unknown char (and we have not encountered
1570 an accepting state) we scan forward until we find a legal
1571 starting char.
1572 AC matching is basically that of trie matching, except
1573 that when we encounter a failing transition, we fall back
1574 to the current states "fail state", and try the current char
1575 again, a process we repeat until we reach the root state,
1576 state 1, or a legal transition. If we fail on the root state
1577 then we can either terminate if we have reached an accepting
1578 state previously, or restart the entire process from the beginning
1579 if we have not.
1580
1581 */
07be1b83
YO
1582 while (s <= last_start) {
1583 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1584 U8 *uc = (U8*)s;
1585 U16 charid = 0;
1586 U32 base = 1;
1587 U32 state = 1;
1588 UV uvc = 0;
1589 STRLEN len = 0;
1590 STRLEN foldlen = 0;
1591 U8 *uscan = (U8*)NULL;
1592 U8 *leftmost = NULL;
786e8c11
YO
1593#ifdef DEBUGGING
1594 U32 accepted_word= 0;
1595#endif
07be1b83
YO
1596 U32 pointpos = 0;
1597
1598 while ( state && uc <= (U8*)strend ) {
1599 int failed=0;
786e8c11
YO
1600 U32 word = aho->states[ state ].wordnum;
1601
1de06328
YO
1602 if( state==1 ) {
1603 if ( bitmap ) {
1604 DEBUG_TRIE_EXECUTE_r(
1605 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1606 dump_exec_pos( (char *)uc, c, strend, real_start,
1607 (char *)uc, do_utf8 );
1608 PerlIO_printf( Perl_debug_log,
1609 " Scanning for legal start char...\n");
1610 }
1611 );
1612 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1613 uc++;
786e8c11 1614 }
1de06328 1615 s= (char *)uc;
786e8c11 1616 }
786e8c11
YO
1617 if (uc >(U8*)last_start) break;
1618 }
1619
1620 if ( word ) {
1621 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1622 if (!leftmost || lpos < leftmost) {
1623 DEBUG_r(accepted_word=word);
07be1b83 1624 leftmost= lpos;
786e8c11 1625 }
07be1b83 1626 if (base==0) break;
786e8c11 1627
07be1b83
YO
1628 }
1629 points[pointpos++ % maxlen]= uc;
55eed653
NC
1630 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1631 uscan, len, uvc, charid, foldlen,
1632 foldbuf, uniflags);
786e8c11
YO
1633 DEBUG_TRIE_EXECUTE_r({
1634 dump_exec_pos( (char *)uc, c, strend, real_start,
1635 s, do_utf8 );
07be1b83 1636 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1637 " Charid:%3u CP:%4"UVxf" ",
1638 charid, uvc);
1639 });
07be1b83
YO
1640
1641 do {
6148ee25 1642#ifdef DEBUGGING
786e8c11 1643 word = aho->states[ state ].wordnum;
6148ee25 1644#endif
07be1b83
YO
1645 base = aho->states[ state ].trans.base;
1646
786e8c11
YO
1647 DEBUG_TRIE_EXECUTE_r({
1648 if (failed)
1649 dump_exec_pos( (char *)uc, c, strend, real_start,
1650 s, do_utf8 );
07be1b83 1651 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1652 "%sState: %4"UVxf", word=%"UVxf,
1653 failed ? " Fail transition to " : "",
1654 (UV)state, (UV)word);
1655 });
07be1b83
YO
1656 if ( base ) {
1657 U32 tmp;
1658 if (charid &&
1659 (base + charid > trie->uniquecharcount )
1660 && (base + charid - 1 - trie->uniquecharcount
1661 < trie->lasttrans)
1662 && trie->trans[base + charid - 1 -
1663 trie->uniquecharcount].check == state
1664 && (tmp=trie->trans[base + charid - 1 -
1665 trie->uniquecharcount ].next))
1666 {
786e8c11
YO
1667 DEBUG_TRIE_EXECUTE_r(
1668 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1669 state = tmp;
1670 break;
1671 }
1672 else {
786e8c11
YO
1673 DEBUG_TRIE_EXECUTE_r(
1674 PerlIO_printf( Perl_debug_log," - fail\n"));
1675 failed = 1;
1676 state = aho->fail[state];
07be1b83
YO
1677 }
1678 }
1679 else {
1680 /* we must be accepting here */
786e8c11
YO
1681 DEBUG_TRIE_EXECUTE_r(
1682 PerlIO_printf( Perl_debug_log," - accepting\n"));
1683 failed = 1;
07be1b83
YO
1684 break;
1685 }
1686 } while(state);
786e8c11 1687 uc += len;
07be1b83
YO
1688 if (failed) {
1689 if (leftmost)
1690 break;
786e8c11 1691 if (!state) state = 1;
07be1b83
YO
1692 }
1693 }
1694 if ( aho->states[ state ].wordnum ) {
1695 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1696 if (!leftmost || lpos < leftmost) {
1697 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1698 leftmost = lpos;
786e8c11 1699 }
07be1b83 1700 }
07be1b83
YO
1701 if (leftmost) {
1702 s = (char*)leftmost;
786e8c11
YO
1703 DEBUG_TRIE_EXECUTE_r({
1704 PerlIO_printf(
70685ca0
JH
1705 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1706 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1707 );
1708 });
24b23f37 1709 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1710 FREETMPS;
1711 LEAVE;
07be1b83 1712 goto got_it;
be8e71aa 1713 }
07be1b83 1714 s = HOPc(s,1);
786e8c11
YO
1715 DEBUG_TRIE_EXECUTE_r({
1716 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1717 });
07be1b83 1718 } else {
786e8c11
YO
1719 DEBUG_TRIE_EXECUTE_r(
1720 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1721 break;
1722 }
1723 }
be8e71aa
YO
1724 FREETMPS;
1725 LEAVE;
07be1b83
YO
1726 }
1727 break;
b3c9acc1 1728 default:
3c3eec57
GS
1729 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1730 break;
d6a28714 1731 }
6eb5f6b9
JH
1732 return 0;
1733 got_it:
1734 return s;
1735}
1736
fae667d5 1737
6eb5f6b9
JH
1738/*
1739 - regexec_flags - match a regexp against a string
1740 */
1741I32
288b8c02 1742Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1743 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1744/* strend: pointer to null at end of string */
1745/* strbeg: real beginning of string */
1746/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1747/* data: May be used for some additional optimizations.
1748 Currently its only used, with a U32 cast, for transmitting
1749 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1750/* nosave: For optimizations. */
1751{
97aff369 1752 dVAR;
288b8c02 1753 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1754 /*register*/ char *s;
6eb5f6b9 1755 register regnode *c;
24b23f37 1756 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1757 I32 minlen; /* must match at least this many chars */
1758 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1759 I32 end_shift = 0; /* Same for the end. */ /* CC */
1760 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1761 char *scream_olds = NULL;
f9f4320a 1762 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1763 I32 multiline;
f8fc2ecf 1764 RXi_GET_DECL(prog,progi);
3b0527fe 1765 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 1766 regexp_paren_pair *swap = NULL;
a3621e74
YO
1767 GET_RE_DEBUG_FLAGS_DECL;
1768
7918f24d 1769 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1770 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1771
1772 /* Be paranoid... */
1773 if (prog == NULL || startpos == NULL) {
1774 Perl_croak(aTHX_ "NULL regexp parameter");
1775 return 0;
1776 }
1777
bbe252da 1778 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1779 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1780
288b8c02 1781 RX_MATCH_UTF8_set(rx, do_utf8);
1de06328 1782 DEBUG_EXECUTE_r(
efd26800 1783 debug_start_match(rx, do_utf8, startpos, strend,
1de06328
YO
1784 "Matching");
1785 );
bac06658 1786
6eb5f6b9 1787 minlen = prog->minlen;
1de06328
YO
1788
1789 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1791 "String too short [regexec_flags]...\n"));
1792 goto phooey;
1aa99e6b 1793 }
6eb5f6b9 1794
1de06328 1795
6eb5f6b9 1796 /* Check validity of program. */
f8fc2ecf 1797 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1798 Perl_croak(aTHX_ "corrupted regexp program");
1799 }
1800
1801 PL_reg_flags = 0;
1802 PL_reg_eval_set = 0;
1803 PL_reg_maxiter = 0;
1804
3c8556c3 1805 if (RX_UTF8(rx))
6eb5f6b9
JH
1806 PL_reg_flags |= RF_utf8;
1807
1808 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1809 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1810 PL_bostr = strbeg;
3b0527fe 1811 reginfo.sv = sv;
6eb5f6b9
JH
1812
1813 /* Mark end of line for $ (and such) */
1814 PL_regeol = strend;
1815
1816 /* see how far we have to get to not match where we matched before */
3b0527fe 1817 reginfo.till = startpos+minend;
6eb5f6b9 1818
6eb5f6b9
JH
1819 /* If there is a "must appear" string, look for it. */
1820 s = startpos;
1821
bbe252da 1822 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1823 MAGIC *mg;
1824
1825 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
58e23c8d 1826 reginfo.ganch = startpos + prog->gofs;
6eb5f6b9
JH
1827 else if (sv && SvTYPE(sv) >= SVt_PVMG
1828 && SvMAGIC(sv)
14befaf4
DM
1829 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1830 && mg->mg_len >= 0) {
3b0527fe 1831 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
bbe252da 1832 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1833 if (s > reginfo.ganch)
6eb5f6b9 1834 goto phooey;
58e23c8d 1835 s = reginfo.ganch - prog->gofs;
6eb5f6b9
JH
1836 }
1837 }
58e23c8d 1838 else if (data) {
70685ca0 1839 reginfo.ganch = strbeg + PTR2UV(data);
58e23c8d 1840 } else /* pos() not defined */
3b0527fe 1841 reginfo.ganch = strbeg;
6eb5f6b9 1842 }
288b8c02 1843 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
1844 /* We have to be careful. If the previous successful match
1845 was from this regex we don't want a subsequent partially
1846 successful match to clobber the old results.
1847 So when we detect this possibility we add a swap buffer
1848 to the re, and switch the buffer each match. If we fail
1849 we switch it back, otherwise we leave it swapped.
1850 */
1851 swap = prog->offs;
1852 /* do we need a save destructor here for eval dies? */
1853 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 1854 }
a0714e2c 1855 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1856 re_scream_pos_data d;
1857
1858 d.scream_olds = &scream_olds;
1859 d.scream_pos = &scream_pos;
288b8c02 1860 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 1861 if (!s) {
a3621e74 1862 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1863 goto phooey; /* not present */
3fa9c3d7 1864 }
6eb5f6b9
JH
1865 }
1866
1de06328 1867
6eb5f6b9
JH
1868
1869 /* Simplest case: anchored match need be tried only once. */
1870 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1871 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1872 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1873 goto got_it;
bbe252da
YO
1874 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1875 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1876 {
1877 char *end;
1878
1879 if (minlen)
1880 dontbother = minlen - 1;
1aa99e6b 1881 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1882 /* for multiline we only have to try after newlines */
33b8afdf 1883 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1884 if (s == startpos)
1885 goto after_try;
1886 while (1) {
24b23f37 1887 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1888 goto got_it;
1889 after_try:
5339e136 1890 if (s > end)
6eb5f6b9 1891 goto phooey;
bbe252da 1892 if (prog->extflags & RXf_USE_INTUIT) {
288b8c02 1893 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
6eb5f6b9
JH
1894 if (!s)
1895 goto phooey;
1896 }
1897 else
1898 s++;
1899 }
1900 } else {
1901 if (s > startpos)
1902 s--;
1903 while (s < end) {
1904 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 1905 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1906 goto got_it;
1907 }
1908 }
1909 }
1910 }
1911 goto phooey;
bbe252da 1912 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
1913 {
1914 /* the warning about reginfo.ganch being used without intialization
bbe252da 1915 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 1916 and we only enter this block when the same bit is set. */
58e23c8d
YO
1917 char *tmp_s = reginfo.ganch - prog->gofs;
1918 if (regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
1919 goto got_it;
1920 goto phooey;
1921 }
1922
1923 /* Messy cases: unanchored match. */
bbe252da 1924 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
1925 /* we have /x+whatever/ */
1926 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1927 char ch;
bf93d4cc
GS
1928#ifdef DEBUGGING
1929 int did_match = 0;
1930#endif
33b8afdf
JH
1931 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1932 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1933 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1934
1aa99e6b 1935 if (do_utf8) {
4cadc6a9 1936 REXEC_FBC_SCAN(
6eb5f6b9 1937 if (*s == ch) {
a3621e74 1938 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1939 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1940 s += UTF8SKIP(s);
1941 while (s < strend && *s == ch)
1942 s += UTF8SKIP(s);
1943 }
4cadc6a9 1944 );
6eb5f6b9
JH
1945 }
1946 else {
4cadc6a9 1947 REXEC_FBC_SCAN(
6eb5f6b9 1948 if (*s == ch) {
a3621e74 1949 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1950 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1951 s++;
1952 while (s < strend && *s == ch)
1953 s++;
1954 }
4cadc6a9 1955 );
6eb5f6b9 1956 }
a3621e74 1957 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1958 PerlIO_printf(Perl_debug_log,
b7953727
JH
1959 "Did not find anchored character...\n")
1960 );
6eb5f6b9 1961 }
a0714e2c
SS
1962 else if (prog->anchored_substr != NULL
1963 || prog->anchored_utf8 != NULL
1964 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1965 && prog->float_max_offset < strend - s)) {
1966 SV *must;
1967 I32 back_max;
1968 I32 back_min;
1969 char *last;
6eb5f6b9 1970 char *last1; /* Last position checked before */
bf93d4cc
GS
1971#ifdef DEBUGGING
1972 int did_match = 0;
1973#endif
33b8afdf
JH
1974 if (prog->anchored_substr || prog->anchored_utf8) {
1975 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1976 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1977 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1978 back_max = back_min = prog->anchored_offset;
1979 } else {
1980 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1981 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1982 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1983 back_max = prog->float_max_offset;
1984 back_min = prog->float_min_offset;
1985 }
1de06328
YO
1986
1987
33b8afdf
JH
1988 if (must == &PL_sv_undef)
1989 /* could not downgrade utf8 check substring, so must fail */
1990 goto phooey;
1991
1de06328
YO
1992 if (back_min<0) {
1993 last = strend;
1994 } else {
1995 last = HOP3c(strend, /* Cannot start after this */
1996 -(I32)(CHR_SVLEN(must)
1997 - (SvTAIL(must) != 0) + back_min), strbeg);
1998 }
6eb5f6b9
JH
1999 if (s > PL_bostr)
2000 last1 = HOPc(s, -1);
2001 else
2002 last1 = s - 1; /* bogus */
2003
a0288114 2004 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2005 check_substr==must. */
2006 scream_pos = -1;
2007 dontbother = end_shift;
2008 strend = HOPc(strend, -dontbother);
2009 while ( (s <= last) &&
9041c2e3 2010 ((flags & REXEC_SCREAM)
1de06328 2011 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2012 end_shift, &scream_pos, 0))
1de06328 2013 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2014 (unsigned char*)strend, must,
7fba1cd6 2015 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2016 /* we may be pointing at the wrong string */
07bc277f 2017 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2018 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2019 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2020 if (HOPc(s, -back_max) > last1) {
2021 last1 = HOPc(s, -back_min);
2022 s = HOPc(s, -back_max);
2023 }
2024 else {
52657f30 2025 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2026
2027 last1 = HOPc(s, -back_min);
52657f30 2028 s = t;
6eb5f6b9 2029 }
1aa99e6b 2030 if (do_utf8) {
6eb5f6b9 2031 while (s <= last1) {
24b23f37 2032 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2033 goto got_it;
2034 s += UTF8SKIP(s);
2035 }
2036 }
2037 else {
2038 while (s <= last1) {
24b23f37 2039 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2040 goto got_it;
2041 s++;
2042 }
2043 }
2044 }
ab3bbdeb
YO
2045 DEBUG_EXECUTE_r(if (!did_match) {
2046 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2047 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2048 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2049 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2050 ? "anchored" : "floating"),
ab3bbdeb
YO
2051 quoted, RE_SV_TAIL(must));
2052 });
6eb5f6b9
JH
2053 goto phooey;
2054 }
f8fc2ecf 2055 else if ( (c = progi->regstclass) ) {
f14c76ed 2056 if (minlen) {
f8fc2ecf 2057 const OPCODE op = OP(progi->regstclass);
66e933ab 2058 /* don't bother with what can't match */
786e8c11 2059 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2060 strend = HOPc(strend, -(minlen - 1));
2061 }
a3621e74 2062 DEBUG_EXECUTE_r({
be8e71aa 2063 SV * const prop = sv_newmortal();
32fc9b6a 2064 regprop(prog, prop, c);
0df25f3d 2065 {
02daf0ab 2066 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2067 s,strend-s,60);
0df25f3d 2068 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2069 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2070 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2071 quoted, (int)(strend - s));
0df25f3d 2072 }
ffc61ed2 2073 });
3b0527fe 2074 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2075 goto got_it;
07be1b83 2076 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2077 }
2078 else {
2079 dontbother = 0;
a0714e2c 2080 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2081 /* Trim the end. */
d6a28714 2082 char *last;
33b8afdf
JH
2083 SV* float_real;
2084
2085 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2086 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2087 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2088
2089 if (flags & REXEC_SCREAM) {
33b8afdf 2090 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2091 end_shift, &scream_pos, 1); /* last one */
2092 if (!last)
ffc61ed2 2093 last = scream_olds; /* Only one occurrence. */
4addbd3b 2094 /* we may be pointing at the wrong string */
07bc277f 2095 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2096 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2097 }
d6a28714
JH
2098 else {
2099 STRLEN len;
cfd0369c 2100 const char * const little = SvPV_const(float_real, len);
d6a28714 2101
33b8afdf 2102 if (SvTAIL(float_real)) {
d6a28714
JH
2103 if (memEQ(strend - len + 1, little, len - 1))
2104 last = strend - len + 1;
7fba1cd6 2105 else if (!multiline)
9041c2e3 2106 last = memEQ(strend - len, little, len)
bd61b366 2107 ? strend - len : NULL;
b8c5462f 2108 else
d6a28714
JH
2109 goto find_last;
2110 } else {
2111 find_last:
9041c2e3 2112 if (len)
d6a28714 2113 last = rninstr(s, strend, little, little + len);
b8c5462f 2114 else
a0288114 2115 last = strend; /* matching "$" */
b8c5462f 2116 }
b8c5462f 2117 }
bf93d4cc 2118 if (last == NULL) {
6bda09f9
YO
2119 DEBUG_EXECUTE_r(
2120 PerlIO_printf(Perl_debug_log,
2121 "%sCan't trim the tail, match fails (should not happen)%s\n",
2122 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2123 goto phooey; /* Should not happen! */
2124 }
d6a28714
JH
2125 dontbother = strend - last + prog->float_min_offset;
2126 }
2127 if (minlen && (dontbother < minlen))
2128 dontbother = minlen - 1;
2129 strend -= dontbother; /* this one's always in bytes! */
2130 /* We don't know much -- general case. */
1aa99e6b 2131 if (do_utf8) {
d6a28714 2132 for (;;) {
24b23f37 2133 if (regtry(&reginfo, &s))
d6a28714
JH
2134 goto got_it;
2135 if (s >= strend)
2136 break;
b8c5462f 2137 s += UTF8SKIP(s);
d6a28714
JH
2138 };
2139 }
2140 else {
2141 do {
24b23f37 2142 if (regtry(&reginfo, &s))
d6a28714
JH
2143 goto got_it;
2144 } while (s++ < strend);
2145 }
2146 }
2147
2148 /* Failure. */
2149 goto phooey;
2150
2151got_it:
e9105d30 2152 Safefree(swap);
288b8c02 2153 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2154
19b95bf0 2155 if (PL_reg_eval_set)
4f639d21 2156 restore_pos(aTHX_ prog);
5daac39c
NC
2157 if (RXp_PAREN_NAMES(prog))
2158 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2159
2160 /* make sure $`, $&, $', and $digit will work later */
2161 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2162 RX_MATCH_COPY_FREE(rx);
d6a28714 2163 if (flags & REXEC_COPY_STR) {
be8e71aa 2164 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2165#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2166 if ((SvIsCOW(sv)
2167 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2168 if (DEBUG_C_TEST) {
2169 PerlIO_printf(Perl_debug_log,
2170 "Copy on write: regexp capture, type %d\n",
2171 (int) SvTYPE(sv));
2172 }
2173 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2174 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2175 assert (SvPOKp(prog->saved_copy));
2176 } else
2177#endif
2178 {
288b8c02 2179 RX_MATCH_COPIED_on(rx);
ed252734
NC
2180 s = savepvn(strbeg, i);
2181 prog->subbeg = s;
2182 }
d6a28714 2183 prog->sublen = i;
d6a28714
JH
2184 }
2185 else {
2186 prog->subbeg = strbeg;
2187 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2188 }
2189 }
9041c2e3 2190
d6a28714
JH
2191 return 1;
2192
2193phooey:
a3621e74 2194 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2195 PL_colors[4], PL_colors[5]));
d6a28714 2196 if (PL_reg_eval_set)
4f639d21 2197 restore_pos(aTHX_ prog);
e9105d30 2198 if (swap) {
c74340f9 2199 /* we failed :-( roll it back */
e9105d30
GG
2200 Safefree(prog->offs);
2201 prog->offs = swap;
2202 }
2203
d6a28714
JH
2204 return 0;
2205}
2206
6bda09f9 2207
d6a28714
JH
2208/*
2209 - regtry - try match at specific point
2210 */
2211STATIC I32 /* 0 failure, 1 success */
24b23f37 2212S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2213{
97aff369 2214 dVAR;
d6a28714 2215 CHECKPOINT lastcp;
288b8c02
NC
2216 REGEXP *const rx = reginfo->prog;
2217 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2218 RXi_GET_DECL(prog,progi);
a3621e74 2219 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2220
2221 PERL_ARGS_ASSERT_REGTRY;
2222
24b23f37 2223 reginfo->cutpoint=NULL;
d6a28714 2224
bbe252da 2225 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2226 MAGIC *mg;
2227
2228 PL_reg_eval_set = RS_init;
a3621e74 2229 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2230 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2231 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2232 ));
ea8d6ae1 2233 SAVESTACK_CXPOS();
d6a28714
JH
2234 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2235 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2236 SAVETMPS;
2237 /* Apparently this is not needed, judging by wantarray. */
e8347627 2238 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2239 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2240
3b0527fe 2241 if (reginfo->sv) {
d6a28714 2242 /* Make $_ available to executed code. */
3b0527fe 2243 if (reginfo->sv != DEFSV) {
59f00321 2244 SAVE_DEFSV;
414bf5ae 2245 DEFSV_set(reginfo->sv);
b8c5462f 2246 }
d6a28714 2247
3b0527fe
DM
2248 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2249 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2250 /* prepare for quick setting of pos */
d300d9fa 2251#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2252 if (SvIsCOW(reginfo->sv))
2253 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2254#endif
3dab1dad 2255 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2256 &PL_vtbl_mglob, NULL, 0);
d6a28714 2257 mg->mg_len = -1;
b8c5462f 2258 }
d6a28714
JH
2259 PL_reg_magic = mg;
2260 PL_reg_oldpos = mg->mg_len;
4f639d21 2261 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2262 }
09687e5a 2263 if (!PL_reg_curpm) {
a02a5408 2264 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2265#ifdef USE_ITHREADS
2266 {
14a49a24 2267 SV* const repointer = &PL_sv_undef;
92313705
NC
2268 /* this regexp is also owned by the new PL_reg_curpm, which
2269 will try to free it. */
d2ece331 2270 av_push(PL_regex_padav, repointer);
09687e5a
AB
2271 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2272 PL_regex_pad = AvARRAY(PL_regex_padav);
2273 }
2274#endif
2275 }
86c29d75
NC
2276#ifdef USE_ITHREADS
2277 /* It seems that non-ithreads works both with and without this code.
2278 So for efficiency reasons it seems best not to have the code
2279 compiled when it is not needed. */
92313705
NC
2280 /* This is safe against NULLs: */
2281 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2282 /* PM_reg_curpm owns a reference to this regexp. */
2283 ReREFCNT_inc(rx);
86c29d75 2284#endif
288b8c02 2285 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2286 PL_reg_oldcurpm = PL_curpm;
2287 PL_curpm = PL_reg_curpm;
07bc277f 2288 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2289 /* Here is a serious problem: we cannot rewrite subbeg,
2290 since it may be needed if this match fails. Thus
2291 $` inside (?{}) could fail... */
2292 PL_reg_oldsaved = prog->subbeg;
2293 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2294#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2295 PL_nrs = prog->saved_copy;
2296#endif
07bc277f 2297 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2298 }
2299 else
bd61b366 2300 PL_reg_oldsaved = NULL;
d6a28714
JH
2301 prog->subbeg = PL_bostr;
2302 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2303 }
24b23f37 2304 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2305 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2306 PL_reginput = *startpos;
d6a28714 2307 PL_reglastparen = &prog->lastparen;
a01268b5 2308 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2309 prog->lastparen = 0;
03994de8 2310 prog->lastcloseparen = 0;
d6a28714 2311 PL_regsize = 0;
f0ab9afb 2312 PL_regoffs = prog->offs;
d6a28714
JH
2313 if (PL_reg_start_tmpl <= prog->nparens) {
2314 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2315 if(PL_reg_start_tmp)
2316 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2317 else
a02a5408 2318 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2319 }
2320
2321 /* XXXX What this code is doing here?!!! There should be no need
2322 to do this again and again, PL_reglastparen should take care of
3dd2943c 2323 this! --ilya*/
dafc8851
JH
2324
2325 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2326 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2327 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2328 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2329 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2330 * Meanwhile, this code *is* needed for the
daf18116
JH
2331 * above-mentioned test suite tests to succeed. The common theme
2332 * on those tests seems to be returning null fields from matches.
225593e1 2333 * --jhi updated by dapm */
dafc8851 2334#if 1
d6a28714 2335 if (prog->nparens) {
f0ab9afb 2336 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2337 register I32 i;
eb160463 2338 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2339 ++pp;
2340 pp->start = -1;
2341 pp->end = -1;
d6a28714
JH
2342 }
2343 }
dafc8851 2344#endif
02db2b7b 2345 REGCP_SET(lastcp);
f8fc2ecf 2346 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2347 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2348 return 1;
2349 }
24b23f37
YO
2350 if (reginfo->cutpoint)
2351 *startpos= reginfo->cutpoint;
02db2b7b 2352 REGCP_UNWIND(lastcp);
d6a28714
JH
2353 return 0;
2354}
2355
02db2b7b 2356
8ba1375e
MJD
2357#define sayYES goto yes
2358#define sayNO goto no
262b90c4 2359#define sayNO_SILENT goto no_silent
8ba1375e 2360
f9f4320a
YO
2361/* we dont use STMT_START/END here because it leads to
2362 "unreachable code" warnings, which are bogus, but distracting. */
2363#define CACHEsayNO \
c476f425
DM
2364 if (ST.cache_mask) \
2365 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2366 sayNO
3298f257 2367
a3621e74 2368/* this is used to determine how far from the left messages like
265c4333
YO
2369 'failed...' are printed. It should be set such that messages
2370 are inline with the regop output that created them.
a3621e74 2371*/
265c4333 2372#define REPORT_CODE_OFF 32
a3621e74
YO
2373
2374
2375/* Make sure there is a test for this +1 options in re_tests */
2376#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2377
40a82448
DM
2378#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2379#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2380
86545054
DM
2381#define SLAB_FIRST(s) (&(s)->states[0])
2382#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2383
5d9a96ca
DM
2384/* grab a new slab and return the first slot in it */
2385
2386STATIC regmatch_state *
2387S_push_slab(pTHX)
2388{
a35a87e7 2389#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2390 dMY_CXT;
2391#endif
5d9a96ca
DM
2392 regmatch_slab *s = PL_regmatch_slab->next;
2393 if (!s) {
2394 Newx(s, 1, regmatch_slab);
2395 s->prev = PL_regmatch_slab;
2396 s->next = NULL;
2397 PL_regmatch_slab->next = s;
2398 }
2399 PL_regmatch_slab = s;
86545054 2400 return SLAB_FIRST(s);
5d9a96ca 2401}
5b47454d 2402
95b24440 2403
40a82448
DM
2404/* push a new state then goto it */
2405
2406#define PUSH_STATE_GOTO(state, node) \
2407 scan = node; \
2408 st->resume_state = state; \
2409 goto push_state;
2410
2411/* push a new state with success backtracking, then goto it */
2412
2413#define PUSH_YES_STATE_GOTO(state, node) \
2414 scan = node; \
2415 st->resume_state = state; \
2416 goto push_yes_state;
2417
aa283a38 2418
aa283a38 2419
d6a28714 2420/*
95b24440 2421
bf1f174e
DM
2422regmatch() - main matching routine
2423
2424This is basically one big switch statement in a loop. We execute an op,
2425set 'next' to point the next op, and continue. If we come to a point which
2426we may need to backtrack to on failure such as (A|B|C), we push a
2427backtrack state onto the backtrack stack. On failure, we pop the top
2428state, and re-enter the loop at the state indicated. If there are no more
2429states to pop, we return failure.
2430
2431Sometimes we also need to backtrack on success; for example /A+/, where
2432after successfully matching one A, we need to go back and try to
2433match another one; similarly for lookahead assertions: if the assertion
2434completes successfully, we backtrack to the state just before the assertion
2435and then carry on. In these cases, the pushed state is marked as
2436'backtrack on success too'. This marking is in fact done by a chain of
2437pointers, each pointing to the previous 'yes' state. On success, we pop to
2438the nearest yes state, discarding any intermediate failure-only states.
2439Sometimes a yes state is pushed just to force some cleanup code to be
2440called at the end of a successful match or submatch; e.g. (??{$re}) uses
2441it to free the inner regex.
2442
2443Note that failure backtracking rewinds the cursor position, while
2444success backtracking leaves it alone.
2445
2446A pattern is complete when the END op is executed, while a subpattern
2447such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2448ops trigger the "pop to last yes state if any, otherwise return true"
2449behaviour.
2450
2451A common convention in this function is to use A and B to refer to the two
2452subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2453the subpattern to be matched possibly multiple times, while B is the entire
2454rest of the pattern. Variable and state names reflect this convention.
2455
2456The states in the main switch are the union of ops and failure/success of
2457substates associated with with that op. For example, IFMATCH is the op
2458that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2459'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2460successfully matched A and IFMATCH_A_fail is a state saying that we have
2461just failed to match A. Resume states always come in pairs. The backtrack
2462state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2463at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2464on success or failure.
2465
2466The struct that holds a backtracking state is actually a big union, with
2467one variant for each major type of op. The variable st points to the
2468top-most backtrack struct. To make the code clearer, within each
2469block of code we #define ST to alias the relevant union.
2470
2471Here's a concrete example of a (vastly oversimplified) IFMATCH
2472implementation:
2473
2474 switch (state) {
2475 ....
2476
2477#define ST st->u.ifmatch
2478
2479 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2480 ST.foo = ...; // some state we wish to save
95b24440 2481 ...
bf1f174e
DM
2482 // push a yes backtrack state with a resume value of
2483 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2484 // first node of A:
2485 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2486 // NOTREACHED
2487
2488 case IFMATCH_A: // we have successfully executed A; now continue with B
2489 next = B;
2490 bar = ST.foo; // do something with the preserved value
2491 break;
2492
2493 case IFMATCH_A_fail: // A failed, so the assertion failed
2494 ...; // do some housekeeping, then ...
2495 sayNO; // propagate the failure
2496
2497#undef ST
95b24440 2498
bf1f174e
DM
2499 ...
2500 }
95b24440 2501
bf1f174e
DM
2502For any old-timers reading this who are familiar with the old recursive
2503approach, the code above is equivalent to:
95b24440 2504
bf1f174e
DM
2505 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2506 {
2507 int foo = ...
95b24440 2508 ...
bf1f174e
DM
2509 if (regmatch(A)) {
2510 next = B;
2511 bar = foo;
2512 break;
95b24440 2513 }
bf1f174e
DM
2514 ...; // do some housekeeping, then ...
2515 sayNO; // propagate the failure
95b24440 2516 }
bf1f174e
DM
2517
2518The topmost backtrack state, pointed to by st, is usually free. If you
2519want to claim it, populate any ST.foo fields in it with values you wish to
2520save, then do one of
2521
2522 PUSH_STATE_GOTO(resume_state, node);
2523 PUSH_YES_STATE_GOTO(resume_state, node);
2524
2525which sets that backtrack state's resume value to 'resume_state', pushes a
2526new free entry to the top of the backtrack stack, then goes to 'node'.
2527On backtracking, the free slot is popped, and the saved state becomes the
2528new free state. An ST.foo field in this new top state can be temporarily
2529accessed to retrieve values, but once the main loop is re-entered, it
2530becomes available for reuse.
2531
2532Note that the depth of the backtrack stack constantly increases during the
2533left-to-right execution of the pattern, rather than going up and down with
2534the pattern nesting. For example the stack is at its maximum at Z at the
2535end of the pattern, rather than at X in the following:
2536
2537 /(((X)+)+)+....(Y)+....Z/
2538
2539The only exceptions to this are lookahead/behind assertions and the cut,
2540(?>A), which pop all the backtrack states associated with A before
2541continuing.
2542
2543Bascktrack state structs are allocated in slabs of about 4K in size.
2544PL_regmatch_state and st always point to the currently active state,
2545and PL_regmatch_slab points to the slab currently containing
2546PL_regmatch_state. The first time regmatch() is called, the first slab is
2547allocated, and is never freed until interpreter destruction. When the slab
2548is full, a new one is allocated and chained to the end. At exit from
2549regmatch(), slabs allocated since entry are freed.
2550
2551*/
95b24440 2552
40a82448 2553
5bc10b2c 2554#define DEBUG_STATE_pp(pp) \
265c4333 2555 DEBUG_STATE_r({ \
5bc10b2c
DM
2556 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2557 PerlIO_printf(Perl_debug_log, \
5d458dd8 2558 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2559 depth*2, "", \
13d6edb4 2560 PL_reg_name[st->resume_state], \
5d458dd8
YO
2561 ((st==yes_state||st==mark_state) ? "[" : ""), \
2562 ((st==yes_state) ? "Y" : ""), \
2563 ((st==mark_state) ? "M" : ""), \
2564 ((st==yes_state||st==mark_state) ? "]" : "") \
2565 ); \
265c4333 2566 });
5bc10b2c 2567
40a82448 2568
3dab1dad 2569#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2570
3df15adc 2571#ifdef DEBUGGING
5bc10b2c 2572
ab3bbdeb 2573STATIC void
efd26800 2574S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
ab3bbdeb
YO
2575 const char *start, const char *end, const char *blurb)
2576{
efd26800 2577 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2578
2579 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2580
ab3bbdeb
YO
2581 if (!PL_colorset)
2582 reginitcolors();
2583 {
2584 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2585 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb
YO
2586
2587 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2588 start, end - start, 60);
2589
2590 PerlIO_printf(Perl_debug_log,
2591 "%s%s REx%s %s against %s\n",
2592 PL_colors[4], blurb, PL_colors[5], s0, s1);
2593
2594 if (do_utf8||utf8_pat)
1de06328
YO
2595 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2596 utf8_pat ? "pattern" : "",
2597 utf8_pat && do_utf8 ? " and " : "",
2598 do_utf8 ? "string" : ""
ab3bbdeb
YO
2599 );
2600 }
2601}
3df15adc
YO
2602
2603STATIC void
786e8c11
YO
2604S_dump_exec_pos(pTHX_ const char *locinput,
2605 const regnode *scan,
2606 const char *loc_regeol,
2607 const char *loc_bostr,
2608 const char *loc_reg_starttry,
2609 const bool do_utf8)
07be1b83 2610{
786e8c11 2611 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2612 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2613 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2614 /* The part of the string before starttry has one color
2615 (pref0_len chars), between starttry and current
2616 position another one (pref_len - pref0_len chars),
2617 after the current position the third one.
2618 We assume that pref0_len <= pref_len, otherwise we
2619 decrease pref0_len. */
786e8c11
YO
2620 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2621 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2622 int pref0_len;
2623
7918f24d
NC
2624 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2625
07be1b83
YO
2626 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2627 pref_len++;
786e8c11
YO
2628 pref0_len = pref_len - (locinput - loc_reg_starttry);
2629 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2630 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2631 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2632 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2633 l--;
2634 if (pref0_len < 0)
2635 pref0_len = 0;
2636 if (pref0_len > pref_len)
2637 pref0_len = pref_len;
2638 {
3df15adc 2639 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2640
ab3bbdeb 2641 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2642 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2643
ab3bbdeb 2644 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2645 (locinput - pref_len + pref0_len),
1de06328 2646 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2647
ab3bbdeb 2648 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2649 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2650
1de06328 2651 const STRLEN tlen=len0+len1+len2;
3df15adc 2652 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2653 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2654 (IV)(locinput - loc_bostr),
07be1b83 2655 len0, s0,
07be1b83 2656 len1, s1,
07be1b83 2657 (docolor ? "" : "> <"),
07be1b83 2658 len2, s2,
f9f4320a 2659 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2660 "");
2661 }
2662}
3df15adc 2663
07be1b83
YO
2664#endif
2665
0a4db386
YO
2666/* reg_check_named_buff_matched()
2667 * Checks to see if a named buffer has matched. The data array of
2668 * buffer numbers corresponding to the buffer is expected to reside
2669 * in the regexp->data->data array in the slot stored in the ARG() of
2670 * node involved. Note that this routine doesn't actually care about the
2671 * name, that information is not preserved from compilation to execution.
2672 * Returns the index of the leftmost defined buffer with the given name
2673 * or 0 if non of the buffers matched.
2674 */
2675STATIC I32
7918f24d
NC
2676S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2677{
0a4db386 2678 I32 n;
f8fc2ecf 2679 RXi_GET_DECL(rex,rexi);
ad64d0ec 2680 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2681 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2682
2683 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2684
0a4db386
YO
2685 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2686 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2687 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2688 {
2689 return nums[n];
2690 }
2691 }
2692 return 0;
2693}
2694
2f554ef7
DM
2695
2696/* free all slabs above current one - called during LEAVE_SCOPE */
2697
2698STATIC void
2699S_clear_backtrack_stack(pTHX_ void *p)
2700{
2701 regmatch_slab *s = PL_regmatch_slab->next;
2702 PERL_UNUSED_ARG(p);
2703
2704 if (!s)
2705 return;
2706 PL_regmatch_slab->next = NULL;
2707 while (s) {
2708 regmatch_slab * const osl = s;
2709 s = s->next;
2710 Safefree(osl);
2711 }
2712}
2713
2714
28d8d7f4
YO
2715#define SETREX(Re1,Re2) \
2716 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2717 Re1 = (Re2)
2718
d6a28714 2719STATIC I32 /* 0 failure, 1 success */
24b23f37 2720S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2721{
a35a87e7 2722#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2723 dMY_CXT;
2724#endif
27da23d5 2725 dVAR;
95b24440 2726 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2727 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2728 REGEXP *rex_sv = reginfo->prog;
2729 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2730 RXi_GET_DECL(rex,rexi);
2f554ef7 2731 I32 oldsave;
5d9a96ca
DM
2732 /* the current state. This is a cached copy of PL_regmatch_state */
2733 register regmatch_state *st;
5d9a96ca
DM
2734 /* cache heavy used fields of st in registers */
2735 register regnode *scan;
2736 register regnode *next;
438e9bae 2737 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2738 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2739 register char *locinput = PL_reginput;
5d9a96ca 2740 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2741
b69b0499 2742 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2743 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2744 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2745 const U32 max_nochange_depth =
2746 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2747 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
2748 regmatch_state *yes_state = NULL; /* state to pop to on success of
2749 subpattern */
e2e6a0f1
YO
2750 /* mark_state piggy backs on the yes_state logic so that when we unwind
2751 the stack on success we can update the mark_state as we go */
2752 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 2753 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2754 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2755 U32 state_num;
5d458dd8
YO
2756 bool no_final = 0; /* prevent failure from backtracking? */
2757 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2758 char *startpoint = PL_reginput;
5d458dd8
YO
2759 SV *popmark = NULL; /* are we looking for a mark? */
2760 SV *sv_commit = NULL; /* last mark name seen in failure */
2761 SV *sv_yes_mark = NULL; /* last mark name we have seen
2762 during a successfull match */
2763 U32 lastopen = 0; /* last open we saw */
2764 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 2765 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
2766 /* these three flags are set by various ops to signal information to
2767 * the very next op. They have a useful lifetime of exactly one loop
2768 * iteration, and are not preserved or restored by state pushes/pops
2769 */
2770 bool sw = 0; /* the condition value in (?(cond)a|b) */
2771 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2772 int logical = 0; /* the following EVAL is:
2773 0: (?{...})
2774 1: (?(?{...})X|Y)
2775 2: (??{...})
2776 or the following IFMATCH/UNLESSM is:
2777 false: plain (?=foo)
2778 true: used as a condition: (?(?=foo))
2779 */
95b24440 2780#ifdef DEBUGGING
e68ec53f 2781 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2782#endif
2783
7918f24d
NC
2784 PERL_ARGS_ASSERT_REGMATCH;
2785
3b57cd43 2786 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2787 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2788 }));
5d9a96ca
DM
2789 /* on first ever call to regmatch, allocate first slab */
2790 if (!PL_regmatch_slab) {
2791 Newx(PL_regmatch_slab, 1, regmatch_slab);
2792 PL_regmatch_slab->prev = NULL;
2793 PL_regmatch_slab->next = NULL;
86545054 2794 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2795 }
2796
2f554ef7
DM
2797 oldsave = PL_savestack_ix;
2798 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2799 SAVEVPTR(PL_regmatch_slab);
2800 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
2801
2802 /* grab next free state slot */
2803 st = ++PL_regmatch_state;
86545054 2804 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2805 st = PL_regmatch_state = S_push_slab(aTHX);
2806
d6a28714
JH
2807 /* Note that nextchr is a byte even in UTF */
2808 nextchr = UCHARAT(locinput);
2809 scan = prog;
2810 while (scan != NULL) {
8ba1375e 2811
a3621e74 2812 DEBUG_EXECUTE_r( {
6136c704 2813 SV * const prop = sv_newmortal();
1de06328 2814 regnode *rnext=regnext(scan);
786e8c11 2815 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2816 regprop(rex, prop, scan);
07be1b83
YO
2817
2818 PerlIO_printf(Perl_debug_log,
2819 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2820 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2821 SvPVX_const(prop),
1de06328 2822 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2823 0 : (IV)(rnext - rexi->program));
2a782b5b 2824 });
d6a28714
JH
2825
2826 next = scan + NEXT_OFF(scan);
2827 if (next == scan)
2828 next = NULL;
40a82448 2829 state_num = OP(scan);
d6a28714 2830
40a82448 2831 reenter_switch:
34a81e2b
B
2832
2833 assert(PL_reglastparen == &rex->lastparen);
2834 assert(PL_reglastcloseparen == &rex->lastcloseparen);
2835 assert(PL_regoffs == rex->offs);
2836
40a82448 2837 switch (state_num) {
d6a28714 2838 case BOL:
7fba1cd6 2839 if (locinput == PL_bostr)
d6a28714 2840 {
3b0527fe 2841 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2842 break;
2843 }
d6a28714
JH
2844 sayNO;
2845 case MBOL:
12d33761
HS
2846 if (locinput == PL_bostr ||
2847 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2848 {
b8c5462f
JH
2849 break;
2850 }
d6a28714
JH
2851 sayNO;
2852 case SBOL:
c2a73568 2853 if (locinput == PL_bostr)
b8c5462f 2854 break;
d6a28714
JH
2855 sayNO;
2856 case GPOS:
3b0527fe 2857 if (locinput == reginfo->ganch)
d6a28714
JH
2858 break;
2859 sayNO;
ee9b8eae
YO
2860
2861 case KEEPS:
2862 /* update the startpoint */
f0ab9afb 2863 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 2864 PL_reginput = locinput;
f0ab9afb 2865 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
2866 PUSH_STATE_GOTO(KEEPS_next, next);
2867 /*NOT-REACHED*/
2868 case KEEPS_next_fail:
2869 /* rollback the start point change */
f0ab9afb 2870 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
2871 sayNO_SILENT;
2872 /*NOT-REACHED*/
d6a28714 2873 case EOL:
d6a28714
JH
2874 goto seol;
2875 case MEOL:
d6a28714 2876 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2877 sayNO;
b8c5462f 2878 break;
d6a28714
JH
2879 case SEOL:
2880 seol:
2881 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2882 sayNO;
d6a28714 2883 if (PL_regeol - locinput > 1)
b8c5462f 2884 sayNO;
b8c5462f 2885 break;
d6a28714
JH
2886 case EOS:
2887 if (PL_regeol != locinput)
b8c5462f 2888 sayNO;
d6a28714 2889 break;
ffc61ed2 2890 case SANY:
d6a28714 2891 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2892 sayNO;
f33976b4
DB
2893 if (do_utf8) {
2894 locinput += PL_utf8skip[nextchr];
2895 if (locinput > PL_regeol)
2896 sayNO;
2897 nextchr = UCHARAT(locinput);
2898 }
2899 else
2900 nextchr = UCHARAT(++locinput);
2901 break;
2902 case CANY:
2903 if (!nextchr && locinput >= PL_regeol)
2904 sayNO;
b8c5462f 2905 nextchr = UCHARAT(++locinput);
a0d0e21e 2906 break;
ffc61ed2 2907 case REG_ANY:
1aa99e6b
IH
2908 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2909 sayNO;
2910 if (do_utf8) {
b8c5462f 2911 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2912 if (locinput > PL_regeol)
2913 sayNO;
a0ed51b3 2914 nextchr = UCHARAT(locinput);
a0ed51b3 2915 }
1aa99e6b
IH
2916 else
2917 nextchr = UCHARAT(++locinput);
a0ed51b3 2918 break;
166ba7cd
DM
2919
2920#undef ST
2921#define ST st->u.trie
786e8c11
YO
2922 case TRIEC:
2923 /* In this case the charclass data is available inline so
2924 we can fail fast without a lot of extra overhead.
2925 */
2926 if (scan->flags == EXACT || !do_utf8) {
2927 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2928 DEBUG_EXECUTE_r(
2929 PerlIO_printf(Perl_debug_log,
2930 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2931 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2932 );
2933 sayNO_SILENT;
2934 /* NOTREACHED */
2935 }
2936 }
2937 /* FALL THROUGH */
5b47454d 2938 case TRIE:
3dab1dad 2939 {
07be1b83 2940 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 2941 DECL_TRIE_TYPE(scan);
3dab1dad
YO
2942
2943 /* what trie are we using right now */
be8e71aa 2944 reg_trie_data * const trie
f8fc2ecf 2945 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 2946 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 2947 U32 state = trie->startstate;
166ba7cd 2948
3dab1dad
YO
2949 if (trie->bitmap && trie_type != trie_utf8_fold &&
2950 !TRIE_BITMAP_TEST(trie,*locinput)
2951 ) {
2952 if (trie->states[ state ].wordnum) {
2953 DEBUG_EXECUTE_r(
2954 PerlIO_printf(Perl_debug_log,
2955 "%*s %smatched empty string...%s\n",
5bc10b2c 2956 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2957 );
2958 break;
2959 } else {
2960 DEBUG_EXECUTE_r(
2961 PerlIO_printf(Perl_debug_log,
786e8c11 2962 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2963 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2964 );
2965 sayNO_SILENT;
2966 }
2967 }
166ba7cd 2968
786e8c11
YO
2969 {
2970 U8 *uc = ( U8* )locinput;
2971
2972 STRLEN len = 0;
2973 STRLEN foldlen = 0;
2974 U8 *uscan = (U8*)NULL;
2975 STRLEN bufflen=0;
2976 SV *sv_accept_buff = NULL;
2977 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2978
2979 ST.accepted = 0; /* how many accepting states we have seen */
2980 ST.B = next;
2981 ST.jump = trie->jump;
786e8c11 2982 ST.me = scan;
07be1b83
YO
2983 /*
2984 traverse the TRIE keeping track of all accepting states
2985 we transition through until we get to a failing node.
2986 */
2987
a3621e74 2988 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2989 U32 base = trie->states[ state ].trans.base;
f9f4320a 2990 UV uvc = 0;
786e8c11
YO
2991 U16 charid;
2992 /* We use charid to hold the wordnum as we don't use it
2993 for charid until after we have done the wordnum logic.
2994 We define an alias just so that the wordnum logic reads
2995 more naturally. */
2996
2997#define got_wordnum charid
2998 got_wordnum = trie->states[ state ].wordnum;
2999
3000 if ( got_wordnum ) {
3001 if ( ! ST.accepted ) {
5b47454d 3002 ENTER;
6962fb1a 3003 SAVETMPS; /* XXX is this necessary? dmq */
5b47454d
DM
3004 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3005 sv_accept_buff=newSV(bufflen *
3006 sizeof(reg_trie_accepted) - 1);
786e8c11 3007 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
3008 SvPOK_on(sv_accept_buff);
3009 sv_2mortal(sv_accept_buff);
166ba7cd
DM
3010 SAVETMPS;
3011 ST.accept_buff =
5b47454d
DM
3012 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3013 }
786e8c11 3014 do {
166ba7cd 3015 if (ST.accepted >= bufflen) {
5b47454d 3016 bufflen *= 2;
166ba7cd 3017 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
3018 SvGROW(sv_accept_buff,
3019 bufflen * sizeof(reg_trie_accepted));
3020 }
3021 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3022 + sizeof(reg_trie_accepted));
a3621e74 3023
786e8c11
YO
3024
3025 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3026 ST.accept_buff[ST.accepted].endpos = uc;
3027 ++ST.accepted;
3028 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3029 }
3030#undef got_wordnum
a3621e74 3031
07be1b83 3032 DEBUG_TRIE_EXECUTE_r({
786e8c11 3033 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 3034 PerlIO_printf( Perl_debug_log,
786e8c11 3035 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 3036 2+depth * 2, "", PL_colors[4],
786e8c11 3037 (UV)state, (UV)ST.accepted );
07be1b83 3038 });
a3621e74
YO
3039
3040 if ( base ) {
55eed653
NC
3041 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3042 uscan, len, uvc, charid, foldlen,
3043 foldbuf, uniflags);
a3621e74 3044
5b47454d
DM
3045 if (charid &&
3046 (base + charid > trie->uniquecharcount )
3047 && (base + charid - 1 - trie->uniquecharcount
3048 < trie->lasttrans)
3049 && trie->trans[base + charid - 1 -
3050 trie->uniquecharcount].check == state)
3051 {
3052 state = trie->trans[base + charid - 1 -
3053 trie->uniquecharcount ].next;
3054 }
3055 else {
3056 state = 0;
3057 }
3058 uc += len;
3059
3060 }
3061 else {
a3621e74
YO
3062 state = 0;
3063 }
3064 DEBUG_TRIE_EXECUTE_r(
e4584336 3065 PerlIO_printf( Perl_debug_log,
786e8c11 3066 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3067 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3068 );
3069 }
166ba7cd 3070 if (!ST.accepted )
a3621e74 3071 sayNO;
a3621e74 3072
166ba7cd
DM
3073 DEBUG_EXECUTE_r(
3074 PerlIO_printf( Perl_debug_log,
3075 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3076 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3077 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3078 );
786e8c11 3079 }}
fae667d5
YO
3080 goto trie_first_try; /* jump into the fail handler */
3081 /* NOTREACHED */
166ba7cd 3082 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
3083 if ( ST.jump) {
3084 REGCP_UNWIND(ST.cp);
3085 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3086 PL_regoffs[n].end = -1;
fae667d5
YO
3087 *PL_reglastparen = n;
3088 }
3089 trie_first_try:
5d458dd8
YO
3090 if (do_cutgroup) {
3091 do_cutgroup = 0;
3092 no_final = 0;
3093 }
fae667d5
YO
3094
3095 if ( ST.jump) {
3096 ST.lastparen = *PL_reglastparen;
3097 REGCP_SET(ST.cp);
3098 }
166ba7cd
DM
3099 if ( ST.accepted == 1 ) {
3100 /* only one choice left - just continue */
3101 DEBUG_EXECUTE_r({
2b8b4781 3102 AV *const trie_words
502c6561 3103 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3104 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3105 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3106 SV *sv= tmp ? sv_newmortal() : NULL;
3107
166ba7cd
DM
3108 PerlIO_printf( Perl_debug_log,
3109 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3110 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3111 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3112 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3113 PL_colors[0], PL_colors[1],
3114 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3115 )
3116 : "not compiled under -Dr",
166ba7cd
DM
3117 PL_colors[5] );
3118 });
3119 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3120 /* in this case we free tmps/leave before we call regmatch
3121 as we wont be using accept_buff again. */
5d458dd8 3122
166ba7cd
DM
3123 locinput = PL_reginput;
3124 nextchr = UCHARAT(locinput);
5d458dd8
YO
3125 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3126 scan = ST.B;
3127 else
3128 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3129 if (!has_cutgroup) {
3130 FREETMPS;
3131 LEAVE;
3132 } else {
3133 ST.accepted--;
3134 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3135 }
786e8c11 3136
166ba7cd
DM
3137 continue; /* execute rest of RE */
3138 }
fae667d5
YO
3139
3140 if ( !ST.accepted-- ) {
5d458dd8
YO
3141 DEBUG_EXECUTE_r({
3142 PerlIO_printf( Perl_debug_log,
3143 "%*s %sTRIE failed...%s\n",
3144 REPORT_CODE_OFF+depth*2, "",
3145 PL_colors[4],
3146 PL_colors[5] );
3147 });
166ba7cd
DM
3148 FREETMPS;
3149 LEAVE;
5d458dd8 3150 sayNO_SILENT;
fae667d5
YO
3151 /*NOTREACHED*/
3152 }
166ba7cd 3153
a3621e74 3154 /*
166ba7cd
DM
3155 There are at least two accepting states left. Presumably
3156 the number of accepting states is going to be low,
3157 typically two. So we simply scan through to find the one
3158 with lowest wordnum. Once we find it, we swap the last
3159 state into its place and decrement the size. We then try to
3160 match the rest of the pattern at the point where the word
3161 ends. If we succeed, control just continues along the
3162 regex; if we fail we return here to try the next accepting
3163 state
3164 */
a3621e74 3165
166ba7cd
DM
3166 {
3167 U32 best = 0;
3168 U32 cur;
3169 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3170 DEBUG_TRIE_EXECUTE_r(
f2278c82 3171 PerlIO_printf( Perl_debug_log,
166ba7cd 3172 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3173 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3174 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3175 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3176 );
3177
3178 if (ST.accept_buff[cur].wordnum <
3179 ST.accept_buff[best].wordnum)
3180 best = cur;
a3621e74 3181 }
166ba7cd
DM
3182
3183 DEBUG_EXECUTE_r({
2b8b4781 3184 AV *const trie_words
502c6561 3185 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3186 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3187 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3188 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3189 ST.B :
7f69552c 3190 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3191 SV *sv= tmp ? sv_newmortal() : NULL;
3192
265c4333
YO
3193 PerlIO_printf( Perl_debug_log,
3194 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3195 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3196 ST.accept_buff[best].wordnum,
de734bd5
A
3197 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3198 PL_colors[0], PL_colors[1],
3199 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3200 ) : "not compiled under -Dr",
265c4333 3201 REG_NODE_NUM(nextop),
166ba7cd
DM
3202 PL_colors[5] );
3203 });
3204
3205 if ( best<ST.accepted ) {
3206 reg_trie_accepted tmp = ST.accept_buff[ best ];
3207 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3208 ST.accept_buff[ ST.accepted ] = tmp;
3209 best = ST.accepted;
a3621e74 3210 }
166ba7cd 3211 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3212 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3213 scan = ST.B;
786e8c11 3214 } else {
5d458dd8 3215 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
5d458dd8 3216 }
6b173516 3217 PUSH_YES_STATE_GOTO(TRIE_next, scan);
786e8c11 3218 /* NOTREACHED */
166ba7cd 3219 }
166ba7cd 3220 /* NOTREACHED */
5d458dd8 3221 case TRIE_next:
6962fb1a
YO
3222 /* we dont want to throw this away, see bug 57042*/
3223 if (oreplsv != GvSV(PL_replgv))
3224 sv_setsv(oreplsv, GvSV(PL_replgv));
5d458dd8
YO
3225 FREETMPS;
3226 LEAVE;
3227 sayYES;
166ba7cd
DM
3228#undef ST
3229
95b24440
DM
3230 case EXACT: {
3231 char *s = STRING(scan);
24d3c4a9 3232 ln = STR_LEN(scan);
eb160463 3233 if (do_utf8 != UTF) {
bc517b45 3234 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3235 char *l = locinput;
24d3c4a9 3236 const char * const e = s + ln;
a72c7584 3237
5ff6fc6d
JH
3238 if (do_utf8) {
3239 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3240 while (s < e) {
a3b680e6 3241 STRLEN ulen;
1aa99e6b 3242 if (l >= PL_regeol)
5ff6fc6d
JH
3243 sayNO;
3244 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3245 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3246 uniflags))
5ff6fc6d 3247 sayNO;
bc517b45 3248 l += ulen;
5ff6fc6d 3249 s ++;
1aa99e6b 3250 }
5ff6fc6d
JH
3251 }
3252 else {
3253 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3254 while (s < e) {
a3b680e6 3255 STRLEN ulen;
1aa99e6b
IH
3256 if (l >= PL_regeol)
3257 sayNO;
5ff6fc6d 3258 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3259 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3260 uniflags))
1aa99e6b 3261 sayNO;
bc517b45 3262 s += ulen;
a72c7584 3263 l ++;
1aa99e6b 3264 }
5ff6fc6d 3265 }
1aa99e6b
IH
3266 locinput = l;
3267 nextchr = UCHARAT(locinput);
3268 break;
3269 }
bc517b45 3270 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3271 /* Inline the first character, for speed. */
3272 if (UCHARAT(s) != nextchr)
3273 sayNO;
24d3c4a9 3274 if (PL_regeol - locinput < ln)
d6a28714 3275 sayNO;
24d3c4a9 3276 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3277 sayNO;
24d3c4a9 3278 locinput += ln;
d6a28714
JH
3279 nextchr = UCHARAT(locinput);
3280 break;
95b24440 3281 }
d6a28714 3282 case EXACTFL:
b8c5462f
JH
3283 PL_reg_flags |= RF_tainted;
3284 /* FALL THROUGH */
95b24440 3285 case EXACTF: {
be8e71aa 3286 char * const s = STRING(scan);
24d3c4a9 3287 ln = STR_LEN(scan);
d6a28714 3288
d07ddd77
JH
3289 if (do_utf8 || UTF) {
3290 /* Either target or the pattern are utf8. */
be8e71aa 3291 const char * const l = locinput;
d07ddd77 3292 char *e = PL_regeol;
bc517b45 3293
24d3c4a9 3294 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3295 l, &e, 0, do_utf8)) {
5486206c
JH
3296 /* One more case for the sharp s:
3297 * pack("U0U*", 0xDF) =~ /ss/i,
3298 * the 0xC3 0x9F are the UTF-8
3299 * byte sequence for the U+00DF. */
e1d1eefb 3300
5486206c 3301 if (!(do_utf8 &&
e1d1eefb 3302 toLOWER(s[0]) == 's' &&
24d3c4a9 3303 ln >= 2 &&
5486206c
JH
3304 toLOWER(s[1]) == 's' &&
3305 (U8)l[0] == 0xC3 &&
3306 e - l >= 2 &&
3307 (U8)l[1] == 0x9F))
3308 sayNO;
3309 }
d07ddd77
JH
3310 locinput = e;
3311 nextchr = UCHARAT(locinput);
3312 break;
a0ed51b3 3313 }
d6a28714 3314
bc517b45
JH
3315 /* Neither the target and the pattern are utf8. */
3316
d6a28714
JH
3317 /* Inline the first character, for speed. */
3318 if (UCHARAT(s) != nextchr &&
3319 UCHARAT(s) != ((OP(scan) == EXACTF)
3320 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3321 sayNO;
24d3c4a9 3322 if (PL_regeol - locinput < ln)
b8c5462f 3323 sayNO;
24d3c4a9
DM
3324 if (ln > 1 && (OP(scan) == EXACTF
3325 ? ibcmp(s, locinput, ln)
3326 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3327 sayNO;
24d3c4a9 3328 locinput += ln;
d6a28714 3329 nextchr = UCHARAT(locinput);
a0d0e21e 3330 break;
95b24440 3331 }
d6a28714 3332 case ANYOF:
ffc61ed2 3333 if (do_utf8) {
9e55ce06
JH
3334 STRLEN inclasslen = PL_regeol - locinput;
3335
32fc9b6a 3336 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3337 goto anyof_fail;
ffc61ed2
JH
3338 if (locinput >= PL_regeol)
3339 sayNO;
0f0076b4 3340 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3341 nextchr = UCHARAT(locinput);
e0f9d4a8 3342 break;
ffc61ed2
JH
3343 }
3344 else {
3345 if (nextchr < 0)
3346 nextchr = UCHARAT(locinput);
32fc9b6a 3347 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3348 goto anyof_fail;
ffc61ed2
JH
3349 if (!nextchr && locinput >= PL_regeol)
3350 sayNO;
3351 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3352 break;
3353 }
262b90c4 3354 anyof_fail:
e0f9d4a8
JH
3355 /* If we might have the case of the German sharp s
3356 * in a casefolding Unicode character class. */
3357
ebc501f0
JH
3358 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3359 locinput += SHARP_S_SKIP;
e0f9d4a8 3360 nextchr = UCHARAT(locinput);
ffc61ed2 3361 }
e0f9d4a8
JH
3362 else
3363 sayNO;
b8c5462f 3364 break;
d6a28714 3365 case ALNUML:
b8c5462f
JH
3366 PL_reg_flags |= RF_tainted;
3367 /* FALL THROUGH */
d6a28714 3368 case ALNUM:
b8c5462f 3369 if (!nextchr)
4633a7c4 3370 sayNO;
ffc61ed2 3371 if (do_utf8) {
1a4fad37 3372 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3373 if (!(OP(scan) == ALNUM
bb7a0f54 3374 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3375 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3376 {
3377 sayNO;
a0ed51b3 3378 }
b8c5462f 3379 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3380 nextchr = UCHARAT(locinput);
3381 break;
3382 }
ffc61ed2 3383 if (!(OP(scan) == ALNUM
d6a28714 3384 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3385 sayNO;
b8c5462f 3386 nextchr = UCHARAT(++locinput);
a0d0e21e 3387 break;
d6a28714 3388 case NALNUML:
b8c5462f
JH
3389 PL_reg_flags |= RF_tainted;
3390 /* FALL THROUGH */
d6a28714
JH
3391 case NALNUM:
3392 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3393 sayNO;
ffc61ed2 3394 if (do_utf8) {
1a4fad37 3395 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3396 if (OP(scan) == NALNUM
bb7a0f54 3397 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3398 : isALNUM_LC_utf8((U8*)locinput))
3399 {
b8c5462f 3400 sayNO;
d6a28714 3401 }
b8c5462f
JH
3402 locinput += PL_utf8skip[nextchr];
3403 nextchr = UCHARAT(locinput);
3404 break;
3405 }
ffc61ed2 3406 if (OP(scan) == NALNUM
d6a28714 3407 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3408 sayNO;
76e3520e 3409 nextchr = UCHARAT(++locinput);
a0d0e21e 3410 break;
d6a28714
JH
3411 case BOUNDL:
3412 case NBOUNDL:
3280af22 3413 PL_reg_flags |= RF_tainted;
bbce6d69 3414 /* FALL THROUGH */
d6a28714
JH
3415 case BOUND:
3416 case NBOUND:
3417 /* was last char in word? */
ffc61ed2 3418 if (do_utf8) {
12d33761 3419 if (locinput == PL_bostr)
24d3c4a9 3420 ln = '\n';
ffc61ed2 3421 else {
a3b680e6 3422 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3423
24d3c4a9 3424 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3425 }
3426 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3427 ln = isALNUM_uni(ln);
1a4fad37 3428 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3429 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3430 }
3431 else {
24d3c4a9 3432 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3433 n = isALNUM_LC_utf8((U8*)locinput);
3434 }
a0ed51b3 3435 }
d6a28714 3436 else {
24d3c4a9 3437 ln = (locinput != PL_bostr) ?
12d33761 3438 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3439 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3440 ln = isALNUM(ln);
ffc61ed2
JH
3441 n = isALNUM(nextchr);
3442 }
3443 else {
24d3c4a9 3444 ln = isALNUM_LC(ln);
ffc61ed2
JH
3445 n = isALNUM_LC(nextchr);
3446 }
d6a28714 3447 }
24d3c4a9 3448 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3449 OP(scan) == BOUNDL))
3450 sayNO;
a0ed51b3 3451 break;
d6a28714 3452 case SPACEL:
3280af22 3453 PL_reg_flags |= RF_tainted;
bbce6d69 3454 /* FALL THROUGH */
d6a28714 3455 case SPACE:
9442cb0e 3456 if (!nextchr)
4633a7c4 3457 sayNO;
1aa99e6b 3458 if (do_utf8) {
fd400ab9 3459 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3460 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3461 if (!(OP(scan) == SPACE
bb7a0f54 3462 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3463 : isSPACE_LC_utf8((U8*)locinput)))
3464 {
3465 sayNO;
3466 }
3467 locinput += PL_utf8skip[nextchr];
3468 nextchr = UCHARAT(locinput);
3469 break;
d6a28714 3470 }
ffc61ed2
JH
3471 if (!(OP(scan) == SPACE
3472 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3473 sayNO;
3474 nextchr = UCHARAT(++locinput);
3475 }
3476 else {
3477 if (!(OP(scan) == SPACE
3478 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3479 sayNO;
3480 nextchr = UCHARAT(++locinput);
a0ed51b3 3481 }
a0ed51b3 3482 break;
d6a28714 3483 case NSPACEL:
3280af22 3484 PL_reg_flags |= RF_tainted;
bbce6d69 3485 /* FALL THROUGH */
d6a28714 3486 case NSPACE:
9442cb0e 3487 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3488 sayNO;
1aa99e6b 3489 if (do_utf8) {
1a4fad37 3490 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3491 if (OP(scan) == NSPACE
bb7a0f54 3492 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3493 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3494 {
3495 sayNO;
3496 }
3497 locinput += PL_utf8skip[nextchr];
3498 nextchr = UCHARAT(locinput);
3499 break;
a0ed51b3 3500 }
ffc61ed2 3501 if (OP(scan) == NSPACE
d6a28714 3502 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3503 sayNO;
76e3520e 3504 nextchr = UCHARAT(++locinput);
a0d0e21e 3505 break;
d6a28714 3506 case DIGITL:
a0ed51b3
LW
3507 PL_reg_flags |= RF_tainted;
3508 /* FALL THROUGH */
d6a28714 3509 case DIGIT:
9442cb0e 3510 if (!nextchr)
a0ed51b3 3511 sayNO;
1aa99e6b 3512 if (do_utf8) {
1a4fad37 3513 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3514 if (!(OP(scan) == DIGIT
bb7a0f54 3515 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3516 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3517 {
a0ed51b3 3518 sayNO;
dfe13c55 3519 }
6f06b55f 3520 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3521 nextchr = UCHARAT(locinput);
3522 break;
3523 }
ffc61ed2 3524 if (!(OP(scan) == DIGIT
9442cb0e 3525 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3526 sayNO;
3527 nextchr = UCHARAT(++locinput);
3528 break;
d6a28714 3529 case NDIGITL:
b8c5462f
JH
3530 PL_reg_flags |= RF_tainted;
3531 /* FALL THROUGH */
d6a28714 3532 case NDIGIT:
9442cb0e 3533 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3534 sayNO;
1aa99e6b 3535 if (do_utf8) {
1a4fad37 3536 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3537 if (OP(scan) == NDIGIT
bb7a0f54 3538 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3539 : isDIGIT_LC_utf8((U8*)locinput))
3540 {
a0ed51b3 3541 sayNO;
9442cb0e 3542 }
6f06b55f 3543 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3544 nextchr = UCHARAT(locinput);
3545 break;
3546 }
ffc61ed2 3547 if (OP(scan) == NDIGIT
9442cb0e 3548 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3549 sayNO;
3550 nextchr = UCHARAT(++locinput);
3551 break;
3552 case CLUMP:
b7c83a7e 3553 if (locinput >= PL_regeol)
a0ed51b3 3554 sayNO;
b7c83a7e 3555 if (do_utf8) {
1a4fad37 3556 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3557 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3558 sayNO;
3559 locinput += PL_utf8skip[nextchr];
3560 while (locinput < PL_regeol &&
3561 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3562 locinput += UTF8SKIP(locinput);
3563 if (locinput > PL_regeol)
3564 sayNO;
eb08e2da
JH
3565 }
3566 else
3567 locinput++;
a0ed51b3
LW
3568 nextchr = UCHARAT(locinput);
3569 break;
81714fb9
YO
3570
3571 case NREFFL:
3572 {
3573 char *s;
ff1157ca 3574 char type;
81714fb9
YO
3575 PL_reg_flags |= RF_tainted;
3576 /* FALL THROUGH */
3577 case NREF:
3578 case NREFF:
ff1157ca 3579 type = OP(scan);
0a4db386
YO
3580 n = reg_check_named_buff_matched(rex,scan);
3581
3582 if ( n ) {
3583 type = REF + ( type - NREF );
3584 goto do_ref;
3585 } else {
81714fb9 3586 sayNO;
0a4db386
YO
3587 }
3588 /* unreached */
c8756f30 3589 case REFFL:
3280af22 3590 PL_reg_flags |= RF_tainted;
c8756f30 3591 /* FALL THROUGH */
c277df42 3592 case REF:
81714fb9 3593 case REFF:
c277df42 3594 n = ARG(scan); /* which paren pair */
81714fb9
YO
3595 type = OP(scan);
3596 do_ref:
f0ab9afb 3597 ln = PL_regoffs[n].start;
2c2d71f5 3598 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 3599 if (*PL_reglastparen < n || ln == -1)
af3f8c16 3600 sayNO; /* Do not match unless seen CLOSEn. */
f0ab9afb 3601 if (ln == PL_regoffs[n].end)
a0d0e21e 3602 break;
a0ed51b3 3603
24d3c4a9 3604 s = PL_bostr + ln;
81714fb9 3605 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3606 char *l = locinput;
f0ab9afb 3607 const char *e = PL_bostr + PL_regoffs[n].end;
a0ed51b3
LW
3608 /*
3609 * Note that we can't do the "other character" lookup trick as
3610 * in the 8-bit case (no pun intended) because in Unicode we
3611 * have to map both upper and title case to lower case.
3612 */
81714fb9 3613 if (type == REFF) {
a0ed51b3 3614 while (s < e) {
a3b680e6
AL
3615 STRLEN ulen1, ulen2;
3616 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3617 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3618
a0ed51b3
LW
3619 if (l >= PL_regeol)
3620 sayNO;
a2a2844f
JH
3621 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3622 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3623 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3624 sayNO;
a2a2844f
JH
3625 s += ulen1;
3626 l += ulen2;
a0ed51b3
LW
3627 }
3628 }
3629 locinput = l;
3630 nextchr = UCHARAT(locinput);
3631 break;
3632 }
3633
a0d0e21e 3634 /* Inline the first character, for speed. */
76e3520e 3635 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3636 (type == REF ||
3637 (UCHARAT(s) != (type == REFF
3638 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3639 sayNO;
f0ab9afb 3640 ln = PL_regoffs[n].end - ln;
24d3c4a9 3641 if (locinput + ln > PL_regeol)
4633a7c4 3642 sayNO;
81714fb9 3643 if (ln > 1 && (type == REF
24d3c4a9 3644 ? memNE(s, locinput, ln)
81714fb9 3645 : (type == REFF
24d3c4a9
DM
3646 ? ibcmp(s, locinput, ln)
3647 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3648 sayNO;
24d3c4a9 3649 locinput += ln;
76e3520e 3650 nextchr = UCHARAT(locinput);
a0d0e21e 3651 break;
81714fb9 3652 }
a0d0e21e 3653 case NOTHING:
c277df42 3654 case TAIL:
a0d0e21e
LW
3655 break;
3656 case BACK:
3657 break;
40a82448
DM
3658
3659#undef ST
3660#define ST st->u.eval
c277df42 3661 {
c277df42 3662 SV *ret;
d2f13c59 3663 REGEXP *re_sv;
6bda09f9 3664 regexp *re;
f8fc2ecf 3665 regexp_internal *rei;
1a147d38
YO
3666 regnode *startpoint;
3667
3668 case GOSTART:
e7707071
YO
3669 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3670 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 3671 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 3672 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 3673 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
3674 Perl_croak(aTHX_
3675 "Pattern subroutine nesting without pos change"
3676 " exceeded limit in regex");
6bda09f9
YO
3677 } else {
3678 nochange_depth = 0;
1a147d38 3679 }
288b8c02 3680 re_sv = rex_sv;
6bda09f9 3681 re = rex;
f8fc2ecf 3682 rei = rexi;
288b8c02 3683 (void)ReREFCNT_inc(rex_sv);
1a147d38 3684 if (OP(scan)==GOSUB) {
6bda09f9
YO
3685 startpoint = scan + ARG2L(scan);
3686 ST.close_paren = ARG(scan);
3687 } else {
f8fc2ecf 3688 startpoint = rei->program+1;
6bda09f9
YO
3689 ST.close_paren = 0;
3690 }
3691 goto eval_recurse_doit;
3692 /* NOTREACHED */
3693 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3694 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 3695 if ( ++nochange_depth > max_nochange_depth )
1a147d38 3696 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
3697 } else {
3698 nochange_depth = 0;
3699 }
8e5e9ebe 3700 {
4aabdb9b
DM
3701 /* execute the code in the {...} */
3702 dSP;
6136c704 3703 SV ** const before = SP;
4aabdb9b
DM
3704 OP_4tree * const oop = PL_op;
3705 COP * const ocurcop = PL_curcop;
3706 PAD *old_comppad;
d80618d2 3707 char *saved_regeol = PL_regeol;
4aabdb9b
DM
3708
3709 n = ARG(scan);
f8fc2ecf 3710 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
3711 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3712 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f8fc2ecf 3713 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
f0ab9afb 3714 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 3715
2bf803e2
YO
3716 if (sv_yes_mark) {
3717 SV *sv_mrk = get_sv("REGMARK", 1);
3718 sv_setsv(sv_mrk, sv_yes_mark);
3719 }
3720
8e5e9ebe
RGS
3721 CALLRUNOPS(aTHX); /* Scalar context. */
3722 SPAGAIN;
3723 if (SP == before)
075aa684 3724 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3725 else {
3726 ret = POPs;
3727 PUTBACK;
3728 }
4aabdb9b
DM
3729
3730 PL_op = oop;
3731 PAD_RESTORE_LOCAL(old_comppad);
3732 PL_curcop = ocurcop;
d80618d2 3733 PL_regeol = saved_regeol;
24d3c4a9 3734 if (!logical) {
4aabdb9b
DM
3735 /* /(?{...})/ */
3736 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3737 break;
3738 }
8e5e9ebe 3739 }
24d3c4a9
DM
3740 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3741 logical = 0;
4aabdb9b 3742 {
4f639d21
DM
3743 /* extract RE object from returned value; compiling if
3744 * necessary */
6136c704 3745 MAGIC *mg = NULL;
288b8c02 3746 REGEXP *rx = NULL;
5c35adbb
NC
3747
3748 if (SvROK(ret)) {
288b8c02 3749 SV *const sv = SvRV(ret);
5c35adbb
NC
3750
3751 if (SvTYPE(sv) == SVt_REGEXP) {
d2f13c59 3752 rx = (REGEXP*) sv;
5c35adbb
NC
3753 } else if (SvSMAGICAL(sv)) {
3754 mg = mg_find(sv, PERL_MAGIC_qr);
3755 assert(mg);
3756 }
3757 } else if (SvTYPE(ret) == SVt_REGEXP) {
d2f13c59 3758 rx = (REGEXP*) ret;
5c35adbb 3759 } else if (SvSMAGICAL(ret)) {
124ee91a
NC
3760 if (SvGMAGICAL(ret)) {
3761 /* I don't believe that there is ever qr magic
3762 here. */
3763 assert(!mg_find(ret, PERL_MAGIC_qr));
faf82a0b 3764 sv_unmagic(ret, PERL_MAGIC_qr);
124ee91a
NC
3765 }
3766 else {
faf82a0b 3767 mg = mg_find(ret, PERL_MAGIC_qr);
124ee91a
NC
3768 /* testing suggests mg only ends up non-NULL for
3769 scalars who were upgraded and compiled in the
3770 else block below. In turn, this is only
3771 triggered in the "postponed utf8 string" tests
3772 in t/op/pat.t */
3773 }
0f5d15d6 3774 }
faf82a0b 3775
0f5d15d6 3776 if (mg) {
d2f13c59 3777 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
e2560c33 3778 assert(rx);
0f5d15d6 3779 }
288b8c02
NC
3780 if (rx) {
3781 rx = reg_temp_copy(rx);
3782 }
0f5d15d6 3783 else {
c737faaf 3784 U32 pm_flags = 0;
a3b680e6 3785 const I32 osize = PL_regsize;
0f5d15d6 3786
b9ad30b4
NC
3787 if (DO_UTF8(ret)) {
3788 assert (SvUTF8(ret));
3789 } else if (SvUTF8(ret)) {
3790 /* Not doing UTF-8, despite what the SV says. Is
3791 this only if we're trapped in use 'bytes'? */
3792 /* Make a copy of the octet sequence, but without
3793 the flag on, as the compiler now honours the
3794 SvUTF8 flag on ret. */
3795 STRLEN len;
3796 const char *const p = SvPV(ret, len);
3797 ret = newSVpvn_flags(p, len, SVs_TEMP);
3798 }
288b8c02 3799 rx = CALLREGCOMP(ret, pm_flags);
9041c2e3 3800 if (!(SvFLAGS(ret)
faf82a0b 3801 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 3802 | SVs_GMG))) {
a2794585
NC
3803 /* This isn't a first class regexp. Instead, it's
3804 caching a regexp onto an existing, Perl visible
3805 scalar. */
ad64d0ec 3806 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3ce3ed55 3807 }
0f5d15d6 3808 PL_regsize = osize;
0f5d15d6 3809 }
288b8c02
NC
3810 re_sv = rx;
3811 re = (struct regexp *)SvANY(rx);
4aabdb9b 3812 }
07bc277f 3813 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
3814 re->subbeg = rex->subbeg;
3815 re->sublen = rex->sublen;
f8fc2ecf 3816 rei = RXi_GET(re);
6bda09f9 3817 DEBUG_EXECUTE_r(
efd26800 3818 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
6bda09f9
YO
3819 "Matching embedded");
3820 );
f8fc2ecf 3821 startpoint = rei->program + 1;
1a147d38 3822 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
3823 /* borrowed from regtry */
3824 if (PL_reg_start_tmpl <= re->nparens) {
3825 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3826 if(PL_reg_start_tmp)
3827 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3828 else
3829 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 3830 }
aa283a38 3831
1a147d38 3832 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 3833 /* run the pattern returned from (??{...}) */
40a82448
DM
3834 ST.cp = regcppush(0); /* Save *all* the positions. */
3835 REGCP_SET(ST.lastcp);
6bda09f9 3836
f0ab9afb 3837 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
6bda09f9 3838
0357f1fd
ML
3839 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
3840 PL_reglastparen = &re->lastparen;
3841 PL_reglastcloseparen = &re->lastcloseparen;
3842 re->lastparen = 0;
3843 re->lastcloseparen = 0;
3844
4aabdb9b 3845 PL_reginput = locinput;
ae0beba1 3846 PL_regsize = 0;
4aabdb9b
DM
3847
3848 /* XXXX This is too dramatic a measure... */
3849 PL_reg_maxiter = 0;
3850
faec1544 3851 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 3852 if (RX_UTF8(re_sv))
faec1544
DM
3853 PL_reg_flags |= RF_utf8;
3854 else
3855 PL_reg_flags &= ~RF_utf8;
3856 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3857
288b8c02 3858 ST.prev_rex = rex_sv;
faec1544 3859 ST.prev_curlyx = cur_curlyx;
288b8c02
NC
3860 SETREX(rex_sv,re_sv);
3861 rex = re;
f8fc2ecf 3862 rexi = rei;
faec1544 3863 cur_curlyx = NULL;
40a82448 3864 ST.B = next;
faec1544
DM
3865 ST.prev_eval = cur_eval;
3866 cur_eval = st;
faec1544 3867 /* now continue from first node in postoned RE */
6bda09f9 3868 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3869 /* NOTREACHED */
a0ed51b3 3870 }
24d3c4a9
DM
3871 /* logical is 1, /(?(?{...})X|Y)/ */
3872 sw = (bool)SvTRUE(ret);
3873 logical = 0;
c277df42
IZ
3874 break;
3875 }
40a82448 3876
faec1544
DM
3877 case EVAL_AB: /* cleanup after a successful (??{A})B */
3878 /* note: this is called twice; first after popping B, then A */
3879 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
3880 ReREFCNT_dec(rex_sv);
3881 SETREX(rex_sv,ST.prev_rex);
3882 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3883 rexi = RXi_GET(rex);
faec1544
DM
3884 regcpblow(ST.cp);
3885 cur_eval = ST.prev_eval;
3886 cur_curlyx = ST.prev_curlyx;
34a81e2b
B
3887
3888 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
3889 PL_reglastparen = &rex->lastparen;
3890 PL_reglastcloseparen = &rex->lastcloseparen;
34a81e2b
B
3891 /* also update PL_regoffs */
3892 PL_regoffs = rex->offs;
0357f1fd 3893
40a82448
DM
3894 /* XXXX This is too dramatic a measure... */
3895 PL_reg_maxiter = 0;
e7707071 3896 if ( nochange_depth )
4b196cd4 3897 nochange_depth--;
262b90c4 3898 sayYES;
40a82448 3899
40a82448 3900
faec1544
DM
3901 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3902 /* note: this is called twice; first after popping B, then A */
3903 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
3904 ReREFCNT_dec(rex_sv);
3905 SETREX(rex_sv,ST.prev_rex);
3906 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3907 rexi = RXi_GET(rex);
34a81e2b 3908 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
3909 PL_reglastparen = &rex->lastparen;
3910 PL_reglastcloseparen = &rex->lastcloseparen;
3911
40a82448
DM
3912 PL_reginput = locinput;
3913 REGCP_UNWIND(ST.lastcp);
3914 regcppop(rex);
faec1544
DM
3915 cur_eval = ST.prev_eval;
3916 cur_curlyx = ST.prev_curlyx;
3917 /* XXXX This is too dramatic a measure... */
3918 PL_reg_maxiter = 0;
e7707071 3919 if ( nochange_depth )
4b196cd4 3920 nochange_depth--;
40a82448 3921 sayNO_SILENT;
40a82448
DM
3922#undef ST
3923
a0d0e21e 3924 case OPEN:
c277df42 3925 n = ARG(scan); /* which paren pair */
3280af22
NIS
3926 PL_reg_start_tmp[n] = locinput;
3927 if (n > PL_regsize)
3928 PL_regsize = n;
e2e6a0f1 3929 lastopen = n;
a0d0e21e
LW
3930 break;
3931 case CLOSE:
c277df42 3932 n = ARG(scan); /* which paren pair */
f0ab9afb
NC
3933 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3934 PL_regoffs[n].end = locinput - PL_bostr;
7f69552c
YO
3935 /*if (n > PL_regsize)
3936 PL_regsize = n;*/
3b6647e0 3937 if (n > *PL_reglastparen)
3280af22 3938 *PL_reglastparen = n;
a01268b5 3939 *PL_reglastcloseparen = n;
3b6647e0 3940 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
3941 goto fake_end;
3942 }
a0d0e21e 3943 break;
e2e6a0f1
YO
3944 case ACCEPT:
3945 if (ARG(scan)){
3946 regnode *cursor;
3947 for (cursor=scan;
3948 cursor && OP(cursor)!=END;
3949 cursor=regnext(cursor))
3950 {
3951 if ( OP(cursor)==CLOSE ){
3952 n = ARG(cursor);
3953 if ( n <= lastopen ) {
f0ab9afb
NC
3954 PL_regoffs[n].start
3955 = PL_reg_start_tmp[n] - PL_bostr;
3956 PL_regoffs[n].end = locinput - PL_bostr;
e2e6a0f1
YO
3957 /*if (n > PL_regsize)
3958 PL_regsize = n;*/
3b6647e0 3959 if (n > *PL_reglastparen)
e2e6a0f1
YO
3960 *PL_reglastparen = n;
3961 *PL_reglastcloseparen = n;
3b6647e0
RB
3962 if ( n == ARG(scan) || (cur_eval &&
3963 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
3964 break;
3965 }
3966 }
3967 }
3968 }
3969 goto fake_end;
3970 /*NOTREACHED*/
c277df42
IZ
3971 case GROUPP:
3972 n = ARG(scan); /* which paren pair */
f0ab9afb 3973 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
c277df42 3974 break;
0a4db386
YO
3975 case NGROUPP:
3976 /* reg_check_named_buff_matched returns 0 for no match */
3977 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3978 break;
1a147d38 3979 case INSUBP:
0a4db386 3980 n = ARG(scan);
3b6647e0 3981 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
3982 break;
3983 case DEFINEP:
3984 sw = 0;
3985 break;
c277df42 3986 case IFTHEN:
2c2d71f5 3987 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3988 if (sw)
c277df42
IZ
3989 next = NEXTOPER(NEXTOPER(scan));
3990 else {
3991 next = scan + ARG(scan);
3992 if (OP(next) == IFTHEN) /* Fake one. */
3993 next = NEXTOPER(NEXTOPER(next));
3994 }
3995 break;
3996 case LOGICAL:
24d3c4a9 3997 logical = scan->flags;
c277df42 3998 break;
c476f425 3999
2ab05381 4000/*******************************************************************
2ab05381 4001
c476f425
DM
4002The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4003pattern, where A and B are subpatterns. (For simple A, CURLYM or
4004STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 4005
c476f425 4006A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4007
c476f425
DM
4008On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4009state, which contains the current count, initialised to -1. It also sets
4010cur_curlyx to point to this state, with any previous value saved in the
4011state block.
2ab05381 4012
c476f425
DM
4013CURLYX then jumps straight to the WHILEM op, rather than executing A,
4014since the pattern may possibly match zero times (i.e. it's a while {} loop
4015rather than a do {} while loop).
2ab05381 4016
c476f425
DM
4017Each entry to WHILEM represents a successful match of A. The count in the
4018CURLYX block is incremented, another WHILEM state is pushed, and execution
4019passes to A or B depending on greediness and the current count.
2ab05381 4020
c476f425
DM
4021For example, if matching against the string a1a2a3b (where the aN are
4022substrings that match /A/), then the match progresses as follows: (the
4023pushed states are interspersed with the bits of strings matched so far):
2ab05381 4024
c476f425
DM
4025 <CURLYX cnt=-1>
4026 <CURLYX cnt=0><WHILEM>
4027 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4028 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4029 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4030 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4031
c476f425
DM
4032(Contrast this with something like CURLYM, which maintains only a single
4033backtrack state:
2ab05381 4034
c476f425
DM
4035 <CURLYM cnt=0> a1
4036 a1 <CURLYM cnt=1> a2
4037 a1 a2 <CURLYM cnt=2> a3
4038 a1 a2 a3 <CURLYM cnt=3> b
4039)
2ab05381 4040
c476f425
DM
4041Each WHILEM state block marks a point to backtrack to upon partial failure
4042of A or B, and also contains some minor state data related to that
4043iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4044overall state, such as the count, and pointers to the A and B ops.
2ab05381 4045
c476f425
DM
4046This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4047must always point to the *current* CURLYX block, the rules are:
2ab05381 4048
c476f425
DM
4049When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4050and set cur_curlyx to point the new block.
2ab05381 4051
c476f425
DM
4052When popping the CURLYX block after a successful or unsuccessful match,
4053restore the previous cur_curlyx.
2ab05381 4054
c476f425
DM
4055When WHILEM is about to execute B, save the current cur_curlyx, and set it
4056to the outer one saved in the CURLYX block.
2ab05381 4057
c476f425
DM
4058When popping the WHILEM block after a successful or unsuccessful B match,
4059restore the previous cur_curlyx.
2ab05381 4060
c476f425
DM
4061Here's an example for the pattern (AI* BI)*BO
4062I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4063
c476f425
DM
4064cur_
4065curlyx backtrack stack
4066------ ---------------
4067NULL
4068CO <CO prev=NULL> <WO>
4069CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4070CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4071NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4072
c476f425
DM
4073At this point the pattern succeeds, and we work back down the stack to
4074clean up, restoring as we go:
95b24440 4075
c476f425
DM
4076CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4077CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4078CO <CO prev=NULL> <WO>
4079NULL
a0374537 4080
c476f425
DM
4081*******************************************************************/
4082
4083#define ST st->u.curlyx
4084
4085 case CURLYX: /* start of /A*B/ (for complex A) */
4086 {
4087 /* No need to save/restore up to this paren */
4088 I32 parenfloor = scan->flags;
4089
4090 assert(next); /* keep Coverity happy */
4091 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4092 next += ARG(next);
4093
4094 /* XXXX Probably it is better to teach regpush to support
4095 parenfloor > PL_regsize... */
4096 if (parenfloor > (I32)*PL_reglastparen)
4097 parenfloor = *PL_reglastparen; /* Pessimization... */
4098
4099 ST.prev_curlyx= cur_curlyx;
4100 cur_curlyx = st;
4101 ST.cp = PL_savestack_ix;
4102
4103 /* these fields contain the state of the current curly.
4104 * they are accessed by subsequent WHILEMs */
4105 ST.parenfloor = parenfloor;
4106 ST.min = ARG1(scan);
4107 ST.max = ARG2(scan);
4108 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4109 ST.B = next;
24d3c4a9
DM
4110 ST.minmod = minmod;
4111 minmod = 0;
c476f425
DM
4112 ST.count = -1; /* this will be updated by WHILEM */
4113 ST.lastloc = NULL; /* this will be updated by WHILEM */
4114
4115 PL_reginput = locinput;
4116 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 4117 /* NOTREACHED */
c476f425 4118 }
a0d0e21e 4119
c476f425 4120 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4121 cur_curlyx = ST.prev_curlyx;
4122 sayYES;
4123 /* NOTREACHED */
a0d0e21e 4124
c476f425
DM
4125 case CURLYX_end_fail: /* just failed to match all of A*B */
4126 regcpblow(ST.cp);
4127 cur_curlyx = ST.prev_curlyx;
4128 sayNO;
4129 /* NOTREACHED */
4633a7c4 4130
a0d0e21e 4131
c476f425
DM
4132#undef ST
4133#define ST st->u.whilem
4134
4135 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4136 {
4137 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
4138 I32 n;
4139 assert(cur_curlyx); /* keep Coverity happy */
4140 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4141 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4142 ST.cache_offset = 0;
4143 ST.cache_mask = 0;
4144
4145 PL_reginput = locinput;
4146
4147 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4148 "%*s whilem: matched %ld out of %ld..%ld\n",
4149 REPORT_CODE_OFF+depth*2, "", (long)n,
4150 (long)cur_curlyx->u.curlyx.min,
4151 (long)cur_curlyx->u.curlyx.max)
4152 );
a0d0e21e 4153
c476f425 4154 /* First just match a string of min A's. */
a0d0e21e 4155
c476f425
DM
4156 if (n < cur_curlyx->u.curlyx.min) {
4157 cur_curlyx->u.curlyx.lastloc = locinput;
4158 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4159 /* NOTREACHED */
4160 }
4161
4162 /* If degenerate A matches "", assume A done. */
4163
4164 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4165 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4166 "%*s whilem: empty match detected, trying continuation...\n",
4167 REPORT_CODE_OFF+depth*2, "")
4168 );
4169 goto do_whilem_B_max;
4170 }
4171
4172 /* super-linear cache processing */
4173
4174 if (scan->flags) {
a0d0e21e 4175
2c2d71f5 4176 if (!PL_reg_maxiter) {
c476f425
DM
4177 /* start the countdown: Postpone detection until we
4178 * know the match is not *that* much linear. */
2c2d71f5 4179 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4180 /* possible overflow for long strings and many CURLYX's */
4181 if (PL_reg_maxiter < 0)
4182 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4183 PL_reg_leftiter = PL_reg_maxiter;
4184 }
c476f425 4185
2c2d71f5 4186 if (PL_reg_leftiter-- == 0) {
c476f425 4187 /* initialise cache */
3298f257 4188 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4189 if (PL_reg_poscache) {
eb160463 4190 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4191 Renew(PL_reg_poscache, size, char);
4192 PL_reg_poscache_size = size;
4193 }
4194 Zero(PL_reg_poscache, size, char);
4195 }
4196 else {
4197 PL_reg_poscache_size = size;
a02a5408 4198 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4199 }
c476f425
DM
4200 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4201 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4202 PL_colors[4], PL_colors[5])
4203 );
2c2d71f5 4204 }
c476f425 4205
2c2d71f5 4206 if (PL_reg_leftiter < 0) {
c476f425
DM
4207 /* have we already failed at this position? */
4208 I32 offset, mask;
4209 offset = (scan->flags & 0xf) - 1
4210 + (locinput - PL_bostr) * (scan->flags>>4);
4211 mask = 1 << (offset % 8);
4212 offset /= 8;
4213 if (PL_reg_poscache[offset] & mask) {
4214 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4215 "%*s whilem: (cache) already tried at this position...\n",
4216 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4217 );
3298f257 4218 sayNO; /* cache records failure */
2c2d71f5 4219 }
c476f425
DM
4220 ST.cache_offset = offset;
4221 ST.cache_mask = mask;
2c2d71f5 4222 }
c476f425 4223 }
2c2d71f5 4224
c476f425 4225 /* Prefer B over A for minimal matching. */
a687059c 4226
c476f425
DM
4227 if (cur_curlyx->u.curlyx.minmod) {
4228 ST.save_curlyx = cur_curlyx;
4229 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4230 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4231 REGCP_SET(ST.lastcp);
4232 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4233 /* NOTREACHED */
4234 }
a0d0e21e 4235
c476f425
DM
4236 /* Prefer A over B for maximal matching. */
4237
4238 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4239 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4240 cur_curlyx->u.curlyx.lastloc = locinput;
4241 REGCP_SET(ST.lastcp);
4242 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4243 /* NOTREACHED */
4244 }
4245 goto do_whilem_B_max;
4246 }
4247 /* NOTREACHED */
4248
4249 case WHILEM_B_min: /* just matched B in a minimal match */
4250 case WHILEM_B_max: /* just matched B in a maximal match */
4251 cur_curlyx = ST.save_curlyx;
4252 sayYES;
4253 /* NOTREACHED */
4254
4255 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4256 cur_curlyx = ST.save_curlyx;
4257 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4258 cur_curlyx->u.curlyx.count--;
4259 CACHEsayNO;
4260 /* NOTREACHED */
4261
4262 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4263 REGCP_UNWIND(ST.lastcp);
4264 regcppop(rex);
4265 /* FALL THROUGH */
4266 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4267 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4268 cur_curlyx->u.curlyx.count--;
4269 CACHEsayNO;
4270 /* NOTREACHED */
4271
4272 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4273 REGCP_UNWIND(ST.lastcp);
4274 regcppop(rex); /* Restore some previous $<digit>s? */
4275 PL_reginput = locinput;
4276 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4277 "%*s whilem: failed, trying continuation...\n",
4278 REPORT_CODE_OFF+depth*2, "")
4279 );
4280 do_whilem_B_max:
4281 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4282 && ckWARN(WARN_REGEXP)
4283 && !(PL_reg_flags & RF_warned))
4284 {
4285 PL_reg_flags |= RF_warned;
4286 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4287 "Complex regular subexpression recursion",
4288 REG_INFTY - 1);
4289 }
4290
4291 /* now try B */
4292 ST.save_curlyx = cur_curlyx;
4293 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4294 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4295 /* NOTREACHED */
4296
4297 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4298 cur_curlyx = ST.save_curlyx;
4299 REGCP_UNWIND(ST.lastcp);
4300 regcppop(rex);
4301
4302 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4303 /* Maximum greed exceeded */
4304 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4305 && ckWARN(WARN_REGEXP)
4306 && !(PL_reg_flags & RF_warned))
4307 {
3280af22 4308 PL_reg_flags |= RF_warned;
c476f425
DM
4309 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4310 "%s limit (%d) exceeded",
4311 "Complex regular subexpression recursion",
4312 REG_INFTY - 1);
a0d0e21e 4313 }
c476f425 4314 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4315 CACHEsayNO;
a0d0e21e 4316 }
c476f425
DM
4317
4318 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4319 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4320 );
4321 /* Try grabbing another A and see if it helps. */
4322 PL_reginput = locinput;
4323 cur_curlyx->u.curlyx.lastloc = locinput;
4324 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4325 REGCP_SET(ST.lastcp);
4326 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 4327 /* NOTREACHED */
40a82448
DM
4328
4329#undef ST
4330#define ST st->u.branch
4331
4332 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4333 next = scan + ARG(scan);
4334 if (next == scan)
4335 next = NULL;
40a82448
DM
4336 scan = NEXTOPER(scan);
4337 /* FALL THROUGH */
c277df42 4338
40a82448
DM
4339 case BRANCH: /* /(...|A|...)/ */
4340 scan = NEXTOPER(scan); /* scan now points to inner node */
40a82448
DM
4341 ST.lastparen = *PL_reglastparen;
4342 ST.next_branch = next;
4343 REGCP_SET(ST.cp);
4344 PL_reginput = locinput;
02db2b7b 4345
40a82448 4346 /* Now go into the branch */
5d458dd8
YO
4347 if (has_cutgroup) {
4348 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4349 } else {
4350 PUSH_STATE_GOTO(BRANCH_next, scan);
4351 }
40a82448 4352 /* NOTREACHED */
5d458dd8
YO
4353 case CUTGROUP:
4354 PL_reginput = locinput;
4355 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 4356 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
4357 PUSH_STATE_GOTO(CUTGROUP_next,next);
4358 /* NOTREACHED */
4359 case CUTGROUP_next_fail:
4360 do_cutgroup = 1;
4361 no_final = 1;
4362 if (st->u.mark.mark_name)
4363 sv_commit = st->u.mark.mark_name;
4364 sayNO;
4365 /* NOTREACHED */
4366 case BRANCH_next:
4367 sayYES;
4368 /* NOTREACHED */
40a82448 4369 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4370 if (do_cutgroup) {
4371 do_cutgroup = 0;
4372 no_final = 0;
4373 }
40a82448
DM
4374 REGCP_UNWIND(ST.cp);
4375 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 4376 PL_regoffs[n].end = -1;
40a82448 4377 *PL_reglastparen = n;
0a4db386 4378 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4379 scan = ST.next_branch;
4380 /* no more branches? */
5d458dd8
YO
4381 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4382 DEBUG_EXECUTE_r({
4383 PerlIO_printf( Perl_debug_log,
4384 "%*s %sBRANCH failed...%s\n",
4385 REPORT_CODE_OFF+depth*2, "",
4386 PL_colors[4],
4387 PL_colors[5] );
4388 });
4389 sayNO_SILENT;
4390 }
40a82448
DM
4391 continue; /* execute next BRANCH[J] op */
4392 /* NOTREACHED */
4393
a0d0e21e 4394 case MINMOD:
24d3c4a9 4395 minmod = 1;
a0d0e21e 4396 break;
40a82448
DM
4397
4398#undef ST
4399#define ST st->u.curlym
4400
4401 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4402
4403 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 4404 * only a single backtracking state, no matter how many matches
40a82448
DM
4405 * there are in {m,n}. It relies on the pattern being constant
4406 * length, with no parens to influence future backrefs
4407 */
4408
4409 ST.me = scan;
dc45a647 4410 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4411
4412 /* if paren positive, emulate an OPEN/CLOSE around A */
4413 if (ST.me->flags) {
3b6647e0 4414 U32 paren = ST.me->flags;
40a82448
DM
4415 if (paren > PL_regsize)
4416 PL_regsize = paren;
3b6647e0 4417 if (paren > *PL_reglastparen)
40a82448 4418 *PL_reglastparen = paren;
c277df42 4419 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4420 }
40a82448
DM
4421 ST.A = scan;
4422 ST.B = next;
4423 ST.alen = 0;
4424 ST.count = 0;
24d3c4a9
DM
4425 ST.minmod = minmod;
4426 minmod = 0;
40a82448
DM
4427 ST.c1 = CHRTEST_UNINIT;
4428 REGCP_SET(ST.cp);
6407bf3b 4429
40a82448
DM
4430 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4431 goto curlym_do_B;
4432
4433 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4434 PL_reginput = locinput;
40a82448
DM
4435 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4436 /* NOTREACHED */
5f80c4cf 4437
40a82448
DM
4438 case CURLYM_A: /* we've just matched an A */
4439 locinput = st->locinput;
4440 nextchr = UCHARAT(locinput);
4441
4442 ST.count++;
4443 /* after first match, determine A's length: u.curlym.alen */
4444 if (ST.count == 1) {
4445 if (PL_reg_match_utf8) {
4446 char *s = locinput;
4447 while (s < PL_reginput) {
4448 ST.alen++;
4449 s += UTF8SKIP(s);
4450 }
4451 }
4452 else {
4453 ST.alen = PL_reginput - locinput;
4454 }
4455 if (ST.alen == 0)
4456 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4457 }
0cadcf80
DM
4458 DEBUG_EXECUTE_r(
4459 PerlIO_printf(Perl_debug_log,
40a82448 4460 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4461 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4462 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4463 );
4464
40a82448 4465 locinput = PL_reginput;
0a4db386
YO
4466
4467 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4468 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4469 goto fake_end;
4470
c966426a
DM
4471 {
4472 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4473 if ( max == REG_INFTY || ST.count < max )
4474 goto curlym_do_A; /* try to match another A */
4475 }
40a82448 4476 goto curlym_do_B; /* try to match B */
5f80c4cf 4477
40a82448
DM
4478 case CURLYM_A_fail: /* just failed to match an A */
4479 REGCP_UNWIND(ST.cp);
0a4db386
YO
4480
4481 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4482 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4483 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 4484 sayNO;
0cadcf80 4485
40a82448
DM
4486 curlym_do_B: /* execute the B in /A{m,n}B/ */
4487 PL_reginput = locinput;
4488 if (ST.c1 == CHRTEST_UNINIT) {
4489 /* calculate c1 and c2 for possible match of 1st char
4490 * following curly */
4491 ST.c1 = ST.c2 = CHRTEST_VOID;
4492 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4493 regnode *text_node = ST.B;
4494 if (! HAS_TEXT(text_node))
4495 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
4496 /* this used to be
4497
4498 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4499
4500 But the former is redundant in light of the latter.
4501
4502 if this changes back then the macro for
4503 IS_TEXT and friends need to change.
4504 */
4505 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 4506 {
ee9b8eae 4507
40a82448
DM
4508 ST.c1 = (U8)*STRING(text_node);
4509 ST.c2 =
ee9b8eae 4510 (IS_TEXTF(text_node))
40a82448 4511 ? PL_fold[ST.c1]
ee9b8eae 4512 : (IS_TEXTFL(text_node))
40a82448
DM
4513 ? PL_fold_locale[ST.c1]
4514 : ST.c1;
c277df42 4515 }
c277df42 4516 }
40a82448
DM
4517 }
4518
4519 DEBUG_EXECUTE_r(
4520 PerlIO_printf(Perl_debug_log,
4521 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4522 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4523 "", (IV)ST.count)
4524 );
4525 if (ST.c1 != CHRTEST_VOID
4526 && UCHARAT(PL_reginput) != ST.c1
4527 && UCHARAT(PL_reginput) != ST.c2)
4528 {
4529 /* simulate B failing */
3e901dc0
YO
4530 DEBUG_OPTIMISE_r(
4531 PerlIO_printf(Perl_debug_log,
4532 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4533 (int)(REPORT_CODE_OFF+(depth*2)),"",
4534 (IV)ST.c1,(IV)ST.c2
4535 ));
40a82448
DM
4536 state_num = CURLYM_B_fail;
4537 goto reenter_switch;
4538 }
4539
4540 if (ST.me->flags) {
4541 /* mark current A as captured */
4542 I32 paren = ST.me->flags;
4543 if (ST.count) {
f0ab9afb 4544 PL_regoffs[paren].start
40a82448 4545 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
f0ab9afb 4546 PL_regoffs[paren].end = PL_reginput - PL_bostr;
0a4db386 4547 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4548 }
40a82448 4549 else
f0ab9afb 4550 PL_regoffs[paren].end = -1;
0a4db386 4551 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4552 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4553 {
4554 if (ST.count)
4555 goto fake_end;
4556 else
4557 sayNO;
4558 }
c277df42 4559 }
0a4db386 4560
40a82448 4561 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4562 /* NOTREACHED */
40a82448
DM
4563
4564 case CURLYM_B_fail: /* just failed to match a B */
4565 REGCP_UNWIND(ST.cp);
4566 if (ST.minmod) {
84d2fa14
HS
4567 I32 max = ARG2(ST.me);
4568 if (max != REG_INFTY && ST.count == max)
40a82448
DM
4569 sayNO;
4570 goto curlym_do_A; /* try to match a further A */
4571 }
4572 /* backtrack one A */
4573 if (ST.count == ARG1(ST.me) /* min */)
4574 sayNO;
4575 ST.count--;
4576 locinput = HOPc(locinput, -ST.alen);
4577 goto curlym_do_B; /* try to match B */
4578
c255a977
DM
4579#undef ST
4580#define ST st->u.curly
40a82448 4581
c255a977
DM
4582#define CURLY_SETPAREN(paren, success) \
4583 if (paren) { \
4584 if (success) { \
f0ab9afb
NC
4585 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4586 PL_regoffs[paren].end = locinput - PL_bostr; \
0a4db386 4587 *PL_reglastcloseparen = paren; \
c255a977
DM
4588 } \
4589 else \
f0ab9afb 4590 PL_regoffs[paren].end = -1; \
c255a977
DM
4591 }
4592
4593 case STAR: /* /A*B/ where A is width 1 */
4594 ST.paren = 0;
4595 ST.min = 0;
4596 ST.max = REG_INFTY;
a0d0e21e
LW
4597 scan = NEXTOPER(scan);
4598 goto repeat;
c255a977
DM
4599 case PLUS: /* /A+B/ where A is width 1 */
4600 ST.paren = 0;
4601 ST.min = 1;
4602 ST.max = REG_INFTY;
c277df42 4603 scan = NEXTOPER(scan);
c255a977
DM
4604 goto repeat;
4605 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4606 ST.paren = scan->flags; /* Which paren to set */
4607 if (ST.paren > PL_regsize)
4608 PL_regsize = ST.paren;
3b6647e0 4609 if (ST.paren > *PL_reglastparen)
c255a977
DM
4610 *PL_reglastparen = ST.paren;
4611 ST.min = ARG1(scan); /* min to match */
4612 ST.max = ARG2(scan); /* max to match */
0a4db386 4613 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4614 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4615 ST.min=1;
4616 ST.max=1;
4617 }
c255a977
DM
4618 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4619 goto repeat;
4620 case CURLY: /* /A{m,n}B/ where A is width 1 */
4621 ST.paren = 0;
4622 ST.min = ARG1(scan); /* min to match */
4623 ST.max = ARG2(scan); /* max to match */
4624 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4625 repeat:
a0d0e21e
LW
4626 /*
4627 * Lookahead to avoid useless match attempts
4628 * when we know what character comes next.
c255a977 4629 *
5f80c4cf
JP
4630 * Used to only do .*x and .*?x, but now it allows
4631 * for )'s, ('s and (?{ ... })'s to be in the way
4632 * of the quantifier and the EXACT-like node. -- japhy
4633 */
4634
c255a977
DM
4635 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4636 sayNO;
cca55fe3 4637 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4638 U8 *s;
4639 regnode *text_node = next;
4640
3dab1dad
YO
4641 if (! HAS_TEXT(text_node))
4642 FIND_NEXT_IMPT(text_node);
5f80c4cf 4643
9e137952 4644 if (! HAS_TEXT(text_node))
c255a977 4645 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4646 else {
ee9b8eae 4647 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 4648 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4649 goto assume_ok_easy;
cca55fe3 4650 }
be8e71aa
YO
4651 else
4652 s = (U8*)STRING(text_node);
ee9b8eae
YO
4653
4654 /* Currently we only get here when
4655
4656 PL_rekind[OP(text_node)] == EXACT
4657
4658 if this changes back then the macro for IS_TEXT and
4659 friends need to change. */
5f80c4cf 4660 if (!UTF) {
c255a977 4661 ST.c2 = ST.c1 = *s;
ee9b8eae 4662 if (IS_TEXTF(text_node))
c255a977 4663 ST.c2 = PL_fold[ST.c1];
ee9b8eae 4664 else if (IS_TEXTFL(text_node))
c255a977 4665 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4666 }
5f80c4cf 4667 else { /* UTF */
ee9b8eae 4668 if (IS_TEXTF(text_node)) {
a2a2844f 4669 STRLEN ulen1, ulen2;
89ebb4a3
JH
4670 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4671 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4672
4673 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4674 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4675#ifdef EBCDIC
4676 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4677 ckWARN(WARN_UTF8) ?
4678 0 : UTF8_ALLOW_ANY);
4679 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4680 ckWARN(WARN_UTF8) ?
4681 0 : UTF8_ALLOW_ANY);
4682#else
c255a977 4683 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4684 uniflags);
c255a977 4685 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4686 uniflags);
4687#endif
5f80c4cf
JP
4688 }
4689 else {
c255a977 4690 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4691 uniflags);
5f80c4cf 4692 }
1aa99e6b
IH
4693 }
4694 }
bbce6d69 4695 }
a0d0e21e 4696 else
c255a977 4697 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4698 assume_ok_easy:
c255a977
DM
4699
4700 ST.A = scan;
4701 ST.B = next;
3280af22 4702 PL_reginput = locinput;
24d3c4a9
DM
4703 if (minmod) {
4704 minmod = 0;
e2e6a0f1 4705 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 4706 sayNO;
c255a977 4707 ST.count = ST.min;
a0ed51b3 4708 locinput = PL_reginput;
c255a977
DM
4709 REGCP_SET(ST.cp);
4710 if (ST.c1 == CHRTEST_VOID)
4711 goto curly_try_B_min;
4712
4713 ST.oldloc = locinput;
4714
4715 /* set ST.maxpos to the furthest point along the
4716 * string that could possibly match */
4717 if (ST.max == REG_INFTY) {
4718 ST.maxpos = PL_regeol - 1;
4719 if (do_utf8)
4720 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4721 ST.maxpos--;
4722 }
4723 else if (do_utf8) {
4724 int m = ST.max - ST.min;
4725 for (ST.maxpos = locinput;
4726 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4727 ST.maxpos += UTF8SKIP(ST.maxpos);
4728 }
4729 else {
4730 ST.maxpos = locinput + ST.max - ST.min;
4731 if (ST.maxpos >= PL_regeol)
4732 ST.maxpos = PL_regeol - 1;
4733 }
4734 goto curly_try_B_min_known;
4735
4736 }
4737 else {
e2e6a0f1 4738 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
4739 locinput = PL_reginput;
4740 if (ST.count < ST.min)
4741 sayNO;
4742 if ((ST.count > ST.min)
4743 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4744 {
4745 /* A{m,n} must come at the end of the string, there's
4746 * no point in backing off ... */
4747 ST.min = ST.count;
4748 /* ...except that $ and \Z can match before *and* after
4749 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4750 We may back off by one in this case. */
4751 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4752 ST.min--;
4753 }
4754 REGCP_SET(ST.cp);
4755 goto curly_try_B_max;
4756 }
4757 /* NOTREACHED */
4758
4759
4760 case CURLY_B_min_known_fail:
4761 /* failed to find B in a non-greedy match where c1,c2 valid */
4762 if (ST.paren && ST.count)
f0ab9afb 4763 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4764
4765 PL_reginput = locinput; /* Could be reset... */
4766 REGCP_UNWIND(ST.cp);
4767 /* Couldn't or didn't -- move forward. */
4768 ST.oldloc = locinput;
4769 if (do_utf8)
4770 locinput += UTF8SKIP(locinput);
4771 else
4772 locinput++;
4773 ST.count++;
4774 curly_try_B_min_known:
4775 /* find the next place where 'B' could work, then call B */
4776 {
4777 int n;
4778 if (do_utf8) {
4779 n = (ST.oldloc == locinput) ? 0 : 1;
4780 if (ST.c1 == ST.c2) {
4781 STRLEN len;
4782 /* set n to utf8_distance(oldloc, locinput) */
4783 while (locinput <= ST.maxpos &&
4784 utf8n_to_uvchr((U8*)locinput,
4785 UTF8_MAXBYTES, &len,
4786 uniflags) != (UV)ST.c1) {
4787 locinput += len;
4788 n++;
4789 }
1aa99e6b
IH
4790 }
4791 else {
c255a977
DM
4792 /* set n to utf8_distance(oldloc, locinput) */
4793 while (locinput <= ST.maxpos) {
4794 STRLEN len;
4795 const UV c = utf8n_to_uvchr((U8*)locinput,
4796 UTF8_MAXBYTES, &len,
4797 uniflags);
4798 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4799 break;
4800 locinput += len;
4801 n++;
1aa99e6b 4802 }
0fe9bf95
IZ
4803 }
4804 }
c255a977
DM
4805 else {
4806 if (ST.c1 == ST.c2) {
4807 while (locinput <= ST.maxpos &&
4808 UCHARAT(locinput) != ST.c1)
4809 locinput++;
bbce6d69 4810 }
c255a977
DM
4811 else {
4812 while (locinput <= ST.maxpos
4813 && UCHARAT(locinput) != ST.c1
4814 && UCHARAT(locinput) != ST.c2)
4815 locinput++;
a0ed51b3 4816 }
c255a977
DM
4817 n = locinput - ST.oldloc;
4818 }
4819 if (locinput > ST.maxpos)
4820 sayNO;
4821 /* PL_reginput == oldloc now */
4822 if (n) {
4823 ST.count += n;
e2e6a0f1 4824 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 4825 sayNO;
a0d0e21e 4826 }
c255a977
DM
4827 PL_reginput = locinput;
4828 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4829 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4830 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4831 goto fake_end;
4832 }
c255a977 4833 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4834 }
c255a977
DM
4835 /* NOTREACHED */
4836
4837
4838 case CURLY_B_min_fail:
4839 /* failed to find B in a non-greedy match where c1,c2 invalid */
4840 if (ST.paren && ST.count)
f0ab9afb 4841 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4842
4843 REGCP_UNWIND(ST.cp);
4844 /* failed -- move forward one */
4845 PL_reginput = locinput;
e2e6a0f1 4846 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 4847 ST.count++;
a0ed51b3 4848 locinput = PL_reginput;
c255a977
DM
4849 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4850 ST.count > 0)) /* count overflow ? */
15272685 4851 {
c255a977
DM
4852 curly_try_B_min:
4853 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4854 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4855 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4856 goto fake_end;
4857 }
c255a977 4858 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4859 }
4860 }
4633a7c4 4861 sayNO;
c255a977
DM
4862 /* NOTREACHED */
4863
4864
4865 curly_try_B_max:
4866 /* a successful greedy match: now try to match B */
40d049e4 4867 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4868 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
4869 goto fake_end;
4870 }
c255a977
DM
4871 {
4872 UV c = 0;
4873 if (ST.c1 != CHRTEST_VOID)
4874 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4875 UTF8_MAXBYTES, 0, uniflags)
466787eb 4876 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4877 /* If it could work, try it. */
4878 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4879 CURLY_SETPAREN(ST.paren, ST.count);
4880 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4881 /* NOTREACHED */
4882 }
4883 }
4884 /* FALL THROUGH */
4885 case CURLY_B_max_fail:
4886 /* failed to find B in a greedy match */
4887 if (ST.paren && ST.count)
f0ab9afb 4888 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4889
4890 REGCP_UNWIND(ST.cp);
4891 /* back up. */
4892 if (--ST.count < ST.min)
4893 sayNO;
4894 PL_reginput = locinput = HOPc(locinput, -1);
4895 goto curly_try_B_max;
4896
4897#undef ST
4898
a0d0e21e 4899 case END:
6bda09f9 4900 fake_end:
faec1544
DM
4901 if (cur_eval) {
4902 /* we've just finished A in /(??{A})B/; now continue with B */
4903 I32 tmpix;
faec1544
DM
4904 st->u.eval.toggle_reg_flags
4905 = cur_eval->u.eval.toggle_reg_flags;
4906 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4907
288b8c02
NC
4908 st->u.eval.prev_rex = rex_sv; /* inner */
4909 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
4910 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4911 rexi = RXi_GET(rex);
faec1544 4912 cur_curlyx = cur_eval->u.eval.prev_curlyx;
288b8c02 4913 ReREFCNT_inc(rex_sv);
faec1544 4914 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
34a81e2b
B
4915
4916 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4917 PL_reglastparen = &rex->lastparen;
4918 PL_reglastcloseparen = &rex->lastcloseparen;
4919
faec1544
DM
4920 REGCP_SET(st->u.eval.lastcp);
4921 PL_reginput = locinput;
4922
4923 /* Restore parens of the outer rex without popping the
4924 * savestack */
4925 tmpix = PL_savestack_ix;
4926 PL_savestack_ix = cur_eval->u.eval.lastcp;
4927 regcppop(rex);
4928 PL_savestack_ix = tmpix;
4929
4930 st->u.eval.prev_eval = cur_eval;
4931 cur_eval = cur_eval->u.eval.prev_eval;
4932 DEBUG_EXECUTE_r(
2a49f0f5
JH
4933 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4934 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
4935 if ( nochange_depth )
4936 nochange_depth--;
4937
4938 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
4939 st->u.eval.prev_eval->u.eval.B); /* match B */
4940 }
4941
3b0527fe 4942 if (locinput < reginfo->till) {
a3621e74 4943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4944 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4945 PL_colors[4],
4946 (long)(locinput - PL_reg_starttry),
3b0527fe 4947 (long)(reginfo->till - PL_reg_starttry),
7821416a 4948 PL_colors[5]));
58e23c8d 4949
262b90c4 4950 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4951 }
4952 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4953 sayYES; /* Success! */
dad79028
DM
4954
4955 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4956 DEBUG_EXECUTE_r(
4957 PerlIO_printf(Perl_debug_log,
4958 "%*s %ssubpattern success...%s\n",
5bc10b2c 4959 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4960 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4961 sayYES; /* Success! */
dad79028 4962
40a82448
DM
4963#undef ST
4964#define ST st->u.ifmatch
4965
4966 case SUSPEND: /* (?>A) */
4967 ST.wanted = 1;
9fe1d20c 4968 PL_reginput = locinput;
9041c2e3 4969 goto do_ifmatch;
dad79028 4970
40a82448
DM
4971 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4972 ST.wanted = 0;
dad79028
DM
4973 goto ifmatch_trivial_fail_test;
4974
40a82448
DM
4975 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4976 ST.wanted = 1;
dad79028 4977 ifmatch_trivial_fail_test:
a0ed51b3 4978 if (scan->flags) {
52657f30 4979 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4980 if (!s) {
4981 /* trivial fail */
24d3c4a9
DM
4982 if (logical) {
4983 logical = 0;
4984 sw = 1 - (bool)ST.wanted;
dad79028 4985 }
40a82448 4986 else if (ST.wanted)
dad79028
DM
4987 sayNO;
4988 next = scan + ARG(scan);
4989 if (next == scan)
4990 next = NULL;
4991 break;
4992 }
efb30f32 4993 PL_reginput = s;
a0ed51b3
LW
4994 }
4995 else
4996 PL_reginput = locinput;
4997
c277df42 4998 do_ifmatch:
40a82448 4999 ST.me = scan;
24d3c4a9 5000 ST.logical = logical;
24d786f4
YO
5001 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5002
40a82448
DM
5003 /* execute body of (?...A) */
5004 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5005 /* NOTREACHED */
5006
5007 case IFMATCH_A_fail: /* body of (?...A) failed */
5008 ST.wanted = !ST.wanted;
5009 /* FALL THROUGH */
5010
5011 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
5012 if (ST.logical) {
5013 sw = (bool)ST.wanted;
40a82448
DM
5014 }
5015 else if (!ST.wanted)
5016 sayNO;
5017
5018 if (OP(ST.me) == SUSPEND)
5019 locinput = PL_reginput;
5020 else {
5021 locinput = PL_reginput = st->locinput;
5022 nextchr = UCHARAT(locinput);
5023 }
5024 scan = ST.me + ARG(ST.me);
5025 if (scan == ST.me)
5026 scan = NULL;
5027 continue; /* execute B */
5028
5029#undef ST
dad79028 5030
c277df42 5031 case LONGJMP:
c277df42
IZ
5032 next = scan + ARG(scan);
5033 if (next == scan)
5034 next = NULL;
a0d0e21e 5035 break;
54612592 5036 case COMMIT:
e2e6a0f1
YO
5037 reginfo->cutpoint = PL_regeol;
5038 /* FALLTHROUGH */
5d458dd8 5039 case PRUNE:
24b23f37 5040 PL_reginput = locinput;
e2e6a0f1 5041 if (!scan->flags)
ad64d0ec 5042 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592
YO
5043 PUSH_STATE_GOTO(COMMIT_next,next);
5044 /* NOTREACHED */
5045 case COMMIT_next_fail:
5046 no_final = 1;
5047 /* FALLTHROUGH */
7f69552c
YO
5048 case OPFAIL:
5049 sayNO;
e2e6a0f1
YO
5050 /* NOTREACHED */
5051
5052#define ST st->u.mark
5053 case MARKPOINT:
5054 ST.prev_mark = mark_state;
5d458dd8 5055 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5056 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5057 mark_state = st;
5058 ST.mark_loc = PL_reginput = locinput;
5059 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5060 /* NOTREACHED */
5061 case MARKPOINT_next:
5062 mark_state = ST.prev_mark;
5063 sayYES;
5064 /* NOTREACHED */
5065 case MARKPOINT_next_fail:
5d458dd8 5066 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5067 {
5068 if (ST.mark_loc > startpoint)
5069 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5070 popmark = NULL; /* we found our mark */
5071 sv_commit = ST.mark_name;
5072
5073 DEBUG_EXECUTE_r({
5d458dd8 5074 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5075 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5076 REPORT_CODE_OFF+depth*2, "",
be2597df 5077 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5078 });
5079 }
5080 mark_state = ST.prev_mark;
5d458dd8
YO
5081 sv_yes_mark = mark_state ?
5082 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
5083 sayNO;
5084 /* NOTREACHED */
5d458dd8
YO
5085 case SKIP:
5086 PL_reginput = locinput;
5087 if (scan->flags) {
2bf803e2 5088 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5089 ST.mark_name = NULL;
e2e6a0f1 5090 ST.mark_loc = locinput;
5d458dd8
YO
5091 PUSH_STATE_GOTO(SKIP_next,next);
5092 } else {
2bf803e2 5093 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5094 otherwise do nothing. Meaning we need to scan
5095 */
5096 regmatch_state *cur = mark_state;
ad64d0ec 5097 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5098
5099 while (cur) {
5100 if ( sv_eq( cur->u.mark.mark_name,
5101 find ) )
5102 {
5103 ST.mark_name = find;
5104 PUSH_STATE_GOTO( SKIP_next, next );
5105 }
5106 cur = cur->u.mark.prev_mark;
5107 }
e2e6a0f1 5108 }
2bf803e2 5109 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5110 break;
5111 case SKIP_next_fail:
5112 if (ST.mark_name) {
5113 /* (*CUT:NAME) - Set up to search for the name as we
5114 collapse the stack*/
5115 popmark = ST.mark_name;
5116 } else {
5117 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5118 if (ST.mark_loc > startpoint)
5119 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5120 /* but we set sv_commit to latest mark_name if there
5121 is one so they can test to see how things lead to this
5122 cut */
5123 if (mark_state)
5124 sv_commit=mark_state->u.mark.mark_name;
5125 }
e2e6a0f1
YO
5126 no_final = 1;
5127 sayNO;
5128 /* NOTREACHED */
5129#undef ST
32e6a07c
YO
5130 case FOLDCHAR:
5131 n = ARG(scan);
81d4fa0f 5132 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
e64b1bd1
YO
5133 locinput += ln;
5134 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5135 sayNO;
5136 } else {
5137 U8 folded[UTF8_MAXBYTES_CASE+1];
5138 STRLEN foldlen;
5139 const char * const l = locinput;
5140 char *e = PL_regeol;
5141 to_uni_fold(n, folded, &foldlen);
5142
59fe32ea 5143 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
e64b1bd1 5144 l, &e, 0, do_utf8)) {
32e6a07c 5145 sayNO;
e64b1bd1
YO
5146 }
5147 locinput = e;
32e6a07c
YO
5148 }
5149 nextchr = UCHARAT(locinput);
5150 break;
e1d1eefb
YO
5151 case LNBREAK:
5152 if ((n=is_LNBREAK(locinput,do_utf8))) {
5153 locinput += n;
5154 nextchr = UCHARAT(locinput);
5155 } else
5156 sayNO;
5157 break;
5158
5159#define CASE_CLASS(nAmE) \
5160 case nAmE: \
5161 if ((n=is_##nAmE(locinput,do_utf8))) { \
5162 locinput += n; \
5163 nextchr = UCHARAT(locinput); \
5164 } else \
5165 sayNO; \
5166 break; \
5167 case N##nAmE: \
5168 if ((n=is_##nAmE(locinput,do_utf8))) { \
5169 sayNO; \
5170 } else { \
5171 locinput += UTF8SKIP(locinput); \
5172 nextchr = UCHARAT(locinput); \
5173 } \
5174 break
5175
5176 CASE_CLASS(VERTWS);
5177 CASE_CLASS(HORIZWS);
5178#undef CASE_CLASS
5179
a0d0e21e 5180 default:
b900a521 5181 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5182 PTR2UV(scan), OP(scan));
cea2e8a9 5183 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5184
5185 } /* end switch */
95b24440 5186
5d458dd8
YO
5187 /* switch break jumps here */
5188 scan = next; /* prepare to execute the next op and ... */
5189 continue; /* ... jump back to the top, reusing st */
95b24440
DM
5190 /* NOTREACHED */
5191
40a82448
DM
5192 push_yes_state:
5193 /* push a state that backtracks on success */
5194 st->u.yes.prev_yes_state = yes_state;
5195 yes_state = st;
5196 /* FALL THROUGH */
5197 push_state:
5198 /* push a new regex state, then continue at scan */
5199 {
5200 regmatch_state *newst;
5201
24b23f37
YO
5202 DEBUG_STACK_r({
5203 regmatch_state *cur = st;
5204 regmatch_state *curyes = yes_state;
5205 int curd = depth;
5206 regmatch_slab *slab = PL_regmatch_slab;
5207 for (;curd > -1;cur--,curd--) {
5208 if (cur < SLAB_FIRST(slab)) {
5209 slab = slab->prev;
5210 cur = SLAB_LAST(slab);
5211 }
5212 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5213 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5214 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5215 (curyes == cur) ? "yes" : ""
5216 );
5217 if (curyes == cur)
5218 curyes = cur->u.yes.prev_yes_state;
5219 }
5220 } else
5221 DEBUG_STATE_pp("push")
5222 );
40a82448 5223 depth++;
40a82448
DM
5224 st->locinput = locinput;
5225 newst = st+1;
5226 if (newst > SLAB_LAST(PL_regmatch_slab))
5227 newst = S_push_slab(aTHX);
5228 PL_regmatch_state = newst;
786e8c11 5229
40a82448
DM
5230 locinput = PL_reginput;
5231 nextchr = UCHARAT(locinput);
5232 st = newst;
5233 continue;
5234 /* NOTREACHED */
5235 }
a0d0e21e 5236 }
a687059c 5237
a0d0e21e
LW
5238 /*
5239 * We get here only if there's trouble -- normally "case END" is
5240 * the terminating point.
5241 */
cea2e8a9 5242 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5243 /*NOTREACHED*/
4633a7c4
LW
5244 sayNO;
5245
262b90c4 5246yes:
77cb431f
DM
5247 if (yes_state) {
5248 /* we have successfully completed a subexpression, but we must now
5249 * pop to the state marked by yes_state and continue from there */
77cb431f 5250 assert(st != yes_state);
5bc10b2c
DM
5251#ifdef DEBUGGING
5252 while (st != yes_state) {
5253 st--;
5254 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5255 PL_regmatch_slab = PL_regmatch_slab->prev;
5256 st = SLAB_LAST(PL_regmatch_slab);
5257 }
e2e6a0f1 5258 DEBUG_STATE_r({
54612592
YO
5259 if (no_final) {
5260 DEBUG_STATE_pp("pop (no final)");
5261 } else {
5262 DEBUG_STATE_pp("pop (yes)");
5263 }
e2e6a0f1 5264 });
5bc10b2c
DM
5265 depth--;
5266 }
5267#else
77cb431f
DM
5268 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5269 || yes_state > SLAB_LAST(PL_regmatch_slab))
5270 {
5271 /* not in this slab, pop slab */
5272 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5273 PL_regmatch_slab = PL_regmatch_slab->prev;
5274 st = SLAB_LAST(PL_regmatch_slab);
5275 }
5276 depth -= (st - yes_state);
5bc10b2c 5277#endif
77cb431f
DM
5278 st = yes_state;
5279 yes_state = st->u.yes.prev_yes_state;
5280 PL_regmatch_state = st;
24b23f37 5281
5d458dd8
YO
5282 if (no_final) {
5283 locinput= st->locinput;
5284 nextchr = UCHARAT(locinput);
5285 }
54612592 5286 state_num = st->resume_state + no_final;
24d3c4a9 5287 goto reenter_switch;
77cb431f
DM
5288 }
5289
a3621e74 5290 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5291 PL_colors[4], PL_colors[5]));
02db2b7b 5292
19b95bf0
DM
5293 if (PL_reg_eval_set) {
5294 /* each successfully executed (?{...}) block does the equivalent of
5295 * local $^R = do {...}
5296 * When popping the save stack, all these locals would be undone;
5297 * bypass this by setting the outermost saved $^R to the latest
5298 * value */
5299 if (oreplsv != GvSV(PL_replgv))
5300 sv_setsv(oreplsv, GvSV(PL_replgv));
5301 }
95b24440 5302 result = 1;
aa283a38 5303 goto final_exit;
4633a7c4
LW
5304
5305no:
a3621e74 5306 DEBUG_EXECUTE_r(
7821416a 5307 PerlIO_printf(Perl_debug_log,
786e8c11 5308 "%*s %sfailed...%s\n",
5bc10b2c 5309 REPORT_CODE_OFF+depth*2, "",
786e8c11 5310 PL_colors[4], PL_colors[5])
7821416a 5311 );
aa283a38 5312
262b90c4 5313no_silent:
54612592
YO
5314 if (no_final) {
5315 if (yes_state) {
5316 goto yes;
5317 } else {
5318 goto final_exit;
5319 }
5320 }
aa283a38
DM
5321 if (depth) {
5322 /* there's a previous state to backtrack to */
40a82448
DM
5323 st--;
5324 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5325 PL_regmatch_slab = PL_regmatch_slab->prev;
5326 st = SLAB_LAST(PL_regmatch_slab);
5327 }
5328 PL_regmatch_state = st;
40a82448
DM
5329 locinput= st->locinput;
5330 nextchr = UCHARAT(locinput);
5331
5bc10b2c
DM
5332 DEBUG_STATE_pp("pop");
5333 depth--;
262b90c4
DM
5334 if (yes_state == st)
5335 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5336
24d3c4a9
DM
5337 state_num = st->resume_state + 1; /* failure = success + 1 */
5338 goto reenter_switch;
95b24440 5339 }
24d3c4a9 5340 result = 0;
aa283a38 5341
262b90c4 5342 final_exit:
bbe252da 5343 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5344 SV *sv_err = get_sv("REGERROR", 1);
5345 SV *sv_mrk = get_sv("REGMARK", 1);
5346 if (result) {
e2e6a0f1 5347 sv_commit = &PL_sv_no;
5d458dd8
YO
5348 if (!sv_yes_mark)
5349 sv_yes_mark = &PL_sv_yes;
5350 } else {
5351 if (!sv_commit)
5352 sv_commit = &PL_sv_yes;
5353 sv_yes_mark = &PL_sv_no;
5354 }
5355 sv_setsv(sv_err, sv_commit);
5356 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5357 }
19b95bf0 5358
2f554ef7
DM
5359 /* clean up; in particular, free all slabs above current one */
5360 LEAVE_SCOPE(oldsave);
5d9a96ca 5361
95b24440 5362 return result;
a687059c
LW
5363}
5364
5365/*
5366 - regrepeat - repeatedly match something simple, report how many
5367 */
5368/*
5369 * [This routine now assumes that it will only match on things of length 1.
5370 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5371 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5372 */
76e3520e 5373STATIC I32
e2e6a0f1 5374S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5375{
27da23d5 5376 dVAR;
a0d0e21e 5377 register char *scan;
a0d0e21e 5378 register I32 c;
3280af22 5379 register char *loceol = PL_regeol;
a0ed51b3 5380 register I32 hardcount = 0;
53c4c00c 5381 register bool do_utf8 = PL_reg_match_utf8;
4f55667c
SP
5382#ifndef DEBUGGING
5383 PERL_UNUSED_ARG(depth);
5384#endif
a0d0e21e 5385
7918f24d
NC
5386 PERL_ARGS_ASSERT_REGREPEAT;
5387
3280af22 5388 scan = PL_reginput;
faf11cac
HS
5389 if (max == REG_INFTY)
5390 max = I32_MAX;
5391 else if (max < loceol - scan)
7f596f4c 5392 loceol = scan + max;
a0d0e21e 5393 switch (OP(p)) {
22c35a8c 5394 case REG_ANY:
1aa99e6b 5395 if (do_utf8) {
ffc61ed2 5396 loceol = PL_regeol;
1aa99e6b 5397 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5398 scan += UTF8SKIP(scan);
5399 hardcount++;
5400 }
5401 } else {
5402 while (scan < loceol && *scan != '\n')
5403 scan++;
a0ed51b3
LW
5404 }
5405 break;
ffc61ed2 5406 case SANY:
def8e4ea
JH
5407 if (do_utf8) {
5408 loceol = PL_regeol;
a0804c9e 5409 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5410 scan += UTF8SKIP(scan);
5411 hardcount++;
5412 }
5413 }
5414 else
5415 scan = loceol;
a0ed51b3 5416 break;
f33976b4
DB
5417 case CANY:
5418 scan = loceol;
5419 break;
090f7165
JH
5420 case EXACT: /* length of string is 1 */
5421 c = (U8)*STRING(p);
5422 while (scan < loceol && UCHARAT(scan) == c)
5423 scan++;
bbce6d69 5424 break;
5425 case EXACTF: /* length of string is 1 */
cd439c50 5426 c = (U8)*STRING(p);
bbce6d69 5427 while (scan < loceol &&
22c35a8c 5428 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 5429 scan++;
5430 break;
5431 case EXACTFL: /* length of string is 1 */
3280af22 5432 PL_reg_flags |= RF_tainted;
cd439c50 5433 c = (U8)*STRING(p);
bbce6d69 5434 while (scan < loceol &&
22c35a8c 5435 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
5436 scan++;
5437 break;
5438 case ANYOF:
ffc61ed2
JH
5439 if (do_utf8) {
5440 loceol = PL_regeol;
cfc92286 5441 while (hardcount < max && scan < loceol &&
32fc9b6a 5442 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
5443 scan += UTF8SKIP(scan);
5444 hardcount++;
5445 }
5446 } else {
32fc9b6a 5447 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
5448 scan++;
5449 }
a0d0e21e
LW
5450 break;
5451 case ALNUM:
1aa99e6b 5452 if (do_utf8) {
ffc61ed2 5453 loceol = PL_regeol;
1a4fad37 5454 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5455 while (hardcount < max && scan < loceol &&
3568d838 5456 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5457 scan += UTF8SKIP(scan);
5458 hardcount++;
5459 }
5460 } else {
5461 while (scan < loceol && isALNUM(*scan))
5462 scan++;
a0ed51b3
LW
5463 }
5464 break;
bbce6d69 5465 case ALNUML:
3280af22 5466 PL_reg_flags |= RF_tainted;
1aa99e6b 5467 if (do_utf8) {
ffc61ed2 5468 loceol = PL_regeol;
1aa99e6b
IH
5469 while (hardcount < max && scan < loceol &&
5470 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5471 scan += UTF8SKIP(scan);
5472 hardcount++;
5473 }
5474 } else {
5475 while (scan < loceol && isALNUM_LC(*scan))
5476 scan++;
a0ed51b3
LW
5477 }
5478 break;
a0d0e21e 5479 case NALNUM:
1aa99e6b 5480 if (do_utf8) {
ffc61ed2 5481 loceol = PL_regeol;
1a4fad37 5482 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5483 while (hardcount < max && scan < loceol &&
3568d838 5484 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5485 scan += UTF8SKIP(scan);
5486 hardcount++;
5487 }
5488 } else {
5489 while (scan < loceol && !isALNUM(*scan))
5490 scan++;
a0ed51b3
LW
5491 }
5492 break;
bbce6d69 5493 case NALNUML:
3280af22 5494 PL_reg_flags |= RF_tainted;
1aa99e6b 5495 if (do_utf8) {
ffc61ed2 5496 loceol = PL_regeol;
1aa99e6b
IH
5497 while (hardcount < max && scan < loceol &&
5498 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5499 scan += UTF8SKIP(scan);
5500 hardcount++;
5501 }
5502 } else {
5503 while (scan < loceol && !isALNUM_LC(*scan))
5504 scan++;
a0ed51b3
LW
5505 }
5506 break;
a0d0e21e 5507 case SPACE:
1aa99e6b 5508 if (do_utf8) {
ffc61ed2 5509 loceol = PL_regeol;
1a4fad37 5510 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5511 while (hardcount < max && scan < loceol &&
3568d838
JH
5512 (*scan == ' ' ||
5513 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5514 scan += UTF8SKIP(scan);
5515 hardcount++;
5516 }
5517 } else {
5518 while (scan < loceol && isSPACE(*scan))
5519 scan++;
a0ed51b3
LW
5520 }
5521 break;
bbce6d69 5522 case SPACEL:
3280af22 5523 PL_reg_flags |= RF_tainted;
1aa99e6b 5524 if (do_utf8) {
ffc61ed2 5525 loceol = PL_regeol;
1aa99e6b 5526 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5527 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5528 scan += UTF8SKIP(scan);
5529 hardcount++;
5530 }
5531 } else {
5532 while (scan < loceol && isSPACE_LC(*scan))
5533 scan++;
a0ed51b3
LW
5534 }
5535 break;
a0d0e21e 5536 case NSPACE:
1aa99e6b 5537 if (do_utf8) {
ffc61ed2 5538 loceol = PL_regeol;
1a4fad37 5539 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5540 while (hardcount < max && scan < loceol &&
3568d838
JH
5541 !(*scan == ' ' ||
5542 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5543 scan += UTF8SKIP(scan);
5544 hardcount++;
5545 }
5546 } else {
5547 while (scan < loceol && !isSPACE(*scan))
5548 scan++;
a0ed51b3 5549 }
0008a298 5550 break;
bbce6d69 5551 case NSPACEL:
3280af22 5552 PL_reg_flags |= RF_tainted;
1aa99e6b 5553 if (do_utf8) {
ffc61ed2 5554 loceol = PL_regeol;
1aa99e6b 5555 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5556 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5557 scan += UTF8SKIP(scan);
5558 hardcount++;
5559 }
5560 } else {
5561 while (scan < loceol && !isSPACE_LC(*scan))
5562 scan++;
a0ed51b3
LW
5563 }
5564 break;
a0d0e21e 5565 case DIGIT:
1aa99e6b 5566 if (do_utf8) {
ffc61ed2 5567 loceol = PL_regeol;
1a4fad37 5568 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5569 while (hardcount < max && scan < loceol &&
3568d838 5570 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5571 scan += UTF8SKIP(scan);
5572 hardcount++;
5573 }
5574 } else {
5575 while (scan < loceol && isDIGIT(*scan))
5576 scan++;
a0ed51b3
LW
5577 }
5578 break;
a0d0e21e 5579 case NDIGIT:
1aa99e6b 5580 if (do_utf8) {
ffc61ed2 5581 loceol = PL_regeol;
1a4fad37 5582 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5583 while (hardcount < max && scan < loceol &&
3568d838 5584 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5585 scan += UTF8SKIP(scan);
5586 hardcount++;
5587 }
5588 } else {
5589 while (scan < loceol && !isDIGIT(*scan))
5590 scan++;
a0ed51b3 5591 }
e1d1eefb
YO
5592 case LNBREAK:
5593 if (do_utf8) {
5594 loceol = PL_regeol;
5595 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5596 scan += c;
5597 hardcount++;
5598 }
5599 } else {
5600 /*
5601 LNBREAK can match two latin chars, which is ok,
5602 because we have a null terminated string, but we
5603 have to use hardcount in this situation
5604 */
5605 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5606 scan+=c;
5607 hardcount++;
5608 }
5609 }
5610 break;
5611 case HORIZWS:
5612 if (do_utf8) {
5613 loceol = PL_regeol;
5614 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5615 scan += c;
5616 hardcount++;
5617 }
5618 } else {
5619 while (scan < loceol && is_HORIZWS_latin1(scan))
5620 scan++;
5621 }
a0ed51b3 5622 break;
e1d1eefb
YO
5623 case NHORIZWS:
5624 if (do_utf8) {
5625 loceol = PL_regeol;
5626 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5627 scan += UTF8SKIP(scan);
5628 hardcount++;
5629 }
5630 } else {
5631 while (scan < loceol && !is_HORIZWS_latin1(scan))
5632 scan++;
5633
5634 }
5635 break;
5636 case VERTWS:
5637 if (do_utf8) {
5638 loceol = PL_regeol;
5639 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5640 scan += c;
5641 hardcount++;
5642 }
5643 } else {
5644 while (scan < loceol && is_VERTWS_latin1(scan))
5645 scan++;
5646
5647 }
5648 break;
5649 case NVERTWS:
5650 if (do_utf8) {
5651 loceol = PL_regeol;
5652 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5653 scan += UTF8SKIP(scan);
5654 hardcount++;
5655 }
5656 } else {
5657 while (scan < loceol && !is_VERTWS_latin1(scan))
5658 scan++;
5659
5660 }
5661 break;
5662
a0d0e21e
LW
5663 default: /* Called on something of 0 width. */
5664 break; /* So match right here or not at all. */
5665 }
a687059c 5666
a0ed51b3
LW
5667 if (hardcount)
5668 c = hardcount;
5669 else
5670 c = scan - PL_reginput;
3280af22 5671 PL_reginput = scan;
a687059c 5672
a3621e74 5673 DEBUG_r({
e68ec53f 5674 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5675 DEBUG_EXECUTE_r({
e68ec53f
YO
5676 SV * const prop = sv_newmortal();
5677 regprop(prog, prop, p);
5678 PerlIO_printf(Perl_debug_log,
be8e71aa 5679 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 5680 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5681 });
be8e71aa 5682 });
9041c2e3 5683
a0d0e21e 5684 return(c);
a687059c
LW
5685}
5686
c277df42 5687
be8e71aa 5688#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5689/*
ffc61ed2
JH
5690- regclass_swash - prepare the utf8 swash
5691*/
5692
5693SV *
32fc9b6a 5694Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5695{
97aff369 5696 dVAR;
9e55ce06
JH
5697 SV *sw = NULL;
5698 SV *si = NULL;
5699 SV *alt = NULL;
f8fc2ecf
YO
5700 RXi_GET_DECL(prog,progi);
5701 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 5702
7918f24d
NC
5703 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5704
4f639d21 5705 if (data && data->count) {
a3b680e6 5706 const U32 n = ARG(node);
ffc61ed2 5707
4f639d21 5708 if (data->what[n] == 's') {
ad64d0ec
NC
5709 SV * const rv = MUTABLE_SV(data->data[n]);
5710 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 5711 SV **const ary = AvARRAY(av);
9e55ce06 5712 SV **a, **b;
9041c2e3 5713
711a919c 5714 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5715 * documentation of these array elements. */
5716
b11f357e 5717 si = *ary;
fe5bfecd
JH
5718 a = SvROK(ary[1]) ? &ary[1] : NULL;
5719 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
b11f357e 5720
ffc61ed2
JH
5721 if (a)
5722 sw = *a;
5723 else if (si && doinit) {
5724 sw = swash_init("utf8", "", si, 1, 0);
5725 (void)av_store(av, 1, sw);
5726 }
9e55ce06
JH
5727 if (b)
5728 alt = *b;
ffc61ed2
JH
5729 }
5730 }
5731
9e55ce06
JH
5732 if (listsvp)
5733 *listsvp = si;
5734 if (altsvp)
5735 *altsvp = alt;
ffc61ed2
JH
5736
5737 return sw;
5738}
76234dfb 5739#endif
ffc61ed2
JH
5740
5741/*
ba7b4546 5742 - reginclass - determine if a character falls into a character class
832705d4
JH
5743
5744 The n is the ANYOF regnode, the p is the target string, lenp
5745 is pointer to the maximum length of how far to go in the p
5746 (if the lenp is zero, UTF8SKIP(p) is used),
5747 do_utf8 tells whether the target string is in UTF-8.
5748
bbce6d69 5749 */
5750
76e3520e 5751STATIC bool
32fc9b6a 5752S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5753{
27da23d5 5754 dVAR;
a3b680e6 5755 const char flags = ANYOF_FLAGS(n);
bbce6d69 5756 bool match = FALSE;
cc07378b 5757 UV c = *p;
ae9ddab8 5758 STRLEN len = 0;
9e55ce06 5759 STRLEN plen;
1aa99e6b 5760
7918f24d
NC
5761 PERL_ARGS_ASSERT_REGINCLASS;
5762
19f67299
TS
5763 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5764 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5765 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5766 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5767 if (len == (STRLEN)-1)
5768 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5769 }
bbce6d69 5770
0f0076b4 5771 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5772 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5773 if (lenp)
5774 *lenp = 0;
ffc61ed2 5775 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5776 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5777 match = TRUE;
bbce6d69 5778 }
3568d838 5779 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5780 match = TRUE;
ffc61ed2 5781 if (!match) {
9e55ce06 5782 AV *av;
32fc9b6a 5783 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5784
5785 if (sw) {
3f0c5693
KW
5786 U8 * utf8_p;
5787 if (do_utf8) {
5788 utf8_p = (U8 *) p;
5789 } else {
5790 STRLEN len = 1;
5791 utf8_p = bytes_to_utf8(p, &len);
5792 }
5793 if (swash_fetch(sw, utf8_p, 1))
ffc61ed2
JH
5794 match = TRUE;
5795 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5796 if (!match && lenp && av) {
5797 I32 i;
9e55ce06 5798 for (i = 0; i <= av_len(av); i++) {
890ce7af 5799 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5800 STRLEN len;
890ce7af 5801 const char * const s = SvPV_const(sv, len);
3f0c5693 5802 if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
9e55ce06
JH
5803 *lenp = len;
5804 match = TRUE;
5805 break;
5806 }
5807 }
5808 }
5809 if (!match) {
89ebb4a3 5810 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43 5811
3f0c5693
KW
5812 STRLEN tmplen;
5813 to_utf8_fold(utf8_p, tmpbuf, &tmplen);
5814 if (swash_fetch(sw, tmpbuf, 1))
9e55ce06
JH
5815 match = TRUE;
5816 }
ffc61ed2 5817 }
b3a04dd3
KW
5818
5819 /* If we allocated a string above, free it */
5820 if (! do_utf8) Safefree(utf8_p);
ffc61ed2 5821 }
bbce6d69 5822 }
9e55ce06 5823 if (match && lenp && *lenp == 0)
0f0076b4 5824 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5825 }
1aa99e6b 5826 if (!match && c < 256) {
ffc61ed2
JH
5827 if (ANYOF_BITMAP_TEST(n, c))
5828 match = TRUE;
5829 else if (flags & ANYOF_FOLD) {
eb160463 5830 U8 f;
a0ed51b3 5831
ffc61ed2
JH
5832 if (flags & ANYOF_LOCALE) {
5833 PL_reg_flags |= RF_tainted;
5834 f = PL_fold_locale[c];
5835 }
5836 else
5837 f = PL_fold[c];
5838 if (f != c && ANYOF_BITMAP_TEST(n, f))
5839 match = TRUE;
5840 }
5841
5842 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5843 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5844 if (
5845 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5846 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5847 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5848 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5849 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5850 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5851 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5852 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5853 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5854 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5855 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5856 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5857 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5858 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5859 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5860 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5861 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5862 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5863 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5864 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5865 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5866 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5867 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5868 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5869 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5870 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5871 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5872 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5873 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5874 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5875 ) /* How's that for a conditional? */
5876 {
5877 match = TRUE;
5878 }
a0ed51b3 5879 }
a0ed51b3
LW
5880 }
5881
a0ed51b3
LW
5882 return (flags & ANYOF_INVERT) ? !match : match;
5883}
161b471a 5884
dfe13c55 5885STATIC U8 *
0ce71af7 5886S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5887{
97aff369 5888 dVAR;
7918f24d
NC
5889
5890 PERL_ARGS_ASSERT_REGHOP3;
5891
a0ed51b3 5892 if (off >= 0) {
1aa99e6b 5893 while (off-- && s < lim) {
ffc61ed2 5894 /* XXX could check well-formedness here */
a0ed51b3 5895 s += UTF8SKIP(s);
ffc61ed2 5896 }
a0ed51b3
LW
5897 }
5898 else {
1de06328
YO
5899 while (off++ && s > lim) {
5900 s--;
5901 if (UTF8_IS_CONTINUED(*s)) {
5902 while (s > lim && UTF8_IS_CONTINUATION(*s))
5903 s--;
a0ed51b3 5904 }
1de06328 5905 /* XXX could check well-formedness here */
a0ed51b3
LW
5906 }
5907 }
5908 return s;
5909}
161b471a 5910
f9f4320a
YO
5911#ifdef XXX_dmq
5912/* there are a bunch of places where we use two reghop3's that should
5913 be replaced with this routine. but since thats not done yet
5914 we ifdef it out - dmq
5915*/
dfe13c55 5916STATIC U8 *
1de06328
YO
5917S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5918{
5919 dVAR;
7918f24d
NC
5920
5921 PERL_ARGS_ASSERT_REGHOP4;
5922
1de06328
YO
5923 if (off >= 0) {
5924 while (off-- && s < rlim) {
5925 /* XXX could check well-formedness here */
5926 s += UTF8SKIP(s);
5927 }
5928 }
5929 else {
5930 while (off++ && s > llim) {
5931 s--;
5932 if (UTF8_IS_CONTINUED(*s)) {
5933 while (s > llim && UTF8_IS_CONTINUATION(*s))
5934 s--;
5935 }
5936 /* XXX could check well-formedness here */
5937 }
5938 }
5939 return s;
5940}
f9f4320a 5941#endif
1de06328
YO
5942
5943STATIC U8 *
0ce71af7 5944S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5945{
97aff369 5946 dVAR;
7918f24d
NC
5947
5948 PERL_ARGS_ASSERT_REGHOPMAYBE3;
5949
a0ed51b3 5950 if (off >= 0) {
1aa99e6b 5951 while (off-- && s < lim) {
ffc61ed2 5952 /* XXX could check well-formedness here */
a0ed51b3 5953 s += UTF8SKIP(s);
ffc61ed2 5954 }
a0ed51b3 5955 if (off >= 0)
3dab1dad 5956 return NULL;
a0ed51b3
LW
5957 }
5958 else {
1de06328
YO
5959 while (off++ && s > lim) {
5960 s--;
5961 if (UTF8_IS_CONTINUED(*s)) {
5962 while (s > lim && UTF8_IS_CONTINUATION(*s))
5963 s--;
a0ed51b3 5964 }
1de06328 5965 /* XXX could check well-formedness here */
a0ed51b3
LW
5966 }
5967 if (off <= 0)
3dab1dad 5968 return NULL;
a0ed51b3
LW
5969 }
5970 return s;
5971}
51371543 5972
51371543 5973static void
acfe0abc 5974restore_pos(pTHX_ void *arg)
51371543 5975{
97aff369 5976 dVAR;
097eb12c 5977 regexp * const rex = (regexp *)arg;
51371543
GS
5978 if (PL_reg_eval_set) {
5979 if (PL_reg_oldsaved) {
4f639d21
DM
5980 rex->subbeg = PL_reg_oldsaved;
5981 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5982#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5983 rex->saved_copy = PL_nrs;
ed252734 5984#endif
07bc277f 5985 RXp_MATCH_COPIED_on(rex);
51371543
GS
5986 }
5987 PL_reg_magic->mg_len = PL_reg_oldpos;
5988 PL_reg_eval_set = 0;
5989 PL_curpm = PL_reg_oldcurpm;
5990 }
5991}
33b8afdf
JH
5992
5993STATIC void
5994S_to_utf8_substr(pTHX_ register regexp *prog)
5995{
a1cac82e 5996 int i = 1;
7918f24d
NC
5997
5998 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
5999
a1cac82e
NC
6000 do {
6001 if (prog->substrs->data[i].substr
6002 && !prog->substrs->data[i].utf8_substr) {
6003 SV* const sv = newSVsv(prog->substrs->data[i].substr);
6004 prog->substrs->data[i].utf8_substr = sv;
6005 sv_utf8_upgrade(sv);
610460f9
NC
6006 if (SvVALID(prog->substrs->data[i].substr)) {
6007 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6008 if (flags & FBMcf_TAIL) {
6009 /* Trim the trailing \n that fbm_compile added last
6010 time. */
6011 SvCUR_set(sv, SvCUR(sv) - 1);
6012 /* Whilst this makes the SV technically "invalid" (as its
6013 buffer is no longer followed by "\0") when fbm_compile()
6014 adds the "\n" back, a "\0" is restored. */
6015 }
6016 fbm_compile(sv, flags);
6017 }
a1cac82e
NC
6018 if (prog->substrs->data[i].substr == prog->check_substr)
6019 prog->check_utf8 = sv;
6020 }
6021 } while (i--);
33b8afdf
JH
6022}
6023
6024STATIC void
6025S_to_byte_substr(pTHX_ register regexp *prog)
6026{
97aff369 6027 dVAR;
a1cac82e 6028 int i = 1;
7918f24d
NC
6029
6030 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6031
a1cac82e
NC
6032 do {
6033 if (prog->substrs->data[i].utf8_substr
6034 && !prog->substrs->data[i].substr) {
6035 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6036 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9
NC
6037 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6038 const U8 flags
6039 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6040 if (flags & FBMcf_TAIL) {
6041 /* Trim the trailing \n that fbm_compile added last
6042 time. */
6043 SvCUR_set(sv, SvCUR(sv) - 1);
6044 }
6045 fbm_compile(sv, flags);
6046 }
a1cac82e
NC
6047 } else {
6048 SvREFCNT_dec(sv);
6049 sv = &PL_sv_undef;
6050 }
6051 prog->substrs->data[i].substr = sv;
6052 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6053 prog->check_substr = sv;
33b8afdf 6054 }
a1cac82e 6055 } while (i--);
33b8afdf 6056}
66610fdd
RGS
6057
6058/*
6059 * Local variables:
6060 * c-indentation-style: bsd
6061 * c-basic-offset: 4
6062 * indent-tabs-mode: t
6063 * End:
6064 *
37442d52
RGS
6065 * ex: set ts=8 sts=4 sw=4 noet:
6066 */