This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge branch 'blead' of ssh://perl5.git.perl.org/perl into blead
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
c277df42
IZ
83#define RF_tainted 1 /* tainted information used? */
84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
eb160463 88#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
32fc9b6a 97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 98
c277df42
IZ
99/*
100 * Forwards.
101 */
102
33b8afdf 103#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 105
3dab1dad
YO
106#define HOPc(pos,off) \
107 (char *)(PL_reg_match_utf8 \
52657f30 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
109 : (U8*)(pos + off))
110#define HOPBACKc(pos, off) \
07be1b83
YO
111 (char*)(PL_reg_match_utf8\
112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
113 : (pos - off >= PL_bostr) \
8e11feef 114 ? (U8*)pos - off \
3dab1dad 115 : NULL)
efb30f32 116
e7409c1b 117#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 118#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 119
20d0b1e9 120/* these are unrolled below in the CCC_TRY_XXX defined */
1a4fad37
AL
121#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
122 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
123#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
124#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
125#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
126#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 127
20d0b1e9 128
d1eb3177
YO
129/*
130 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
131 so that it is possible to override the option here without having to
132 rebuild the entire core. as we are required to do if we change regcomp.h
133 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
134*/
135#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
136#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
137#endif
138
139#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
140#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
141#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
142#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
143#define RE_utf8_perl_word PL_utf8_alnum
144#define RE_utf8_perl_space PL_utf8_space
145#define RE_utf8_posix_digit PL_utf8_digit
146#define perl_word alnum
147#define perl_space space
148#define posix_digit digit
149#else
150#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
151#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
152#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
153#define RE_utf8_perl_word PL_utf8_perl_word
154#define RE_utf8_perl_space PL_utf8_perl_space
155#define RE_utf8_posix_digit PL_utf8_posix_digit
156#endif
157
158
20d0b1e9
YO
159#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
160 case NAMEL: \
161 PL_reg_flags |= RF_tainted; \
162 /* FALL THROUGH */ \
163 case NAME: \
164 if (!nextchr) \
165 sayNO; \
166 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
167 if (!CAT2(PL_utf8_,CLASS)) { \
168 bool ok; \
169 ENTER; \
170 save_re_context(); \
171 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
172 assert(ok); \
173 LEAVE; \
174 } \
175 if (!(OP(scan) == NAME \
176 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
177 : LCFUNC_utf8((U8*)locinput))) \
178 { \
179 sayNO; \
180 } \
181 locinput += PL_utf8skip[nextchr]; \
182 nextchr = UCHARAT(locinput); \
183 break; \
184 } \
185 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
186 sayNO; \
187 nextchr = UCHARAT(++locinput); \
188 break
189
190#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
191 case NAMEL: \
192 PL_reg_flags |= RF_tainted; \
193 /* FALL THROUGH */ \
194 case NAME : \
195 if (!nextchr && locinput >= PL_regeol) \
196 sayNO; \
197 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
198 if (!CAT2(PL_utf8_,CLASS)) { \
199 bool ok; \
200 ENTER; \
201 save_re_context(); \
202 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
203 assert(ok); \
204 LEAVE; \
205 } \
206 if ((OP(scan) == NAME \
207 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
208 : LCFUNC_utf8((U8*)locinput))) \
209 { \
210 sayNO; \
211 } \
212 locinput += PL_utf8skip[nextchr]; \
213 nextchr = UCHARAT(locinput); \
214 break; \
215 } \
216 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
217 sayNO; \
218 nextchr = UCHARAT(++locinput); \
219 break
220
221
d1eb3177
YO
222
223
224
3dab1dad
YO
225/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
226
5f80c4cf 227/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
228/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
229#define JUMPABLE(rn) ( \
230 OP(rn) == OPEN || \
231 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
232 OP(rn) == EVAL || \
cca55fe3
JP
233 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
234 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 235 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 236 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 237)
ee9b8eae 238#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 239
ee9b8eae
YO
240#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
241
242#if 0
243/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
244 we don't need this definition. */
245#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
246#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
247#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
248
249#else
250/* ... so we use this as its faster. */
251#define IS_TEXT(rn) ( OP(rn)==EXACT )
252#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
253#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
254
255#endif
e2d8ce26 256
a84d97b6
HS
257/*
258 Search for mandatory following text node; for lookahead, the text must
259 follow but for lookbehind (rn->flags != 0) we skip to the next step.
260*/
cca55fe3 261#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
262 while (JUMPABLE(rn)) { \
263 const OPCODE type = OP(rn); \
264 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 265 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 266 else if (type == PLUS) \
cca55fe3 267 rn = NEXTOPER(rn); \
3dab1dad 268 else if (type == IFMATCH) \
a84d97b6 269 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 270 else rn += NEXT_OFF(rn); \
3dab1dad 271 } \
5f80c4cf 272} STMT_END
74750237 273
c476f425 274
acfe0abc 275static void restore_pos(pTHX_ void *arg);
51371543 276
76e3520e 277STATIC CHECKPOINT
cea2e8a9 278S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 279{
97aff369 280 dVAR;
a3b680e6 281 const int retval = PL_savestack_ix;
b1ce53c5 282#define REGCP_PAREN_ELEMS 4
a3b680e6 283 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 284 int p;
40a82448 285 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 286
e49a9654
IH
287 if (paren_elems_to_push < 0)
288 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
289
f0ab9afb 290#define REGCP_OTHER_ELEMS 7
4b3c1a47 291 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 292
3280af22 293 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 294/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
295 SSPUSHINT(PL_regoffs[p].end);
296 SSPUSHINT(PL_regoffs[p].start);
3280af22 297 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 298 SSPUSHINT(p);
e7707071 299 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 300 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 301 (UV)p, (IV)PL_regoffs[p].start,
40a82448 302 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 303 (IV)PL_regoffs[p].end
40a82448 304 ));
a0d0e21e 305 }
b1ce53c5 306/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 307 SSPUSHPTR(PL_regoffs);
3280af22
NIS
308 SSPUSHINT(PL_regsize);
309 SSPUSHINT(*PL_reglastparen);
a01268b5 310 SSPUSHINT(*PL_reglastcloseparen);
3280af22 311 SSPUSHPTR(PL_reginput);
41123dfd
JH
312#define REGCP_FRAME_ELEMS 2
313/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
314 * are needed for the regexp context stack bookkeeping. */
315 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 316 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 317
a0d0e21e
LW
318 return retval;
319}
320
c277df42 321/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
322#define REGCP_SET(cp) \
323 DEBUG_STATE_r( \
ab3bbdeb 324 PerlIO_printf(Perl_debug_log, \
e4f74956 325 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
326 (IV)PL_savestack_ix)); \
327 cp = PL_savestack_ix
c3464db5 328
ab3bbdeb 329#define REGCP_UNWIND(cp) \
e4f74956 330 DEBUG_STATE_r( \
ab3bbdeb 331 if (cp != PL_savestack_ix) \
e4f74956
YO
332 PerlIO_printf(Perl_debug_log, \
333 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
334 (IV)(cp), (IV)PL_savestack_ix)); \
335 regcpblow(cp)
c277df42 336
76e3520e 337STATIC char *
097eb12c 338S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 339{
97aff369 340 dVAR;
3b6647e0 341 U32 i;
a0d0e21e 342 char *input;
a3621e74
YO
343 GET_RE_DEBUG_FLAGS_DECL;
344
7918f24d
NC
345 PERL_ARGS_ASSERT_REGCPPOP;
346
b1ce53c5 347 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 348 i = SSPOPINT;
b1ce53c5
JH
349 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
350 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 351 input = (char *) SSPOPPTR;
a01268b5 352 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
353 *PL_reglastparen = SSPOPINT;
354 PL_regsize = SSPOPINT;
f0ab9afb 355 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 356
6bda09f9 357
b1ce53c5 358 /* Now restore the parentheses context. */
41123dfd
JH
359 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
360 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 361 I32 tmps;
097eb12c 362 U32 paren = (U32)SSPOPINT;
3280af22 363 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 364 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 365 tmps = SSPOPINT;
3280af22 366 if (paren <= *PL_reglastparen)
f0ab9afb 367 PL_regoffs[paren].end = tmps;
e7707071 368 DEBUG_BUFFERS_r(
c3464db5 369 PerlIO_printf(Perl_debug_log,
b900a521 370 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 371 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 372 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 373 (IV)PL_regoffs[paren].end,
3280af22 374 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 375 );
a0d0e21e 376 }
e7707071 377 DEBUG_BUFFERS_r(
bb7a0f54 378 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 379 PerlIO_printf(Perl_debug_log,
faccc32b 380 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 381 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
382 }
383 );
daf18116 384#if 1
dafc8851
JH
385 /* It would seem that the similar code in regtry()
386 * already takes care of this, and in fact it is in
387 * a better location to since this code can #if 0-ed out
388 * but the code in regtry() is needed or otherwise tests
389 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
390 * (as of patchlevel 7877) will fail. Then again,
391 * this code seems to be necessary or otherwise
225593e1
DM
392 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
393 * --jhi updated by dapm */
3b6647e0 394 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 395 if (i > PL_regsize)
f0ab9afb
NC
396 PL_regoffs[i].start = -1;
397 PL_regoffs[i].end = -1;
a0d0e21e 398 }
dafc8851 399#endif
a0d0e21e
LW
400 return input;
401}
402
02db2b7b 403#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 404
a687059c 405/*
e50aee73 406 * pregexec and friends
a687059c
LW
407 */
408
76234dfb 409#ifndef PERL_IN_XSUB_RE
a687059c 410/*
c277df42 411 - pregexec - match a regexp against a string
a687059c 412 */
c277df42 413I32
49d7dfbc 414Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 415 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
416/* strend: pointer to null at end of string */
417/* strbeg: real beginning of string */
418/* minend: end of match must be >=minend after stringarg. */
419/* nosave: For optimizations. */
420{
7918f24d
NC
421 PERL_ARGS_ASSERT_PREGEXEC;
422
c277df42 423 return
9041c2e3 424 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
425 nosave ? 0 : REXEC_COPY_STR);
426}
76234dfb 427#endif
22e551b9 428
9041c2e3 429/*
cad2e5aa
JH
430 * Need to implement the following flags for reg_anch:
431 *
432 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
433 * USE_INTUIT_ML
434 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
435 * INTUIT_AUTORITATIVE_ML
436 * INTUIT_ONCE_NOML - Intuit can match in one location only.
437 * INTUIT_ONCE_ML
438 *
439 * Another flag for this function: SECOND_TIME (so that float substrs
440 * with giant delta may be not rechecked).
441 */
442
443/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
444
3f7c398e 445/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
446 Otherwise, only SvCUR(sv) is used to get strbeg. */
447
448/* XXXX We assume that strpos is strbeg unless sv. */
449
6eb5f6b9
JH
450/* XXXX Some places assume that there is a fixed substring.
451 An update may be needed if optimizer marks as "INTUITable"
452 RExen without fixed substrings. Similarly, it is assumed that
453 lengths of all the strings are no more than minlen, thus they
454 cannot come from lookahead.
40d049e4
YO
455 (Or minlen should take into account lookahead.)
456 NOTE: Some of this comment is not correct. minlen does now take account
457 of lookahead/behind. Further research is required. -- demerphq
458
459*/
6eb5f6b9 460
2c2d71f5
JH
461/* A failure to find a constant substring means that there is no need to make
462 an expensive call to REx engine, thus we celebrate a failure. Similarly,
463 finding a substring too deep into the string means that less calls to
30944b6d
IZ
464 regtry() should be needed.
465
466 REx compiler's optimizer found 4 possible hints:
467 a) Anchored substring;
468 b) Fixed substring;
469 c) Whether we are anchored (beginning-of-line or \G);
470 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 471 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
472 string which does not contradict any of them.
473 */
2c2d71f5 474
6eb5f6b9
JH
475/* Most of decisions we do here should have been done at compile time.
476 The nodes of the REx which we used for the search should have been
477 deleted from the finite automaton. */
478
cad2e5aa 479char *
288b8c02 480Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 481 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 482{
97aff369 483 dVAR;
288b8c02 484 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 485 register I32 start_shift = 0;
cad2e5aa 486 /* Should be nonnegative! */
b7953727 487 register I32 end_shift = 0;
2c2d71f5
JH
488 register char *s;
489 register SV *check;
a1933d95 490 char *strbeg;
cad2e5aa 491 char *t;
1de06328 492 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 493 I32 ml_anch;
bd61b366
SS
494 register char *other_last = NULL; /* other substr checked before this */
495 char *check_at = NULL; /* check substr found at this pos */
bbe252da 496 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 497 RXi_GET_DECL(prog,progi);
30944b6d 498#ifdef DEBUGGING
890ce7af 499 const char * const i_strpos = strpos;
30944b6d 500#endif
a3621e74
YO
501 GET_RE_DEBUG_FLAGS_DECL;
502
7918f24d
NC
503 PERL_ARGS_ASSERT_RE_INTUIT_START;
504
288b8c02 505 RX_MATCH_UTF8_set(rx,do_utf8);
cad2e5aa 506
3c8556c3 507 if (RX_UTF8(rx)) {
b8d68ded
JH
508 PL_reg_flags |= RF_utf8;
509 }
ab3bbdeb 510 DEBUG_EXECUTE_r(
efd26800 511 debug_start_match(rx, do_utf8, strpos, strend,
1de06328
YO
512 sv ? "Guessing start of match in sv for"
513 : "Guessing start of match in string for");
2a782b5b 514 );
cad2e5aa 515
c344f387
JH
516 /* CHR_DIST() would be more correct here but it makes things slow. */
517 if (prog->minlen > strend - strpos) {
a3621e74 518 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 519 "String too short... [re_intuit_start]\n"));
cad2e5aa 520 goto fail;
2c2d71f5 521 }
1de06328 522
a1933d95 523 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 524 PL_regeol = strend;
33b8afdf
JH
525 if (do_utf8) {
526 if (!prog->check_utf8 && prog->check_substr)
527 to_utf8_substr(prog);
528 check = prog->check_utf8;
529 } else {
530 if (!prog->check_substr && prog->check_utf8)
531 to_byte_substr(prog);
532 check = prog->check_substr;
533 }
1de06328 534 if (check == &PL_sv_undef) {
a3621e74 535 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 536 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
537 goto fail;
538 }
bbe252da
YO
539 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
540 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
541 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 542 && !multiline ) ); /* Check after \n? */
cad2e5aa 543
7e25d62c 544 if (!ml_anch) {
bbe252da
YO
545 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
546 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 547 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
548 && sv && !SvROK(sv)
549 && (strpos != strbeg)) {
a3621e74 550 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
551 goto fail;
552 }
553 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 554 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 555 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
556 I32 slen;
557
1aa99e6b 558 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 559
653099ff
GS
560 if (SvTAIL(check)) {
561 slen = SvCUR(check); /* >= 1 */
cad2e5aa 562
9041c2e3 563 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 564 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 565 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 566 goto fail_finish;
cad2e5aa
JH
567 }
568 /* Now should match s[0..slen-2] */
569 slen--;
3f7c398e 570 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 571 || (slen > 1
3f7c398e 572 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 573 report_neq:
a3621e74 574 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
575 goto fail_finish;
576 }
cad2e5aa 577 }
3f7c398e 578 else if (*SvPVX_const(check) != *s
653099ff 579 || ((slen = SvCUR(check)) > 1
3f7c398e 580 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 581 goto report_neq;
c315bfe8 582 check_at = s;
2c2d71f5 583 goto success_at_start;
7e25d62c 584 }
cad2e5aa 585 }
2c2d71f5 586 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 587 s = strpos;
2c2d71f5 588 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
589 end_shift = prog->check_end_shift;
590
2c2d71f5 591 if (!ml_anch) {
a3b680e6 592 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 593 - (SvTAIL(check) != 0);
a3b680e6 594 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
595
596 if (end_shift < eshift)
597 end_shift = eshift;
598 }
cad2e5aa 599 }
2c2d71f5 600 else { /* Can match at random position */
cad2e5aa
JH
601 ml_anch = 0;
602 s = strpos;
1de06328
YO
603 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
604 end_shift = prog->check_end_shift;
605
606 /* end shift should be non negative here */
cad2e5aa
JH
607 }
608
bcdf7404 609#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 610 if (end_shift < 0)
1de06328 611 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 612 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
613#endif
614
2c2d71f5
JH
615 restart:
616 /* Find a possible match in the region s..strend by looking for
617 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
618
619 {
620 I32 srch_start_shift = start_shift;
621 I32 srch_end_shift = end_shift;
622 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
623 srch_end_shift -= ((strbeg - s) - srch_start_shift);
624 srch_start_shift = strbeg - s;
625 }
6bda09f9 626 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
627 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
628 (IV)prog->check_offset_min,
629 (IV)srch_start_shift,
630 (IV)srch_end_shift,
631 (IV)prog->check_end_shift);
632 });
633
cad2e5aa 634 if (flags & REXEC_SCREAM) {
cad2e5aa 635 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 636 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 637
2c2d71f5
JH
638 if (PL_screamfirst[BmRARE(check)] >= 0
639 || ( BmRARE(check) == '\n'
85c508c3 640 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 641 && SvTAIL(check) ))
9041c2e3 642 s = screaminstr(sv, check,
1de06328 643 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 644 else
2c2d71f5 645 goto fail_finish;
4addbd3b 646 /* we may be pointing at the wrong string */
07bc277f 647 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 648 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
649 if (data)
650 *data->scream_olds = s;
651 }
1de06328
YO
652 else {
653 U8* start_point;
654 U8* end_point;
bbe252da 655 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
656 start_point= (U8*)(s + srch_start_shift);
657 end_point= (U8*)(strend - srch_end_shift);
658 } else {
659 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
660 end_point= HOP3(strend, -srch_end_shift, strbeg);
661 }
6bda09f9 662 DEBUG_OPTIMISE_MORE_r({
56570a2c 663 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 664 (int)(end_point - start_point),
fc8cd66c 665 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
666 start_point);
667 });
668
669 s = fbm_instr( start_point, end_point,
7fba1cd6 670 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
671 }
672 }
cad2e5aa
JH
673 /* Update the count-of-usability, remove useless subpatterns,
674 unshift s. */
2c2d71f5 675
ab3bbdeb
YO
676 DEBUG_EXECUTE_r({
677 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
678 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
679 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 680 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
681 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
682 ? "anchored" : "floating"),
683 quoted,
684 RE_SV_TAIL(check),
685 (s ? " at offset " : "...\n") );
686 });
2c2d71f5
JH
687
688 if (!s)
689 goto fail_finish;
2c2d71f5 690 /* Finish the diagnostic message */
a3621e74 691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 692
1de06328
YO
693 /* XXX dmq: first branch is for positive lookbehind...
694 Our check string is offset from the beginning of the pattern.
695 So we need to do any stclass tests offset forward from that
696 point. I think. :-(
697 */
698
699
700
701 check_at=s;
702
703
2c2d71f5
JH
704 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
705 Start with the other substr.
706 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 707 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
708 *always* match. Probably should be marked during compile...
709 Probably it is right to do no SCREAM here...
710 */
711
1de06328
YO
712 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
713 : (prog->float_substr && prog->anchored_substr))
714 {
30944b6d 715 /* Take into account the "other" substring. */
2c2d71f5
JH
716 /* XXXX May be hopelessly wrong for UTF... */
717 if (!other_last)
6eb5f6b9 718 other_last = strpos;
33b8afdf 719 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
720 do_other_anchored:
721 {
890ce7af
AL
722 char * const last = HOP3c(s, -start_shift, strbeg);
723 char *last1, *last2;
be8e71aa 724 char * const saved_s = s;
33b8afdf 725 SV* must;
2c2d71f5 726
2c2d71f5
JH
727 t = s - prog->check_offset_max;
728 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 729 && (!do_utf8
0ce71af7 730 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 731 && t > strpos)))
6f207bd3 732 NOOP;
2c2d71f5
JH
733 else
734 t = strpos;
1aa99e6b 735 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
736 if (t < other_last) /* These positions already checked */
737 t = other_last;
1aa99e6b 738 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
739 if (last < last1)
740 last1 = last;
1de06328
YO
741 /* XXXX It is not documented what units *_offsets are in.
742 We assume bytes, but this is clearly wrong.
743 Meaning this code needs to be carefully reviewed for errors.
744 dmq.
745 */
746
2c2d71f5 747 /* On end-of-str: see comment below. */
33b8afdf
JH
748 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
749 if (must == &PL_sv_undef) {
750 s = (char*)NULL;
1de06328 751 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
752 }
753 else
754 s = fbm_instr(
755 (unsigned char*)t,
756 HOP3(HOP3(last1, prog->anchored_offset, strend)
757 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
758 must,
7fba1cd6 759 multiline ? FBMrf_MULTILINE : 0
33b8afdf 760 );
ab3bbdeb
YO
761 DEBUG_EXECUTE_r({
762 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
763 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
764 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 765 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
766 quoted, RE_SV_TAIL(must));
767 });
768
769
2c2d71f5
JH
770 if (!s) {
771 if (last1 >= last2) {
a3621e74 772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
773 ", giving up...\n"));
774 goto fail_finish;
775 }
a3621e74 776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 777 ", trying floating at offset %ld...\n",
be8e71aa 778 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
779 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
780 s = HOP3c(last, 1, strend);
2c2d71f5
JH
781 goto restart;
782 }
783 else {
a3621e74 784 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 785 (long)(s - i_strpos)));
1aa99e6b
IH
786 t = HOP3c(s, -prog->anchored_offset, strbeg);
787 other_last = HOP3c(s, 1, strend);
be8e71aa 788 s = saved_s;
2c2d71f5
JH
789 if (t == strpos)
790 goto try_at_start;
2c2d71f5
JH
791 goto try_at_offset;
792 }
30944b6d 793 }
2c2d71f5
JH
794 }
795 else { /* Take into account the floating substring. */
33b8afdf 796 char *last, *last1;
be8e71aa 797 char * const saved_s = s;
33b8afdf
JH
798 SV* must;
799
800 t = HOP3c(s, -start_shift, strbeg);
801 last1 = last =
802 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
803 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
804 last = HOP3c(t, prog->float_max_offset, strend);
805 s = HOP3c(t, prog->float_min_offset, strend);
806 if (s < other_last)
807 s = other_last;
2c2d71f5 808 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
809 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
810 /* fbm_instr() takes into account exact value of end-of-str
811 if the check is SvTAIL(ed). Since false positives are OK,
812 and end-of-str is not later than strend we are OK. */
813 if (must == &PL_sv_undef) {
814 s = (char*)NULL;
1de06328 815 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
816 }
817 else
2c2d71f5 818 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
819 (unsigned char*)last + SvCUR(must)
820 - (SvTAIL(must)!=0),
7fba1cd6 821 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
822 DEBUG_EXECUTE_r({
823 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
824 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
825 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 826 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
827 quoted, RE_SV_TAIL(must));
828 });
33b8afdf
JH
829 if (!s) {
830 if (last1 == last) {
a3621e74 831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
832 ", giving up...\n"));
833 goto fail_finish;
2c2d71f5 834 }
a3621e74 835 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 836 ", trying anchored starting at offset %ld...\n",
be8e71aa 837 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
838 other_last = last;
839 s = HOP3c(t, 1, strend);
840 goto restart;
841 }
842 else {
a3621e74 843 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
844 (long)(s - i_strpos)));
845 other_last = s; /* Fix this later. --Hugo */
be8e71aa 846 s = saved_s;
33b8afdf
JH
847 if (t == strpos)
848 goto try_at_start;
849 goto try_at_offset;
850 }
2c2d71f5 851 }
cad2e5aa 852 }
2c2d71f5 853
1de06328 854
9ef43ace 855 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 856
6bda09f9 857 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
858 PerlIO_printf(Perl_debug_log,
859 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
860 (IV)prog->check_offset_min,
861 (IV)prog->check_offset_max,
862 (IV)(s-strpos),
863 (IV)(t-strpos),
864 (IV)(t-s),
865 (IV)(strend-strpos)
866 )
867 );
868
2c2d71f5 869 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 870 && (!do_utf8
9ef43ace 871 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
872 && t > strpos)))
873 {
2c2d71f5
JH
874 /* Fixed substring is found far enough so that the match
875 cannot start at strpos. */
876 try_at_offset:
cad2e5aa 877 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
878 /* Eventually fbm_*() should handle this, but often
879 anchored_offset is not 0, so this check will not be wasted. */
880 /* XXXX In the code below we prefer to look for "^" even in
881 presence of anchored substrings. And we search even
882 beyond the found float position. These pessimizations
883 are historical artefacts only. */
884 find_anchor:
2c2d71f5 885 while (t < strend - prog->minlen) {
cad2e5aa 886 if (*t == '\n') {
4ee3650e 887 if (t < check_at - prog->check_offset_min) {
33b8afdf 888 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
889 /* Since we moved from the found position,
890 we definitely contradict the found anchored
30944b6d
IZ
891 substr. Due to the above check we do not
892 contradict "check" substr.
893 Thus we can arrive here only if check substr
894 is float. Redo checking for "other"=="fixed".
895 */
9041c2e3 896 strpos = t + 1;
a3621e74 897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 898 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
899 goto do_other_anchored;
900 }
4ee3650e
GS
901 /* We don't contradict the found floating substring. */
902 /* XXXX Why not check for STCLASS? */
cad2e5aa 903 s = t + 1;
a3621e74 904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 905 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
906 goto set_useful;
907 }
4ee3650e
GS
908 /* Position contradicts check-string */
909 /* XXXX probably better to look for check-string
910 than for "\n", so one should lower the limit for t? */
a3621e74 911 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 912 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 913 other_last = strpos = s = t + 1;
cad2e5aa
JH
914 goto restart;
915 }
916 t++;
917 }
a3621e74 918 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 919 PL_colors[0], PL_colors[1]));
2c2d71f5 920 goto fail_finish;
cad2e5aa 921 }
f5952150 922 else {
a3621e74 923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 924 PL_colors[0], PL_colors[1]));
f5952150 925 }
cad2e5aa
JH
926 s = t;
927 set_useful:
33b8afdf 928 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
929 }
930 else {
f5952150 931 /* The found string does not prohibit matching at strpos,
2c2d71f5 932 - no optimization of calling REx engine can be performed,
f5952150
GS
933 unless it was an MBOL and we are not after MBOL,
934 or a future STCLASS check will fail this. */
2c2d71f5
JH
935 try_at_start:
936 /* Even in this situation we may use MBOL flag if strpos is offset
937 wrt the start of the string. */
05b4157f 938 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 939 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 940 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 941 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 942 {
cad2e5aa
JH
943 t = strpos;
944 goto find_anchor;
945 }
a3621e74 946 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 947 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 948 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 949 );
2c2d71f5 950 success_at_start:
bbe252da 951 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
952 && (do_utf8 ? (
953 prog->check_utf8 /* Could be deleted already */
954 && --BmUSEFUL(prog->check_utf8) < 0
955 && (prog->check_utf8 == prog->float_utf8)
956 ) : (
957 prog->check_substr /* Could be deleted already */
958 && --BmUSEFUL(prog->check_substr) < 0
959 && (prog->check_substr == prog->float_substr)
960 )))
66e933ab 961 {
cad2e5aa 962 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 963 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
ef8d46e8 964 /* XXX Does the destruction order has to change with do_utf8? */
33b8afdf 965 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
ef8d46e8 966 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
967 prog->check_substr = prog->check_utf8 = NULL; /* disable */
968 prog->float_substr = prog->float_utf8 = NULL; /* clear */
969 check = NULL; /* abort */
cad2e5aa 970 s = strpos;
3cf5c195
IZ
971 /* XXXX This is a remnant of the old implementation. It
972 looks wasteful, since now INTUIT can use many
6eb5f6b9 973 other heuristics. */
bbe252da 974 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
975 }
976 else
977 s = strpos;
978 }
979
6eb5f6b9
JH
980 /* Last resort... */
981 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
982 /* trie stclasses are too expensive to use here, we are better off to
983 leave it to regmatch itself */
f8fc2ecf 984 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
985 /* minlen == 0 is possible if regstclass is \b or \B,
986 and the fixed substr is ''$.
987 Since minlen is already taken into account, s+1 is before strend;
988 accidentally, minlen >= 1 guaranties no false positives at s + 1
989 even for \b or \B. But (minlen? 1 : 0) below assumes that
990 regstclass does not come from lookahead... */
991 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
992 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
993 const U8* const str = (U8*)STRING(progi->regstclass);
994 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
995 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 996 : 1);
1de06328
YO
997 char * endpos;
998 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
999 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1000 else if (prog->float_substr || prog->float_utf8)
1001 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1002 else
1003 endpos= strend;
1004
70685ca0
JH
1005 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1006 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 1007
6eb5f6b9 1008 t = s;
f8fc2ecf 1009 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
1010 if (!s) {
1011#ifdef DEBUGGING
cbbf8932 1012 const char *what = NULL;
6eb5f6b9
JH
1013#endif
1014 if (endpos == strend) {
a3621e74 1015 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1016 "Could not match STCLASS...\n") );
1017 goto fail;
1018 }
a3621e74 1019 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1020 "This position contradicts STCLASS...\n") );
bbe252da 1021 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1022 goto fail;
6eb5f6b9 1023 /* Contradict one of substrings */
33b8afdf
JH
1024 if (prog->anchored_substr || prog->anchored_utf8) {
1025 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1026 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1027 hop_and_restart:
1aa99e6b 1028 s = HOP3c(t, 1, strend);
66e933ab
GS
1029 if (s + start_shift + end_shift > strend) {
1030 /* XXXX Should be taken into account earlier? */
a3621e74 1031 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1032 "Could not match STCLASS...\n") );
1033 goto fail;
1034 }
5e39e1e5
HS
1035 if (!check)
1036 goto giveup;
a3621e74 1037 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1038 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1039 what, (long)(s + start_shift - i_strpos)) );
1040 goto restart;
1041 }
66e933ab 1042 /* Have both, check_string is floating */
6eb5f6b9
JH
1043 if (t + start_shift >= check_at) /* Contradicts floating=check */
1044 goto retry_floating_check;
1045 /* Recheck anchored substring, but not floating... */
9041c2e3 1046 s = check_at;
5e39e1e5
HS
1047 if (!check)
1048 goto giveup;
a3621e74 1049 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1050 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1051 (long)(other_last - i_strpos)) );
1052 goto do_other_anchored;
1053 }
60e71179
GS
1054 /* Another way we could have checked stclass at the
1055 current position only: */
1056 if (ml_anch) {
1057 s = t = t + 1;
5e39e1e5
HS
1058 if (!check)
1059 goto giveup;
a3621e74 1060 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1061 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1062 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1063 goto try_at_offset;
66e933ab 1064 }
33b8afdf 1065 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1066 goto fail;
6eb5f6b9
JH
1067 /* Check is floating subtring. */
1068 retry_floating_check:
1069 t = check_at - start_shift;
a3621e74 1070 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1071 goto hop_and_restart;
1072 }
b7953727 1073 if (t != s) {
a3621e74 1074 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1075 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1076 (long)(t - i_strpos), (long)(s - i_strpos))
1077 );
1078 }
1079 else {
a3621e74 1080 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1081 "Does not contradict STCLASS...\n");
1082 );
1083 }
6eb5f6b9 1084 }
5e39e1e5 1085 giveup:
a3621e74 1086 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1087 PL_colors[4], (check ? "Guessed" : "Giving up"),
1088 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1089 return s;
2c2d71f5
JH
1090
1091 fail_finish: /* Substring not found */
33b8afdf
JH
1092 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1093 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1094 fail:
a3621e74 1095 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1096 PL_colors[4], PL_colors[5]));
bd61b366 1097 return NULL;
cad2e5aa 1098}
9661b544 1099
a0a388a1
YO
1100#define DECL_TRIE_TYPE(scan) \
1101 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1102 trie_type = (scan->flags != EXACT) \
1103 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1104 : (do_utf8 ? trie_utf8 : trie_plain)
3b0527fe 1105
55eed653
NC
1106#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1107uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1108 switch (trie_type) { \
1109 case trie_utf8_fold: \
1110 if ( foldlen>0 ) { \
0abd0d78 1111 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1112 foldlen -= len; \
1113 uscan += len; \
1114 len=0; \
1115 } else { \
0abd0d78 1116 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1117 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1118 foldlen -= UNISKIP( uvc ); \
1119 uscan = foldbuf + UNISKIP( uvc ); \
1120 } \
1121 break; \
a0a388a1
YO
1122 case trie_latin_utf8_fold: \
1123 if ( foldlen>0 ) { \
1124 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1125 foldlen -= len; \
1126 uscan += len; \
1127 len=0; \
1128 } else { \
1129 len = 1; \
1130 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1131 foldlen -= UNISKIP( uvc ); \
1132 uscan = foldbuf + UNISKIP( uvc ); \
1133 } \
1134 break; \
4cadc6a9
YO
1135 case trie_utf8: \
1136 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1137 break; \
1138 case trie_plain: \
1139 uvc = (UV)*uc; \
1140 len = 1; \
1141 } \
4cadc6a9
YO
1142 if (uvc < 256) { \
1143 charid = trie->charmap[ uvc ]; \
1144 } \
1145 else { \
1146 charid = 0; \
55eed653
NC
1147 if (widecharmap) { \
1148 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1149 (char*)&uvc, sizeof(UV), 0); \
1150 if (svpp) \
1151 charid = (U16)SvIV(*svpp); \
1152 } \
1153 } \
1154} STMT_END
1155
a0a388a1
YO
1156#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1157{ \
1158 char *my_strend= (char *)strend; \
4cadc6a9
YO
1159 if ( (CoNd) \
1160 && (ln == len || \
a0a388a1 1161 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
4cadc6a9 1162 m, NULL, ln, (bool)UTF)) \
a0a388a1 1163 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1164 goto got_it; \
1165 else { \
1166 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1167 uvchr_to_utf8(tmpbuf, c); \
1168 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1169 if ( f != c \
1170 && (f == c1 || f == c2) \
a0a388a1
YO
1171 && (ln == len || \
1172 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1173 m, NULL, ln, (bool)UTF)) \
1174 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1175 goto got_it; \
1176 } \
a0a388a1
YO
1177} \
1178s += len
4cadc6a9
YO
1179
1180#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1181STMT_START { \
1182 while (s <= e) { \
1183 if ( (CoNd) \
1184 && (ln == 1 || !(OP(c) == EXACTF \
1185 ? ibcmp(s, m, ln) \
1186 : ibcmp_locale(s, m, ln))) \
24b23f37 1187 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1188 goto got_it; \
1189 s++; \
1190 } \
1191} STMT_END
1192
1193#define REXEC_FBC_UTF8_SCAN(CoDe) \
1194STMT_START { \
1195 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1196 CoDe \
1197 s += uskip; \
1198 } \
1199} STMT_END
1200
1201#define REXEC_FBC_SCAN(CoDe) \
1202STMT_START { \
1203 while (s < strend) { \
1204 CoDe \
1205 s++; \
1206 } \
1207} STMT_END
1208
1209#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1210REXEC_FBC_UTF8_SCAN( \
1211 if (CoNd) { \
24b23f37 1212 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1213 goto got_it; \
1214 else \
1215 tmp = doevery; \
1216 } \
1217 else \
1218 tmp = 1; \
1219)
1220
1221#define REXEC_FBC_CLASS_SCAN(CoNd) \
1222REXEC_FBC_SCAN( \
1223 if (CoNd) { \
24b23f37 1224 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1225 goto got_it; \
1226 else \
1227 tmp = doevery; \
1228 } \
1229 else \
1230 tmp = 1; \
1231)
1232
1233#define REXEC_FBC_TRYIT \
24b23f37 1234if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1235 goto got_it
1236
e1d1eefb
YO
1237#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1238 if (do_utf8) { \
1239 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1240 } \
1241 else { \
1242 REXEC_FBC_CLASS_SCAN(CoNd); \
1243 } \
1244 break
1245
4cadc6a9
YO
1246#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1247 if (do_utf8) { \
1248 UtFpReLoAd; \
1249 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1250 } \
1251 else { \
1252 REXEC_FBC_CLASS_SCAN(CoNd); \
1253 } \
1254 break
1255
1256#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1257 PL_reg_flags |= RF_tainted; \
1258 if (do_utf8) { \
1259 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1260 } \
1261 else { \
1262 REXEC_FBC_CLASS_SCAN(CoNd); \
1263 } \
1264 break
1265
786e8c11
YO
1266#define DUMP_EXEC_POS(li,s,doutf8) \
1267 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1268
1269/* We know what class REx starts with. Try to find this position... */
1270/* if reginfo is NULL, its a dryrun */
1271/* annoyingly all the vars in this routine have different names from their counterparts
1272 in regmatch. /grrr */
1273
3c3eec57 1274STATIC char *
07be1b83 1275S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1276 const char *strend, regmatch_info *reginfo)
a687059c 1277{
27da23d5 1278 dVAR;
bbe252da 1279 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1280 char *m;
d8093b23 1281 STRLEN ln;
5dab1207 1282 STRLEN lnc;
078c425b 1283 register STRLEN uskip;
d8093b23
G
1284 unsigned int c1;
1285 unsigned int c2;
6eb5f6b9
JH
1286 char *e;
1287 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1288 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf 1289 RXi_GET_DECL(prog,progi);
7918f24d
NC
1290
1291 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1292
6eb5f6b9
JH
1293 /* We know what class it must start with. */
1294 switch (OP(c)) {
6eb5f6b9 1295 case ANYOF:
388cc4de 1296 if (do_utf8) {
4cadc6a9 1297 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1298 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1299 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1300 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1301 }
1302 else {
1303 while (s < strend) {
1304 STRLEN skip = 1;
1305
32fc9b6a 1306 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1307 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1308 /* The assignment of 2 is intentional:
1309 * for the folded sharp s, the skip is 2. */
1310 (skip = SHARP_S_SKIP))) {
24b23f37 1311 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1312 goto got_it;
1313 else
1314 tmp = doevery;
1315 }
1316 else
1317 tmp = 1;
1318 s += skip;
1319 }
a0d0e21e 1320 }
6eb5f6b9 1321 break;
f33976b4 1322 case CANY:
4cadc6a9 1323 REXEC_FBC_SCAN(
24b23f37 1324 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1325 goto got_it;
1326 else
1327 tmp = doevery;
4cadc6a9 1328 );
f33976b4 1329 break;
6eb5f6b9 1330 case EXACTF:
5dab1207
NIS
1331 m = STRING(c);
1332 ln = STR_LEN(c); /* length to match in octets/bytes */
1333 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1334 if (UTF) {
a2a2844f 1335 STRLEN ulen1, ulen2;
5dab1207 1336 U8 *sm = (U8 *) m;
89ebb4a3
JH
1337 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1338 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1339 /* used by commented-out code below */
1340 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1341
1342 /* XXX: Since the node will be case folded at compile
1343 time this logic is a little odd, although im not
1344 sure that its actually wrong. --dmq */
1345
1346 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1347 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1348
1349 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1350 codepoint of the first character in the converted
1351 form, yet originally we did the extra step.
1352 No tests fail by commenting this code out however
1353 so Ive left it out. -- dmq.
1354
89ebb4a3 1355 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1356 0, uniflags);
89ebb4a3 1357 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1358 0, uniflags);
a0a388a1
YO
1359 */
1360
5dab1207
NIS
1361 lnc = 0;
1362 while (sm < ((U8 *) m + ln)) {
1363 lnc++;
1364 sm += UTF8SKIP(sm);
1365 }
1aa99e6b
IH
1366 }
1367 else {
1368 c1 = *(U8*)m;
1369 c2 = PL_fold[c1];
1370 }
6eb5f6b9
JH
1371 goto do_exactf;
1372 case EXACTFL:
5dab1207
NIS
1373 m = STRING(c);
1374 ln = STR_LEN(c);
1375 lnc = (I32) ln;
d8093b23 1376 c1 = *(U8*)m;
6eb5f6b9
JH
1377 c2 = PL_fold_locale[c1];
1378 do_exactf:
db12adc6 1379 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1380
3b0527fe 1381 if (!reginfo && e < s)
6eb5f6b9 1382 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1383
60a8b682
JH
1384 /* The idea in the EXACTF* cases is to first find the
1385 * first character of the EXACTF* node and then, if
1386 * necessary, case-insensitively compare the full
1387 * text of the node. The c1 and c2 are the first
1388 * characters (though in Unicode it gets a bit
1389 * more complicated because there are more cases
7f16dd3d
JH
1390 * than just upper and lower: one needs to use
1391 * the so-called folding case for case-insensitive
1392 * matching (called "loose matching" in Unicode).
1393 * ibcmp_utf8() will do just that. */
60a8b682 1394
a0a388a1 1395 if (do_utf8 || UTF) {
575cac57 1396 UV c, f;
89ebb4a3 1397 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1398 STRLEN len = 1;
1399 STRLEN foldlen;
4ad0818d 1400 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1401 if (c1 == c2) {
5dab1207
NIS
1402 /* Upper and lower of 1st char are equal -
1403 * probably not a "letter". */
1aa99e6b 1404 while (s <= e) {
a0a388a1
YO
1405 if (do_utf8) {
1406 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1407 uniflags);
a0a388a1
YO
1408 } else {
1409 c = *((U8*)s);
1410 }
4cadc6a9 1411 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1412 }
09091399
JH
1413 }
1414 else {
1aa99e6b 1415 while (s <= e) {
a0a388a1
YO
1416 if (do_utf8) {
1417 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1418 uniflags);
a0a388a1
YO
1419 } else {
1420 c = *((U8*)s);
1421 }
80aecb99 1422
60a8b682 1423 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1424 * Note that not all the possible combinations
1425 * are handled here: some of them are handled
1426 * by the standard folding rules, and some of
1427 * them (the character class or ANYOF cases)
1428 * are handled during compiletime in
1429 * regexec.c:S_regclass(). */
880bd946
JH
1430 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1431 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1432 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1433
4cadc6a9 1434 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1435 }
09091399 1436 }
1aa99e6b
IH
1437 }
1438 else {
a0a388a1 1439 /* Neither pattern nor string are UTF8 */
1aa99e6b 1440 if (c1 == c2)
4cadc6a9 1441 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1442 else
4cadc6a9 1443 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1444 }
1445 break;
bbce6d69 1446 case BOUNDL:
3280af22 1447 PL_reg_flags |= RF_tainted;
bbce6d69 1448 /* FALL THROUGH */
a0d0e21e 1449 case BOUND:
ffc61ed2 1450 if (do_utf8) {
12d33761 1451 if (s == PL_bostr)
ffc61ed2
JH
1452 tmp = '\n';
1453 else {
6136c704 1454 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1455 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1456 }
1457 tmp = ((OP(c) == BOUND ?
9041c2e3 1458 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1459 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1460 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1461 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1462 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1463 isALNUM_LC_utf8((U8*)s)))
1464 {
1465 tmp = !tmp;
4cadc6a9 1466 REXEC_FBC_TRYIT;
a687059c 1467 }
4cadc6a9 1468 );
a0d0e21e 1469 }
667bb95a 1470 else {
12d33761 1471 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1472 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1473 REXEC_FBC_SCAN(
ffc61ed2
JH
1474 if (tmp ==
1475 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1476 tmp = !tmp;
4cadc6a9 1477 REXEC_FBC_TRYIT;
a0ed51b3 1478 }
4cadc6a9 1479 );
a0ed51b3 1480 }
24b23f37 1481 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1482 goto got_it;
1483 break;
bbce6d69 1484 case NBOUNDL:
3280af22 1485 PL_reg_flags |= RF_tainted;
bbce6d69 1486 /* FALL THROUGH */
a0d0e21e 1487 case NBOUND:
ffc61ed2 1488 if (do_utf8) {
12d33761 1489 if (s == PL_bostr)
ffc61ed2
JH
1490 tmp = '\n';
1491 else {
6136c704 1492 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1493 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1494 }
1495 tmp = ((OP(c) == NBOUND ?
9041c2e3 1496 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1497 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1498 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1499 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1500 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1501 isALNUM_LC_utf8((U8*)s)))
1502 tmp = !tmp;
4cadc6a9
YO
1503 else REXEC_FBC_TRYIT;
1504 );
a0d0e21e 1505 }
667bb95a 1506 else {
12d33761 1507 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1508 tmp = ((OP(c) == NBOUND ?
1509 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1510 REXEC_FBC_SCAN(
ffc61ed2
JH
1511 if (tmp ==
1512 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1513 tmp = !tmp;
4cadc6a9
YO
1514 else REXEC_FBC_TRYIT;
1515 );
a0ed51b3 1516 }
24b23f37 1517 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1518 goto got_it;
1519 break;
a0d0e21e 1520 case ALNUM:
4cadc6a9 1521 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1522 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1523 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
4cadc6a9
YO
1524 isALNUM(*s)
1525 );
bbce6d69 1526 case ALNUML:
4cadc6a9
YO
1527 REXEC_FBC_CSCAN_TAINT(
1528 isALNUM_LC_utf8((U8*)s),
1529 isALNUM_LC(*s)
1530 );
a0d0e21e 1531 case NALNUM:
4cadc6a9 1532 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1533 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1534 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
4cadc6a9
YO
1535 !isALNUM(*s)
1536 );
bbce6d69 1537 case NALNUML:
4cadc6a9
YO
1538 REXEC_FBC_CSCAN_TAINT(
1539 !isALNUM_LC_utf8((U8*)s),
1540 !isALNUM_LC(*s)
1541 );
a0d0e21e 1542 case SPACE:
4cadc6a9 1543 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1544 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1545 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
4cadc6a9
YO
1546 isSPACE(*s)
1547 );
bbce6d69 1548 case SPACEL:
4cadc6a9
YO
1549 REXEC_FBC_CSCAN_TAINT(
1550 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1551 isSPACE_LC(*s)
1552 );
a0d0e21e 1553 case NSPACE:
4cadc6a9 1554 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1555 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1556 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
4cadc6a9
YO
1557 !isSPACE(*s)
1558 );
bbce6d69 1559 case NSPACEL:
4cadc6a9
YO
1560 REXEC_FBC_CSCAN_TAINT(
1561 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1562 !isSPACE_LC(*s)
1563 );
a0d0e21e 1564 case DIGIT:
4cadc6a9 1565 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1566 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1567 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
4cadc6a9
YO
1568 isDIGIT(*s)
1569 );
b8c5462f 1570 case DIGITL:
4cadc6a9
YO
1571 REXEC_FBC_CSCAN_TAINT(
1572 isDIGIT_LC_utf8((U8*)s),
1573 isDIGIT_LC(*s)
1574 );
a0d0e21e 1575 case NDIGIT:
4cadc6a9 1576 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1577 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1578 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
4cadc6a9
YO
1579 !isDIGIT(*s)
1580 );
b8c5462f 1581 case NDIGITL:
4cadc6a9
YO
1582 REXEC_FBC_CSCAN_TAINT(
1583 !isDIGIT_LC_utf8((U8*)s),
1584 !isDIGIT_LC(*s)
1585 );
e1d1eefb
YO
1586 case LNBREAK:
1587 REXEC_FBC_CSCAN(
1588 is_LNBREAK_utf8(s),
1589 is_LNBREAK_latin1(s)
1590 );
1591 case VERTWS:
1592 REXEC_FBC_CSCAN(
1593 is_VERTWS_utf8(s),
1594 is_VERTWS_latin1(s)
1595 );
1596 case NVERTWS:
1597 REXEC_FBC_CSCAN(
1598 !is_VERTWS_utf8(s),
1599 !is_VERTWS_latin1(s)
1600 );
1601 case HORIZWS:
1602 REXEC_FBC_CSCAN(
1603 is_HORIZWS_utf8(s),
1604 is_HORIZWS_latin1(s)
1605 );
1606 case NHORIZWS:
1607 REXEC_FBC_CSCAN(
1608 !is_HORIZWS_utf8(s),
1609 !is_HORIZWS_latin1(s)
1610 );
1de06328
YO
1611 case AHOCORASICKC:
1612 case AHOCORASICK:
07be1b83 1613 {
a0a388a1 1614 DECL_TRIE_TYPE(c);
07be1b83
YO
1615 /* what trie are we using right now */
1616 reg_ac_data *aho
f8fc2ecf 1617 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1618 reg_trie_data *trie
1619 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1620 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1621
1622 const char *last_start = strend - trie->minlen;
6148ee25 1623#ifdef DEBUGGING
07be1b83 1624 const char *real_start = s;
6148ee25 1625#endif
07be1b83 1626 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1627 SV *sv_points;
1628 U8 **points; /* map of where we were in the input string
786e8c11 1629 when reading a given char. For ASCII this
be8e71aa 1630 is unnecessary overhead as the relationship
38a44b82
NC
1631 is always 1:1, but for Unicode, especially
1632 case folded Unicode this is not true. */
f9e705e8 1633 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1634 U8 *bitmap=NULL;
1635
07be1b83
YO
1636
1637 GET_RE_DEBUG_FLAGS_DECL;
1638
be8e71aa
YO
1639 /* We can't just allocate points here. We need to wrap it in
1640 * an SV so it gets freed properly if there is a croak while
1641 * running the match */
1642 ENTER;
1643 SAVETMPS;
1644 sv_points=newSV(maxlen * sizeof(U8 *));
1645 SvCUR_set(sv_points,
1646 maxlen * sizeof(U8 *));
1647 SvPOK_on(sv_points);
1648 sv_2mortal(sv_points);
1649 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1650 if ( trie_type != trie_utf8_fold
1651 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1652 {
786e8c11
YO
1653 if (trie->bitmap)
1654 bitmap=(U8*)trie->bitmap;
1655 else
1656 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1657 }
786e8c11
YO
1658 /* this is the Aho-Corasick algorithm modified a touch
1659 to include special handling for long "unknown char"
1660 sequences. The basic idea being that we use AC as long
1661 as we are dealing with a possible matching char, when
1662 we encounter an unknown char (and we have not encountered
1663 an accepting state) we scan forward until we find a legal
1664 starting char.
1665 AC matching is basically that of trie matching, except
1666 that when we encounter a failing transition, we fall back
1667 to the current states "fail state", and try the current char
1668 again, a process we repeat until we reach the root state,
1669 state 1, or a legal transition. If we fail on the root state
1670 then we can either terminate if we have reached an accepting
1671 state previously, or restart the entire process from the beginning
1672 if we have not.
1673
1674 */
07be1b83
YO
1675 while (s <= last_start) {
1676 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1677 U8 *uc = (U8*)s;
1678 U16 charid = 0;
1679 U32 base = 1;
1680 U32 state = 1;
1681 UV uvc = 0;
1682 STRLEN len = 0;
1683 STRLEN foldlen = 0;
1684 U8 *uscan = (U8*)NULL;
1685 U8 *leftmost = NULL;
786e8c11
YO
1686#ifdef DEBUGGING
1687 U32 accepted_word= 0;
1688#endif
07be1b83
YO
1689 U32 pointpos = 0;
1690
1691 while ( state && uc <= (U8*)strend ) {
1692 int failed=0;
786e8c11
YO
1693 U32 word = aho->states[ state ].wordnum;
1694
1de06328
YO
1695 if( state==1 ) {
1696 if ( bitmap ) {
1697 DEBUG_TRIE_EXECUTE_r(
1698 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1699 dump_exec_pos( (char *)uc, c, strend, real_start,
1700 (char *)uc, do_utf8 );
1701 PerlIO_printf( Perl_debug_log,
1702 " Scanning for legal start char...\n");
1703 }
1704 );
1705 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1706 uc++;
786e8c11 1707 }
1de06328 1708 s= (char *)uc;
786e8c11 1709 }
786e8c11
YO
1710 if (uc >(U8*)last_start) break;
1711 }
1712
1713 if ( word ) {
1714 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1715 if (!leftmost || lpos < leftmost) {
1716 DEBUG_r(accepted_word=word);
07be1b83 1717 leftmost= lpos;
786e8c11 1718 }
07be1b83 1719 if (base==0) break;
786e8c11 1720
07be1b83
YO
1721 }
1722 points[pointpos++ % maxlen]= uc;
55eed653
NC
1723 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1724 uscan, len, uvc, charid, foldlen,
1725 foldbuf, uniflags);
786e8c11
YO
1726 DEBUG_TRIE_EXECUTE_r({
1727 dump_exec_pos( (char *)uc, c, strend, real_start,
1728 s, do_utf8 );
07be1b83 1729 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1730 " Charid:%3u CP:%4"UVxf" ",
1731 charid, uvc);
1732 });
07be1b83
YO
1733
1734 do {
6148ee25 1735#ifdef DEBUGGING
786e8c11 1736 word = aho->states[ state ].wordnum;
6148ee25 1737#endif
07be1b83
YO
1738 base = aho->states[ state ].trans.base;
1739
786e8c11
YO
1740 DEBUG_TRIE_EXECUTE_r({
1741 if (failed)
1742 dump_exec_pos( (char *)uc, c, strend, real_start,
1743 s, do_utf8 );
07be1b83 1744 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1745 "%sState: %4"UVxf", word=%"UVxf,
1746 failed ? " Fail transition to " : "",
1747 (UV)state, (UV)word);
1748 });
07be1b83
YO
1749 if ( base ) {
1750 U32 tmp;
1751 if (charid &&
1752 (base + charid > trie->uniquecharcount )
1753 && (base + charid - 1 - trie->uniquecharcount
1754 < trie->lasttrans)
1755 && trie->trans[base + charid - 1 -
1756 trie->uniquecharcount].check == state
1757 && (tmp=trie->trans[base + charid - 1 -
1758 trie->uniquecharcount ].next))
1759 {
786e8c11
YO
1760 DEBUG_TRIE_EXECUTE_r(
1761 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1762 state = tmp;
1763 break;
1764 }
1765 else {
786e8c11
YO
1766 DEBUG_TRIE_EXECUTE_r(
1767 PerlIO_printf( Perl_debug_log," - fail\n"));
1768 failed = 1;
1769 state = aho->fail[state];
07be1b83
YO
1770 }
1771 }
1772 else {
1773 /* we must be accepting here */
786e8c11
YO
1774 DEBUG_TRIE_EXECUTE_r(
1775 PerlIO_printf( Perl_debug_log," - accepting\n"));
1776 failed = 1;
07be1b83
YO
1777 break;
1778 }
1779 } while(state);
786e8c11 1780 uc += len;
07be1b83
YO
1781 if (failed) {
1782 if (leftmost)
1783 break;
786e8c11 1784 if (!state) state = 1;
07be1b83
YO
1785 }
1786 }
1787 if ( aho->states[ state ].wordnum ) {
1788 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1789 if (!leftmost || lpos < leftmost) {
1790 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1791 leftmost = lpos;
786e8c11 1792 }
07be1b83 1793 }
07be1b83
YO
1794 if (leftmost) {
1795 s = (char*)leftmost;
786e8c11
YO
1796 DEBUG_TRIE_EXECUTE_r({
1797 PerlIO_printf(
70685ca0
JH
1798 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1799 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1800 );
1801 });
24b23f37 1802 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1803 FREETMPS;
1804 LEAVE;
07be1b83 1805 goto got_it;
be8e71aa 1806 }
07be1b83 1807 s = HOPc(s,1);
786e8c11
YO
1808 DEBUG_TRIE_EXECUTE_r({
1809 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1810 });
07be1b83 1811 } else {
786e8c11
YO
1812 DEBUG_TRIE_EXECUTE_r(
1813 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1814 break;
1815 }
1816 }
be8e71aa
YO
1817 FREETMPS;
1818 LEAVE;
07be1b83
YO
1819 }
1820 break;
b3c9acc1 1821 default:
3c3eec57
GS
1822 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1823 break;
d6a28714 1824 }
6eb5f6b9
JH
1825 return 0;
1826 got_it:
1827 return s;
1828}
1829
fae667d5 1830
6eb5f6b9
JH
1831/*
1832 - regexec_flags - match a regexp against a string
1833 */
1834I32
288b8c02 1835Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1836 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1837/* strend: pointer to null at end of string */
1838/* strbeg: real beginning of string */
1839/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1840/* data: May be used for some additional optimizations.
1841 Currently its only used, with a U32 cast, for transmitting
1842 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1843/* nosave: For optimizations. */
1844{
97aff369 1845 dVAR;
288b8c02 1846 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1847 /*register*/ char *s;
6eb5f6b9 1848 register regnode *c;
24b23f37 1849 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1850 I32 minlen; /* must match at least this many chars */
1851 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1852 I32 end_shift = 0; /* Same for the end. */ /* CC */
1853 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1854 char *scream_olds = NULL;
f9f4320a 1855 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1856 I32 multiline;
f8fc2ecf 1857 RXi_GET_DECL(prog,progi);
3b0527fe 1858 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 1859 regexp_paren_pair *swap = NULL;
a3621e74
YO
1860 GET_RE_DEBUG_FLAGS_DECL;
1861
7918f24d 1862 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1863 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1864
1865 /* Be paranoid... */
1866 if (prog == NULL || startpos == NULL) {
1867 Perl_croak(aTHX_ "NULL regexp parameter");
1868 return 0;
1869 }
1870
bbe252da 1871 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1872 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1873
288b8c02 1874 RX_MATCH_UTF8_set(rx, do_utf8);
1de06328 1875 DEBUG_EXECUTE_r(
efd26800 1876 debug_start_match(rx, do_utf8, startpos, strend,
1de06328
YO
1877 "Matching");
1878 );
bac06658 1879
6eb5f6b9 1880 minlen = prog->minlen;
1de06328
YO
1881
1882 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1883 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1884 "String too short [regexec_flags]...\n"));
1885 goto phooey;
1aa99e6b 1886 }
6eb5f6b9 1887
1de06328 1888
6eb5f6b9 1889 /* Check validity of program. */
f8fc2ecf 1890 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1891 Perl_croak(aTHX_ "corrupted regexp program");
1892 }
1893
1894 PL_reg_flags = 0;
1895 PL_reg_eval_set = 0;
1896 PL_reg_maxiter = 0;
1897
3c8556c3 1898 if (RX_UTF8(rx))
6eb5f6b9
JH
1899 PL_reg_flags |= RF_utf8;
1900
1901 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1902 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1903 PL_bostr = strbeg;
3b0527fe 1904 reginfo.sv = sv;
6eb5f6b9
JH
1905
1906 /* Mark end of line for $ (and such) */
1907 PL_regeol = strend;
1908
1909 /* see how far we have to get to not match where we matched before */
3b0527fe 1910 reginfo.till = startpos+minend;
6eb5f6b9 1911
6eb5f6b9
JH
1912 /* If there is a "must appear" string, look for it. */
1913 s = startpos;
1914
bbe252da 1915 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 1916 MAGIC *mg;
2c296965 1917 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 1918 reginfo.ganch = startpos + prog->gofs;
2c296965 1919 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1920 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 1921 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 1922 && SvMAGIC(sv)
14befaf4
DM
1923 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1924 && mg->mg_len >= 0) {
3b0527fe 1925 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 1926 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1927 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 1928
bbe252da 1929 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1930 if (s > reginfo.ganch)
6eb5f6b9 1931 goto phooey;
58e23c8d 1932 s = reginfo.ganch - prog->gofs;
2c296965 1933 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1934 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
1935 if (s < strbeg)
1936 goto phooey;
6eb5f6b9
JH
1937 }
1938 }
58e23c8d 1939 else if (data) {
70685ca0 1940 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
1941 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1942 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1943
1944 } else { /* pos() not defined */
3b0527fe 1945 reginfo.ganch = strbeg;
2c296965
YO
1946 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1947 "GPOS: reginfo.ganch = strbeg\n"));
1948 }
6eb5f6b9 1949 }
288b8c02 1950 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
1951 /* We have to be careful. If the previous successful match
1952 was from this regex we don't want a subsequent partially
1953 successful match to clobber the old results.
1954 So when we detect this possibility we add a swap buffer
1955 to the re, and switch the buffer each match. If we fail
1956 we switch it back, otherwise we leave it swapped.
1957 */
1958 swap = prog->offs;
1959 /* do we need a save destructor here for eval dies? */
1960 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 1961 }
a0714e2c 1962 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1963 re_scream_pos_data d;
1964
1965 d.scream_olds = &scream_olds;
1966 d.scream_pos = &scream_pos;
288b8c02 1967 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 1968 if (!s) {
a3621e74 1969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1970 goto phooey; /* not present */
3fa9c3d7 1971 }
6eb5f6b9
JH
1972 }
1973
1de06328 1974
6eb5f6b9
JH
1975
1976 /* Simplest case: anchored match need be tried only once. */
1977 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1978 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1979 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1980 goto got_it;
bbe252da
YO
1981 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1982 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1983 {
1984 char *end;
1985
1986 if (minlen)
1987 dontbother = minlen - 1;
1aa99e6b 1988 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1989 /* for multiline we only have to try after newlines */
33b8afdf 1990 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1991 if (s == startpos)
1992 goto after_try;
1993 while (1) {
24b23f37 1994 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1995 goto got_it;
1996 after_try:
5339e136 1997 if (s > end)
6eb5f6b9 1998 goto phooey;
bbe252da 1999 if (prog->extflags & RXf_USE_INTUIT) {
288b8c02 2000 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
6eb5f6b9
JH
2001 if (!s)
2002 goto phooey;
2003 }
2004 else
2005 s++;
2006 }
2007 } else {
2008 if (s > startpos)
2009 s--;
2010 while (s < end) {
2011 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2012 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2013 goto got_it;
2014 }
2015 }
2016 }
2017 }
2018 goto phooey;
bbe252da 2019 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
2020 {
2021 /* the warning about reginfo.ganch being used without intialization
bbe252da 2022 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2023 and we only enter this block when the same bit is set. */
58e23c8d 2024 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2025
2026 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2027 goto got_it;
2028 goto phooey;
2029 }
2030
2031 /* Messy cases: unanchored match. */
bbe252da 2032 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
2033 /* we have /x+whatever/ */
2034 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 2035 char ch;
bf93d4cc
GS
2036#ifdef DEBUGGING
2037 int did_match = 0;
2038#endif
33b8afdf
JH
2039 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2040 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 2041 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2042
1aa99e6b 2043 if (do_utf8) {
4cadc6a9 2044 REXEC_FBC_SCAN(
6eb5f6b9 2045 if (*s == ch) {
a3621e74 2046 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2047 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2048 s += UTF8SKIP(s);
2049 while (s < strend && *s == ch)
2050 s += UTF8SKIP(s);
2051 }
4cadc6a9 2052 );
6eb5f6b9
JH
2053 }
2054 else {
4cadc6a9 2055 REXEC_FBC_SCAN(
6eb5f6b9 2056 if (*s == ch) {
a3621e74 2057 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2058 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2059 s++;
2060 while (s < strend && *s == ch)
2061 s++;
2062 }
4cadc6a9 2063 );
6eb5f6b9 2064 }
a3621e74 2065 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2066 PerlIO_printf(Perl_debug_log,
b7953727
JH
2067 "Did not find anchored character...\n")
2068 );
6eb5f6b9 2069 }
a0714e2c
SS
2070 else if (prog->anchored_substr != NULL
2071 || prog->anchored_utf8 != NULL
2072 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2073 && prog->float_max_offset < strend - s)) {
2074 SV *must;
2075 I32 back_max;
2076 I32 back_min;
2077 char *last;
6eb5f6b9 2078 char *last1; /* Last position checked before */
bf93d4cc
GS
2079#ifdef DEBUGGING
2080 int did_match = 0;
2081#endif
33b8afdf
JH
2082 if (prog->anchored_substr || prog->anchored_utf8) {
2083 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2084 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2085 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2086 back_max = back_min = prog->anchored_offset;
2087 } else {
2088 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2089 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2090 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2091 back_max = prog->float_max_offset;
2092 back_min = prog->float_min_offset;
2093 }
1de06328
YO
2094
2095
33b8afdf
JH
2096 if (must == &PL_sv_undef)
2097 /* could not downgrade utf8 check substring, so must fail */
2098 goto phooey;
2099
1de06328
YO
2100 if (back_min<0) {
2101 last = strend;
2102 } else {
2103 last = HOP3c(strend, /* Cannot start after this */
2104 -(I32)(CHR_SVLEN(must)
2105 - (SvTAIL(must) != 0) + back_min), strbeg);
2106 }
6eb5f6b9
JH
2107 if (s > PL_bostr)
2108 last1 = HOPc(s, -1);
2109 else
2110 last1 = s - 1; /* bogus */
2111
a0288114 2112 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2113 check_substr==must. */
2114 scream_pos = -1;
2115 dontbother = end_shift;
2116 strend = HOPc(strend, -dontbother);
2117 while ( (s <= last) &&
9041c2e3 2118 ((flags & REXEC_SCREAM)
1de06328 2119 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2120 end_shift, &scream_pos, 0))
1de06328 2121 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2122 (unsigned char*)strend, must,
7fba1cd6 2123 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2124 /* we may be pointing at the wrong string */
07bc277f 2125 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2126 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2127 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2128 if (HOPc(s, -back_max) > last1) {
2129 last1 = HOPc(s, -back_min);
2130 s = HOPc(s, -back_max);
2131 }
2132 else {
52657f30 2133 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2134
2135 last1 = HOPc(s, -back_min);
52657f30 2136 s = t;
6eb5f6b9 2137 }
1aa99e6b 2138 if (do_utf8) {
6eb5f6b9 2139 while (s <= last1) {
24b23f37 2140 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2141 goto got_it;
2142 s += UTF8SKIP(s);
2143 }
2144 }
2145 else {
2146 while (s <= last1) {
24b23f37 2147 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2148 goto got_it;
2149 s++;
2150 }
2151 }
2152 }
ab3bbdeb
YO
2153 DEBUG_EXECUTE_r(if (!did_match) {
2154 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2155 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2156 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2157 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2158 ? "anchored" : "floating"),
ab3bbdeb
YO
2159 quoted, RE_SV_TAIL(must));
2160 });
6eb5f6b9
JH
2161 goto phooey;
2162 }
f8fc2ecf 2163 else if ( (c = progi->regstclass) ) {
f14c76ed 2164 if (minlen) {
f8fc2ecf 2165 const OPCODE op = OP(progi->regstclass);
66e933ab 2166 /* don't bother with what can't match */
786e8c11 2167 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2168 strend = HOPc(strend, -(minlen - 1));
2169 }
a3621e74 2170 DEBUG_EXECUTE_r({
be8e71aa 2171 SV * const prop = sv_newmortal();
32fc9b6a 2172 regprop(prog, prop, c);
0df25f3d 2173 {
02daf0ab 2174 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2175 s,strend-s,60);
0df25f3d 2176 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2177 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2178 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2179 quoted, (int)(strend - s));
0df25f3d 2180 }
ffc61ed2 2181 });
3b0527fe 2182 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2183 goto got_it;
07be1b83 2184 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2185 }
2186 else {
2187 dontbother = 0;
a0714e2c 2188 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2189 /* Trim the end. */
d6a28714 2190 char *last;
33b8afdf
JH
2191 SV* float_real;
2192
2193 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2194 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2195 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2196
2197 if (flags & REXEC_SCREAM) {
33b8afdf 2198 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2199 end_shift, &scream_pos, 1); /* last one */
2200 if (!last)
ffc61ed2 2201 last = scream_olds; /* Only one occurrence. */
4addbd3b 2202 /* we may be pointing at the wrong string */
07bc277f 2203 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2204 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2205 }
d6a28714
JH
2206 else {
2207 STRLEN len;
cfd0369c 2208 const char * const little = SvPV_const(float_real, len);
d6a28714 2209
33b8afdf 2210 if (SvTAIL(float_real)) {
d6a28714
JH
2211 if (memEQ(strend - len + 1, little, len - 1))
2212 last = strend - len + 1;
7fba1cd6 2213 else if (!multiline)
9041c2e3 2214 last = memEQ(strend - len, little, len)
bd61b366 2215 ? strend - len : NULL;
b8c5462f 2216 else
d6a28714
JH
2217 goto find_last;
2218 } else {
2219 find_last:
9041c2e3 2220 if (len)
d6a28714 2221 last = rninstr(s, strend, little, little + len);
b8c5462f 2222 else
a0288114 2223 last = strend; /* matching "$" */
b8c5462f 2224 }
b8c5462f 2225 }
bf93d4cc 2226 if (last == NULL) {
6bda09f9
YO
2227 DEBUG_EXECUTE_r(
2228 PerlIO_printf(Perl_debug_log,
2229 "%sCan't trim the tail, match fails (should not happen)%s\n",
2230 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2231 goto phooey; /* Should not happen! */
2232 }
d6a28714
JH
2233 dontbother = strend - last + prog->float_min_offset;
2234 }
2235 if (minlen && (dontbother < minlen))
2236 dontbother = minlen - 1;
2237 strend -= dontbother; /* this one's always in bytes! */
2238 /* We don't know much -- general case. */
1aa99e6b 2239 if (do_utf8) {
d6a28714 2240 for (;;) {
24b23f37 2241 if (regtry(&reginfo, &s))
d6a28714
JH
2242 goto got_it;
2243 if (s >= strend)
2244 break;
b8c5462f 2245 s += UTF8SKIP(s);
d6a28714
JH
2246 };
2247 }
2248 else {
2249 do {
24b23f37 2250 if (regtry(&reginfo, &s))
d6a28714
JH
2251 goto got_it;
2252 } while (s++ < strend);
2253 }
2254 }
2255
2256 /* Failure. */
2257 goto phooey;
2258
2259got_it:
e9105d30 2260 Safefree(swap);
288b8c02 2261 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2262
19b95bf0 2263 if (PL_reg_eval_set)
4f639d21 2264 restore_pos(aTHX_ prog);
5daac39c
NC
2265 if (RXp_PAREN_NAMES(prog))
2266 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2267
2268 /* make sure $`, $&, $', and $digit will work later */
2269 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2270 RX_MATCH_COPY_FREE(rx);
d6a28714 2271 if (flags & REXEC_COPY_STR) {
be8e71aa 2272 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2273#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2274 if ((SvIsCOW(sv)
2275 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2276 if (DEBUG_C_TEST) {
2277 PerlIO_printf(Perl_debug_log,
2278 "Copy on write: regexp capture, type %d\n",
2279 (int) SvTYPE(sv));
2280 }
2281 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2282 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2283 assert (SvPOKp(prog->saved_copy));
2284 } else
2285#endif
2286 {
288b8c02 2287 RX_MATCH_COPIED_on(rx);
ed252734
NC
2288 s = savepvn(strbeg, i);
2289 prog->subbeg = s;
2290 }
d6a28714 2291 prog->sublen = i;
d6a28714
JH
2292 }
2293 else {
2294 prog->subbeg = strbeg;
2295 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2296 }
2297 }
9041c2e3 2298
d6a28714
JH
2299 return 1;
2300
2301phooey:
a3621e74 2302 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2303 PL_colors[4], PL_colors[5]));
d6a28714 2304 if (PL_reg_eval_set)
4f639d21 2305 restore_pos(aTHX_ prog);
e9105d30 2306 if (swap) {
c74340f9 2307 /* we failed :-( roll it back */
e9105d30
GG
2308 Safefree(prog->offs);
2309 prog->offs = swap;
2310 }
2311
d6a28714
JH
2312 return 0;
2313}
2314
6bda09f9 2315
d6a28714
JH
2316/*
2317 - regtry - try match at specific point
2318 */
2319STATIC I32 /* 0 failure, 1 success */
24b23f37 2320S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2321{
97aff369 2322 dVAR;
d6a28714 2323 CHECKPOINT lastcp;
288b8c02
NC
2324 REGEXP *const rx = reginfo->prog;
2325 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2326 RXi_GET_DECL(prog,progi);
a3621e74 2327 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2328
2329 PERL_ARGS_ASSERT_REGTRY;
2330
24b23f37 2331 reginfo->cutpoint=NULL;
d6a28714 2332
bbe252da 2333 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2334 MAGIC *mg;
2335
2336 PL_reg_eval_set = RS_init;
a3621e74 2337 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2338 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2339 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2340 ));
ea8d6ae1 2341 SAVESTACK_CXPOS();
d6a28714
JH
2342 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2343 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2344 SAVETMPS;
2345 /* Apparently this is not needed, judging by wantarray. */
e8347627 2346 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2347 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2348
3b0527fe 2349 if (reginfo->sv) {
d6a28714 2350 /* Make $_ available to executed code. */
3b0527fe 2351 if (reginfo->sv != DEFSV) {
59f00321 2352 SAVE_DEFSV;
414bf5ae 2353 DEFSV_set(reginfo->sv);
b8c5462f 2354 }
d6a28714 2355
3b0527fe
DM
2356 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2357 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2358 /* prepare for quick setting of pos */
d300d9fa 2359#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2360 if (SvIsCOW(reginfo->sv))
2361 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2362#endif
3dab1dad 2363 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2364 &PL_vtbl_mglob, NULL, 0);
d6a28714 2365 mg->mg_len = -1;
b8c5462f 2366 }
d6a28714
JH
2367 PL_reg_magic = mg;
2368 PL_reg_oldpos = mg->mg_len;
4f639d21 2369 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2370 }
09687e5a 2371 if (!PL_reg_curpm) {
a02a5408 2372 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2373#ifdef USE_ITHREADS
2374 {
14a49a24 2375 SV* const repointer = &PL_sv_undef;
92313705
NC
2376 /* this regexp is also owned by the new PL_reg_curpm, which
2377 will try to free it. */
d2ece331 2378 av_push(PL_regex_padav, repointer);
09687e5a
AB
2379 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2380 PL_regex_pad = AvARRAY(PL_regex_padav);
2381 }
2382#endif
2383 }
86c29d75
NC
2384#ifdef USE_ITHREADS
2385 /* It seems that non-ithreads works both with and without this code.
2386 So for efficiency reasons it seems best not to have the code
2387 compiled when it is not needed. */
92313705
NC
2388 /* This is safe against NULLs: */
2389 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2390 /* PM_reg_curpm owns a reference to this regexp. */
2391 ReREFCNT_inc(rx);
86c29d75 2392#endif
288b8c02 2393 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2394 PL_reg_oldcurpm = PL_curpm;
2395 PL_curpm = PL_reg_curpm;
07bc277f 2396 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2397 /* Here is a serious problem: we cannot rewrite subbeg,
2398 since it may be needed if this match fails. Thus
2399 $` inside (?{}) could fail... */
2400 PL_reg_oldsaved = prog->subbeg;
2401 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2402#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2403 PL_nrs = prog->saved_copy;
2404#endif
07bc277f 2405 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2406 }
2407 else
bd61b366 2408 PL_reg_oldsaved = NULL;
d6a28714
JH
2409 prog->subbeg = PL_bostr;
2410 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2411 }
24b23f37 2412 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2413 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2414 PL_reginput = *startpos;
d6a28714 2415 PL_reglastparen = &prog->lastparen;
a01268b5 2416 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2417 prog->lastparen = 0;
03994de8 2418 prog->lastcloseparen = 0;
d6a28714 2419 PL_regsize = 0;
f0ab9afb 2420 PL_regoffs = prog->offs;
d6a28714
JH
2421 if (PL_reg_start_tmpl <= prog->nparens) {
2422 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2423 if(PL_reg_start_tmp)
2424 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2425 else
a02a5408 2426 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2427 }
2428
2429 /* XXXX What this code is doing here?!!! There should be no need
2430 to do this again and again, PL_reglastparen should take care of
3dd2943c 2431 this! --ilya*/
dafc8851
JH
2432
2433 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2434 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2435 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2436 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2437 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2438 * Meanwhile, this code *is* needed for the
daf18116
JH
2439 * above-mentioned test suite tests to succeed. The common theme
2440 * on those tests seems to be returning null fields from matches.
225593e1 2441 * --jhi updated by dapm */
dafc8851 2442#if 1
d6a28714 2443 if (prog->nparens) {
f0ab9afb 2444 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2445 register I32 i;
eb160463 2446 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2447 ++pp;
2448 pp->start = -1;
2449 pp->end = -1;
d6a28714
JH
2450 }
2451 }
dafc8851 2452#endif
02db2b7b 2453 REGCP_SET(lastcp);
f8fc2ecf 2454 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2455 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2456 return 1;
2457 }
24b23f37
YO
2458 if (reginfo->cutpoint)
2459 *startpos= reginfo->cutpoint;
02db2b7b 2460 REGCP_UNWIND(lastcp);
d6a28714
JH
2461 return 0;
2462}
2463
02db2b7b 2464
8ba1375e
MJD
2465#define sayYES goto yes
2466#define sayNO goto no
262b90c4 2467#define sayNO_SILENT goto no_silent
8ba1375e 2468
f9f4320a
YO
2469/* we dont use STMT_START/END here because it leads to
2470 "unreachable code" warnings, which are bogus, but distracting. */
2471#define CACHEsayNO \
c476f425
DM
2472 if (ST.cache_mask) \
2473 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2474 sayNO
3298f257 2475
a3621e74 2476/* this is used to determine how far from the left messages like
265c4333
YO
2477 'failed...' are printed. It should be set such that messages
2478 are inline with the regop output that created them.
a3621e74 2479*/
265c4333 2480#define REPORT_CODE_OFF 32
a3621e74
YO
2481
2482
2483/* Make sure there is a test for this +1 options in re_tests */
2484#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2485
40a82448
DM
2486#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2487#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2488
86545054
DM
2489#define SLAB_FIRST(s) (&(s)->states[0])
2490#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2491
5d9a96ca
DM
2492/* grab a new slab and return the first slot in it */
2493
2494STATIC regmatch_state *
2495S_push_slab(pTHX)
2496{
a35a87e7 2497#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2498 dMY_CXT;
2499#endif
5d9a96ca
DM
2500 regmatch_slab *s = PL_regmatch_slab->next;
2501 if (!s) {
2502 Newx(s, 1, regmatch_slab);
2503 s->prev = PL_regmatch_slab;
2504 s->next = NULL;
2505 PL_regmatch_slab->next = s;
2506 }
2507 PL_regmatch_slab = s;
86545054 2508 return SLAB_FIRST(s);
5d9a96ca 2509}
5b47454d 2510
95b24440 2511
40a82448
DM
2512/* push a new state then goto it */
2513
2514#define PUSH_STATE_GOTO(state, node) \
2515 scan = node; \
2516 st->resume_state = state; \
2517 goto push_state;
2518
2519/* push a new state with success backtracking, then goto it */
2520
2521#define PUSH_YES_STATE_GOTO(state, node) \
2522 scan = node; \
2523 st->resume_state = state; \
2524 goto push_yes_state;
2525
aa283a38 2526
aa283a38 2527
d6a28714 2528/*
95b24440 2529
bf1f174e
DM
2530regmatch() - main matching routine
2531
2532This is basically one big switch statement in a loop. We execute an op,
2533set 'next' to point the next op, and continue. If we come to a point which
2534we may need to backtrack to on failure such as (A|B|C), we push a
2535backtrack state onto the backtrack stack. On failure, we pop the top
2536state, and re-enter the loop at the state indicated. If there are no more
2537states to pop, we return failure.
2538
2539Sometimes we also need to backtrack on success; for example /A+/, where
2540after successfully matching one A, we need to go back and try to
2541match another one; similarly for lookahead assertions: if the assertion
2542completes successfully, we backtrack to the state just before the assertion
2543and then carry on. In these cases, the pushed state is marked as
2544'backtrack on success too'. This marking is in fact done by a chain of
2545pointers, each pointing to the previous 'yes' state. On success, we pop to
2546the nearest yes state, discarding any intermediate failure-only states.
2547Sometimes a yes state is pushed just to force some cleanup code to be
2548called at the end of a successful match or submatch; e.g. (??{$re}) uses
2549it to free the inner regex.
2550
2551Note that failure backtracking rewinds the cursor position, while
2552success backtracking leaves it alone.
2553
2554A pattern is complete when the END op is executed, while a subpattern
2555such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2556ops trigger the "pop to last yes state if any, otherwise return true"
2557behaviour.
2558
2559A common convention in this function is to use A and B to refer to the two
2560subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2561the subpattern to be matched possibly multiple times, while B is the entire
2562rest of the pattern. Variable and state names reflect this convention.
2563
2564The states in the main switch are the union of ops and failure/success of
2565substates associated with with that op. For example, IFMATCH is the op
2566that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2567'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2568successfully matched A and IFMATCH_A_fail is a state saying that we have
2569just failed to match A. Resume states always come in pairs. The backtrack
2570state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2571at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2572on success or failure.
2573
2574The struct that holds a backtracking state is actually a big union, with
2575one variant for each major type of op. The variable st points to the
2576top-most backtrack struct. To make the code clearer, within each
2577block of code we #define ST to alias the relevant union.
2578
2579Here's a concrete example of a (vastly oversimplified) IFMATCH
2580implementation:
2581
2582 switch (state) {
2583 ....
2584
2585#define ST st->u.ifmatch
2586
2587 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2588 ST.foo = ...; // some state we wish to save
95b24440 2589 ...
bf1f174e
DM
2590 // push a yes backtrack state with a resume value of
2591 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2592 // first node of A:
2593 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2594 // NOTREACHED
2595
2596 case IFMATCH_A: // we have successfully executed A; now continue with B
2597 next = B;
2598 bar = ST.foo; // do something with the preserved value
2599 break;
2600
2601 case IFMATCH_A_fail: // A failed, so the assertion failed
2602 ...; // do some housekeeping, then ...
2603 sayNO; // propagate the failure
2604
2605#undef ST
95b24440 2606
bf1f174e
DM
2607 ...
2608 }
95b24440 2609
bf1f174e
DM
2610For any old-timers reading this who are familiar with the old recursive
2611approach, the code above is equivalent to:
95b24440 2612
bf1f174e
DM
2613 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2614 {
2615 int foo = ...
95b24440 2616 ...
bf1f174e
DM
2617 if (regmatch(A)) {
2618 next = B;
2619 bar = foo;
2620 break;
95b24440 2621 }
bf1f174e
DM
2622 ...; // do some housekeeping, then ...
2623 sayNO; // propagate the failure
95b24440 2624 }
bf1f174e
DM
2625
2626The topmost backtrack state, pointed to by st, is usually free. If you
2627want to claim it, populate any ST.foo fields in it with values you wish to
2628save, then do one of
2629
2630 PUSH_STATE_GOTO(resume_state, node);
2631 PUSH_YES_STATE_GOTO(resume_state, node);
2632
2633which sets that backtrack state's resume value to 'resume_state', pushes a
2634new free entry to the top of the backtrack stack, then goes to 'node'.
2635On backtracking, the free slot is popped, and the saved state becomes the
2636new free state. An ST.foo field in this new top state can be temporarily
2637accessed to retrieve values, but once the main loop is re-entered, it
2638becomes available for reuse.
2639
2640Note that the depth of the backtrack stack constantly increases during the
2641left-to-right execution of the pattern, rather than going up and down with
2642the pattern nesting. For example the stack is at its maximum at Z at the
2643end of the pattern, rather than at X in the following:
2644
2645 /(((X)+)+)+....(Y)+....Z/
2646
2647The only exceptions to this are lookahead/behind assertions and the cut,
2648(?>A), which pop all the backtrack states associated with A before
2649continuing.
2650
2651Bascktrack state structs are allocated in slabs of about 4K in size.
2652PL_regmatch_state and st always point to the currently active state,
2653and PL_regmatch_slab points to the slab currently containing
2654PL_regmatch_state. The first time regmatch() is called, the first slab is
2655allocated, and is never freed until interpreter destruction. When the slab
2656is full, a new one is allocated and chained to the end. At exit from
2657regmatch(), slabs allocated since entry are freed.
2658
2659*/
95b24440 2660
40a82448 2661
5bc10b2c 2662#define DEBUG_STATE_pp(pp) \
265c4333 2663 DEBUG_STATE_r({ \
5bc10b2c
DM
2664 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2665 PerlIO_printf(Perl_debug_log, \
5d458dd8 2666 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2667 depth*2, "", \
13d6edb4 2668 PL_reg_name[st->resume_state], \
5d458dd8
YO
2669 ((st==yes_state||st==mark_state) ? "[" : ""), \
2670 ((st==yes_state) ? "Y" : ""), \
2671 ((st==mark_state) ? "M" : ""), \
2672 ((st==yes_state||st==mark_state) ? "]" : "") \
2673 ); \
265c4333 2674 });
5bc10b2c 2675
40a82448 2676
3dab1dad 2677#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2678
3df15adc 2679#ifdef DEBUGGING
5bc10b2c 2680
ab3bbdeb 2681STATIC void
efd26800 2682S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
ab3bbdeb
YO
2683 const char *start, const char *end, const char *blurb)
2684{
efd26800 2685 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2686
2687 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2688
ab3bbdeb
YO
2689 if (!PL_colorset)
2690 reginitcolors();
2691 {
2692 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2693 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb
YO
2694
2695 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2696 start, end - start, 60);
2697
2698 PerlIO_printf(Perl_debug_log,
2699 "%s%s REx%s %s against %s\n",
2700 PL_colors[4], blurb, PL_colors[5], s0, s1);
2701
2702 if (do_utf8||utf8_pat)
1de06328
YO
2703 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2704 utf8_pat ? "pattern" : "",
2705 utf8_pat && do_utf8 ? " and " : "",
2706 do_utf8 ? "string" : ""
ab3bbdeb
YO
2707 );
2708 }
2709}
3df15adc
YO
2710
2711STATIC void
786e8c11
YO
2712S_dump_exec_pos(pTHX_ const char *locinput,
2713 const regnode *scan,
2714 const char *loc_regeol,
2715 const char *loc_bostr,
2716 const char *loc_reg_starttry,
2717 const bool do_utf8)
07be1b83 2718{
786e8c11 2719 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2720 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2721 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2722 /* The part of the string before starttry has one color
2723 (pref0_len chars), between starttry and current
2724 position another one (pref_len - pref0_len chars),
2725 after the current position the third one.
2726 We assume that pref0_len <= pref_len, otherwise we
2727 decrease pref0_len. */
786e8c11
YO
2728 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2729 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2730 int pref0_len;
2731
7918f24d
NC
2732 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2733
07be1b83
YO
2734 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2735 pref_len++;
786e8c11
YO
2736 pref0_len = pref_len - (locinput - loc_reg_starttry);
2737 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2738 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2739 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2740 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2741 l--;
2742 if (pref0_len < 0)
2743 pref0_len = 0;
2744 if (pref0_len > pref_len)
2745 pref0_len = pref_len;
2746 {
3df15adc 2747 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2748
ab3bbdeb 2749 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2750 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2751
ab3bbdeb 2752 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2753 (locinput - pref_len + pref0_len),
1de06328 2754 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2755
ab3bbdeb 2756 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2757 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2758
1de06328 2759 const STRLEN tlen=len0+len1+len2;
3df15adc 2760 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2761 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2762 (IV)(locinput - loc_bostr),
07be1b83 2763 len0, s0,
07be1b83 2764 len1, s1,
07be1b83 2765 (docolor ? "" : "> <"),
07be1b83 2766 len2, s2,
f9f4320a 2767 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2768 "");
2769 }
2770}
3df15adc 2771
07be1b83
YO
2772#endif
2773
0a4db386
YO
2774/* reg_check_named_buff_matched()
2775 * Checks to see if a named buffer has matched. The data array of
2776 * buffer numbers corresponding to the buffer is expected to reside
2777 * in the regexp->data->data array in the slot stored in the ARG() of
2778 * node involved. Note that this routine doesn't actually care about the
2779 * name, that information is not preserved from compilation to execution.
2780 * Returns the index of the leftmost defined buffer with the given name
2781 * or 0 if non of the buffers matched.
2782 */
2783STATIC I32
7918f24d
NC
2784S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2785{
0a4db386 2786 I32 n;
f8fc2ecf 2787 RXi_GET_DECL(rex,rexi);
ad64d0ec 2788 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2789 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2790
2791 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2792
0a4db386
YO
2793 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2794 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2795 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2796 {
2797 return nums[n];
2798 }
2799 }
2800 return 0;
2801}
2802
2f554ef7
DM
2803
2804/* free all slabs above current one - called during LEAVE_SCOPE */
2805
2806STATIC void
2807S_clear_backtrack_stack(pTHX_ void *p)
2808{
2809 regmatch_slab *s = PL_regmatch_slab->next;
2810 PERL_UNUSED_ARG(p);
2811
2812 if (!s)
2813 return;
2814 PL_regmatch_slab->next = NULL;
2815 while (s) {
2816 regmatch_slab * const osl = s;
2817 s = s->next;
2818 Safefree(osl);
2819 }
2820}
2821
2822
28d8d7f4
YO
2823#define SETREX(Re1,Re2) \
2824 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2825 Re1 = (Re2)
2826
d6a28714 2827STATIC I32 /* 0 failure, 1 success */
24b23f37 2828S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2829{
a35a87e7 2830#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2831 dMY_CXT;
2832#endif
27da23d5 2833 dVAR;
95b24440 2834 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2835 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2836 REGEXP *rex_sv = reginfo->prog;
2837 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2838 RXi_GET_DECL(rex,rexi);
2f554ef7 2839 I32 oldsave;
5d9a96ca
DM
2840 /* the current state. This is a cached copy of PL_regmatch_state */
2841 register regmatch_state *st;
5d9a96ca
DM
2842 /* cache heavy used fields of st in registers */
2843 register regnode *scan;
2844 register regnode *next;
438e9bae 2845 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2846 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2847 register char *locinput = PL_reginput;
5d9a96ca 2848 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2849
b69b0499 2850 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2851 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2852 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2853 const U32 max_nochange_depth =
2854 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2855 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
2856 regmatch_state *yes_state = NULL; /* state to pop to on success of
2857 subpattern */
e2e6a0f1
YO
2858 /* mark_state piggy backs on the yes_state logic so that when we unwind
2859 the stack on success we can update the mark_state as we go */
2860 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 2861 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2862 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2863 U32 state_num;
5d458dd8
YO
2864 bool no_final = 0; /* prevent failure from backtracking? */
2865 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2866 char *startpoint = PL_reginput;
5d458dd8
YO
2867 SV *popmark = NULL; /* are we looking for a mark? */
2868 SV *sv_commit = NULL; /* last mark name seen in failure */
2869 SV *sv_yes_mark = NULL; /* last mark name we have seen
2870 during a successfull match */
2871 U32 lastopen = 0; /* last open we saw */
2872 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 2873 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
2874 /* these three flags are set by various ops to signal information to
2875 * the very next op. They have a useful lifetime of exactly one loop
2876 * iteration, and are not preserved or restored by state pushes/pops
2877 */
2878 bool sw = 0; /* the condition value in (?(cond)a|b) */
2879 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2880 int logical = 0; /* the following EVAL is:
2881 0: (?{...})
2882 1: (?(?{...})X|Y)
2883 2: (??{...})
2884 or the following IFMATCH/UNLESSM is:
2885 false: plain (?=foo)
2886 true: used as a condition: (?(?=foo))
2887 */
95b24440 2888#ifdef DEBUGGING
e68ec53f 2889 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2890#endif
2891
7918f24d
NC
2892 PERL_ARGS_ASSERT_REGMATCH;
2893
3b57cd43 2894 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2895 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2896 }));
5d9a96ca
DM
2897 /* on first ever call to regmatch, allocate first slab */
2898 if (!PL_regmatch_slab) {
2899 Newx(PL_regmatch_slab, 1, regmatch_slab);
2900 PL_regmatch_slab->prev = NULL;
2901 PL_regmatch_slab->next = NULL;
86545054 2902 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2903 }
2904
2f554ef7
DM
2905 oldsave = PL_savestack_ix;
2906 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2907 SAVEVPTR(PL_regmatch_slab);
2908 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
2909
2910 /* grab next free state slot */
2911 st = ++PL_regmatch_state;
86545054 2912 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2913 st = PL_regmatch_state = S_push_slab(aTHX);
2914
d6a28714
JH
2915 /* Note that nextchr is a byte even in UTF */
2916 nextchr = UCHARAT(locinput);
2917 scan = prog;
2918 while (scan != NULL) {
8ba1375e 2919
a3621e74 2920 DEBUG_EXECUTE_r( {
6136c704 2921 SV * const prop = sv_newmortal();
1de06328 2922 regnode *rnext=regnext(scan);
786e8c11 2923 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2924 regprop(rex, prop, scan);
07be1b83
YO
2925
2926 PerlIO_printf(Perl_debug_log,
2927 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2928 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2929 SvPVX_const(prop),
1de06328 2930 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2931 0 : (IV)(rnext - rexi->program));
2a782b5b 2932 });
d6a28714
JH
2933
2934 next = scan + NEXT_OFF(scan);
2935 if (next == scan)
2936 next = NULL;
40a82448 2937 state_num = OP(scan);
d6a28714 2938
40a82448 2939 reenter_switch:
34a81e2b
B
2940
2941 assert(PL_reglastparen == &rex->lastparen);
2942 assert(PL_reglastcloseparen == &rex->lastcloseparen);
2943 assert(PL_regoffs == rex->offs);
2944
40a82448 2945 switch (state_num) {
d6a28714 2946 case BOL:
7fba1cd6 2947 if (locinput == PL_bostr)
d6a28714 2948 {
3b0527fe 2949 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2950 break;
2951 }
d6a28714
JH
2952 sayNO;
2953 case MBOL:
12d33761
HS
2954 if (locinput == PL_bostr ||
2955 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2956 {
b8c5462f
JH
2957 break;
2958 }
d6a28714
JH
2959 sayNO;
2960 case SBOL:
c2a73568 2961 if (locinput == PL_bostr)
b8c5462f 2962 break;
d6a28714
JH
2963 sayNO;
2964 case GPOS:
3b0527fe 2965 if (locinput == reginfo->ganch)
d6a28714
JH
2966 break;
2967 sayNO;
ee9b8eae
YO
2968
2969 case KEEPS:
2970 /* update the startpoint */
f0ab9afb 2971 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 2972 PL_reginput = locinput;
f0ab9afb 2973 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
2974 PUSH_STATE_GOTO(KEEPS_next, next);
2975 /*NOT-REACHED*/
2976 case KEEPS_next_fail:
2977 /* rollback the start point change */
f0ab9afb 2978 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
2979 sayNO_SILENT;
2980 /*NOT-REACHED*/
d6a28714 2981 case EOL:
d6a28714
JH
2982 goto seol;
2983 case MEOL:
d6a28714 2984 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2985 sayNO;
b8c5462f 2986 break;
d6a28714
JH
2987 case SEOL:
2988 seol:
2989 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2990 sayNO;
d6a28714 2991 if (PL_regeol - locinput > 1)
b8c5462f 2992 sayNO;
b8c5462f 2993 break;
d6a28714
JH
2994 case EOS:
2995 if (PL_regeol != locinput)
b8c5462f 2996 sayNO;
d6a28714 2997 break;
ffc61ed2 2998 case SANY:
d6a28714 2999 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3000 sayNO;
f33976b4
DB
3001 if (do_utf8) {
3002 locinput += PL_utf8skip[nextchr];
3003 if (locinput > PL_regeol)
3004 sayNO;
3005 nextchr = UCHARAT(locinput);
3006 }
3007 else
3008 nextchr = UCHARAT(++locinput);
3009 break;
3010 case CANY:
3011 if (!nextchr && locinput >= PL_regeol)
3012 sayNO;
b8c5462f 3013 nextchr = UCHARAT(++locinput);
a0d0e21e 3014 break;
ffc61ed2 3015 case REG_ANY:
1aa99e6b
IH
3016 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3017 sayNO;
3018 if (do_utf8) {
b8c5462f 3019 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3020 if (locinput > PL_regeol)
3021 sayNO;
a0ed51b3 3022 nextchr = UCHARAT(locinput);
a0ed51b3 3023 }
1aa99e6b
IH
3024 else
3025 nextchr = UCHARAT(++locinput);
a0ed51b3 3026 break;
166ba7cd
DM
3027
3028#undef ST
3029#define ST st->u.trie
786e8c11
YO
3030 case TRIEC:
3031 /* In this case the charclass data is available inline so
3032 we can fail fast without a lot of extra overhead.
3033 */
3034 if (scan->flags == EXACT || !do_utf8) {
3035 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3036 DEBUG_EXECUTE_r(
3037 PerlIO_printf(Perl_debug_log,
3038 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3039 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
3040 );
3041 sayNO_SILENT;
3042 /* NOTREACHED */
3043 }
3044 }
3045 /* FALL THROUGH */
5b47454d 3046 case TRIE:
3dab1dad 3047 {
07be1b83 3048 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3049 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3050
3051 /* what trie are we using right now */
be8e71aa 3052 reg_trie_data * const trie
f8fc2ecf 3053 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3054 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3055 U32 state = trie->startstate;
166ba7cd 3056
3dab1dad
YO
3057 if (trie->bitmap && trie_type != trie_utf8_fold &&
3058 !TRIE_BITMAP_TEST(trie,*locinput)
3059 ) {
3060 if (trie->states[ state ].wordnum) {
3061 DEBUG_EXECUTE_r(
3062 PerlIO_printf(Perl_debug_log,
3063 "%*s %smatched empty string...%s\n",
5bc10b2c 3064 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3065 );
3066 break;
3067 } else {
3068 DEBUG_EXECUTE_r(
3069 PerlIO_printf(Perl_debug_log,
786e8c11 3070 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3071 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3072 );
3073 sayNO_SILENT;
3074 }
3075 }
166ba7cd 3076
786e8c11
YO
3077 {
3078 U8 *uc = ( U8* )locinput;
3079
3080 STRLEN len = 0;
3081 STRLEN foldlen = 0;
3082 U8 *uscan = (U8*)NULL;
3083 STRLEN bufflen=0;
3084 SV *sv_accept_buff = NULL;
3085 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3086
3087 ST.accepted = 0; /* how many accepting states we have seen */
3088 ST.B = next;
3089 ST.jump = trie->jump;
786e8c11 3090 ST.me = scan;
07be1b83
YO
3091 /*
3092 traverse the TRIE keeping track of all accepting states
3093 we transition through until we get to a failing node.
3094 */
3095
a3621e74 3096 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3097 U32 base = trie->states[ state ].trans.base;
f9f4320a 3098 UV uvc = 0;
786e8c11
YO
3099 U16 charid;
3100 /* We use charid to hold the wordnum as we don't use it
3101 for charid until after we have done the wordnum logic.
3102 We define an alias just so that the wordnum logic reads
3103 more naturally. */
3104
3105#define got_wordnum charid
3106 got_wordnum = trie->states[ state ].wordnum;
3107
3108 if ( got_wordnum ) {
3109 if ( ! ST.accepted ) {
5b47454d 3110 ENTER;
6962fb1a 3111 SAVETMPS; /* XXX is this necessary? dmq */
5b47454d
DM
3112 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3113 sv_accept_buff=newSV(bufflen *
3114 sizeof(reg_trie_accepted) - 1);
786e8c11 3115 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
3116 SvPOK_on(sv_accept_buff);
3117 sv_2mortal(sv_accept_buff);
166ba7cd
DM
3118 SAVETMPS;
3119 ST.accept_buff =
5b47454d
DM
3120 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3121 }
786e8c11 3122 do {
166ba7cd 3123 if (ST.accepted >= bufflen) {
5b47454d 3124 bufflen *= 2;
166ba7cd 3125 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
3126 SvGROW(sv_accept_buff,
3127 bufflen * sizeof(reg_trie_accepted));
3128 }
3129 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3130 + sizeof(reg_trie_accepted));
a3621e74 3131
786e8c11
YO
3132
3133 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3134 ST.accept_buff[ST.accepted].endpos = uc;
3135 ++ST.accepted;
3136 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3137 }
3138#undef got_wordnum
a3621e74 3139
07be1b83 3140 DEBUG_TRIE_EXECUTE_r({
786e8c11 3141 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 3142 PerlIO_printf( Perl_debug_log,
786e8c11 3143 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 3144 2+depth * 2, "", PL_colors[4],
786e8c11 3145 (UV)state, (UV)ST.accepted );
07be1b83 3146 });
a3621e74
YO
3147
3148 if ( base ) {
55eed653
NC
3149 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3150 uscan, len, uvc, charid, foldlen,
3151 foldbuf, uniflags);
a3621e74 3152
5b47454d
DM
3153 if (charid &&
3154 (base + charid > trie->uniquecharcount )
3155 && (base + charid - 1 - trie->uniquecharcount
3156 < trie->lasttrans)
3157 && trie->trans[base + charid - 1 -
3158 trie->uniquecharcount].check == state)
3159 {
3160 state = trie->trans[base + charid - 1 -
3161 trie->uniquecharcount ].next;
3162 }
3163 else {
3164 state = 0;
3165 }
3166 uc += len;
3167
3168 }
3169 else {
a3621e74
YO
3170 state = 0;
3171 }
3172 DEBUG_TRIE_EXECUTE_r(
e4584336 3173 PerlIO_printf( Perl_debug_log,
786e8c11 3174 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3175 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3176 );
3177 }
166ba7cd 3178 if (!ST.accepted )
a3621e74 3179 sayNO;
a3621e74 3180
166ba7cd
DM
3181 DEBUG_EXECUTE_r(
3182 PerlIO_printf( Perl_debug_log,
3183 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3184 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3185 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3186 );
786e8c11 3187 }}
fae667d5
YO
3188 goto trie_first_try; /* jump into the fail handler */
3189 /* NOTREACHED */
166ba7cd 3190 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
3191 if ( ST.jump) {
3192 REGCP_UNWIND(ST.cp);
3193 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3194 PL_regoffs[n].end = -1;
fae667d5
YO
3195 *PL_reglastparen = n;
3196 }
3197 trie_first_try:
5d458dd8
YO
3198 if (do_cutgroup) {
3199 do_cutgroup = 0;
3200 no_final = 0;
3201 }
fae667d5
YO
3202
3203 if ( ST.jump) {
3204 ST.lastparen = *PL_reglastparen;
3205 REGCP_SET(ST.cp);
3206 }
166ba7cd
DM
3207 if ( ST.accepted == 1 ) {
3208 /* only one choice left - just continue */
3209 DEBUG_EXECUTE_r({
2b8b4781 3210 AV *const trie_words
502c6561 3211 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3212 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3213 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3214 SV *sv= tmp ? sv_newmortal() : NULL;
3215
166ba7cd
DM
3216 PerlIO_printf( Perl_debug_log,
3217 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3218 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3219 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3220 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3221 PL_colors[0], PL_colors[1],
3222 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3223 )
3224 : "not compiled under -Dr",
166ba7cd
DM
3225 PL_colors[5] );
3226 });
3227 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3228 /* in this case we free tmps/leave before we call regmatch
3229 as we wont be using accept_buff again. */
5d458dd8 3230
166ba7cd
DM
3231 locinput = PL_reginput;
3232 nextchr = UCHARAT(locinput);
5d458dd8
YO
3233 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3234 scan = ST.B;
3235 else
3236 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3237 if (!has_cutgroup) {
3238 FREETMPS;
3239 LEAVE;
3240 } else {
3241 ST.accepted--;
3242 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3243 }
786e8c11 3244
166ba7cd
DM
3245 continue; /* execute rest of RE */
3246 }
fae667d5
YO
3247
3248 if ( !ST.accepted-- ) {
5d458dd8
YO
3249 DEBUG_EXECUTE_r({
3250 PerlIO_printf( Perl_debug_log,
3251 "%*s %sTRIE failed...%s\n",
3252 REPORT_CODE_OFF+depth*2, "",
3253 PL_colors[4],
3254 PL_colors[5] );
3255 });
166ba7cd
DM
3256 FREETMPS;
3257 LEAVE;
5d458dd8 3258 sayNO_SILENT;
fae667d5
YO
3259 /*NOTREACHED*/
3260 }
166ba7cd 3261
a3621e74 3262 /*
166ba7cd
DM
3263 There are at least two accepting states left. Presumably
3264 the number of accepting states is going to be low,
3265 typically two. So we simply scan through to find the one
3266 with lowest wordnum. Once we find it, we swap the last
3267 state into its place and decrement the size. We then try to
3268 match the rest of the pattern at the point where the word
3269 ends. If we succeed, control just continues along the
3270 regex; if we fail we return here to try the next accepting
3271 state
3272 */
a3621e74 3273
166ba7cd
DM
3274 {
3275 U32 best = 0;
3276 U32 cur;
3277 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3278 DEBUG_TRIE_EXECUTE_r(
f2278c82 3279 PerlIO_printf( Perl_debug_log,
166ba7cd 3280 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3281 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3282 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3283 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3284 );
3285
3286 if (ST.accept_buff[cur].wordnum <
3287 ST.accept_buff[best].wordnum)
3288 best = cur;
a3621e74 3289 }
166ba7cd
DM
3290
3291 DEBUG_EXECUTE_r({
2b8b4781 3292 AV *const trie_words
502c6561 3293 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3294 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3295 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3296 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3297 ST.B :
7f69552c 3298 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3299 SV *sv= tmp ? sv_newmortal() : NULL;
3300
265c4333
YO
3301 PerlIO_printf( Perl_debug_log,
3302 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3303 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3304 ST.accept_buff[best].wordnum,
de734bd5
A
3305 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3306 PL_colors[0], PL_colors[1],
3307 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3308 ) : "not compiled under -Dr",
265c4333 3309 REG_NODE_NUM(nextop),
166ba7cd
DM
3310 PL_colors[5] );
3311 });
3312
3313 if ( best<ST.accepted ) {
3314 reg_trie_accepted tmp = ST.accept_buff[ best ];
3315 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3316 ST.accept_buff[ ST.accepted ] = tmp;
3317 best = ST.accepted;
a3621e74 3318 }
166ba7cd 3319 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3320 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3321 scan = ST.B;
786e8c11 3322 } else {
5d458dd8 3323 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
5d458dd8 3324 }
6b173516 3325 PUSH_YES_STATE_GOTO(TRIE_next, scan);
786e8c11 3326 /* NOTREACHED */
166ba7cd 3327 }
166ba7cd 3328 /* NOTREACHED */
5d458dd8 3329 case TRIE_next:
6962fb1a
YO
3330 /* we dont want to throw this away, see bug 57042*/
3331 if (oreplsv != GvSV(PL_replgv))
3332 sv_setsv(oreplsv, GvSV(PL_replgv));
5d458dd8
YO
3333 FREETMPS;
3334 LEAVE;
3335 sayYES;
166ba7cd
DM
3336#undef ST
3337
95b24440
DM
3338 case EXACT: {
3339 char *s = STRING(scan);
24d3c4a9 3340 ln = STR_LEN(scan);
eb160463 3341 if (do_utf8 != UTF) {
bc517b45 3342 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3343 char *l = locinput;
24d3c4a9 3344 const char * const e = s + ln;
a72c7584 3345
5ff6fc6d
JH
3346 if (do_utf8) {
3347 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3348 while (s < e) {
a3b680e6 3349 STRLEN ulen;
1aa99e6b 3350 if (l >= PL_regeol)
5ff6fc6d
JH
3351 sayNO;
3352 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3353 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3354 uniflags))
5ff6fc6d 3355 sayNO;
bc517b45 3356 l += ulen;
5ff6fc6d 3357 s ++;
1aa99e6b 3358 }
5ff6fc6d
JH
3359 }
3360 else {
3361 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3362 while (s < e) {
a3b680e6 3363 STRLEN ulen;
1aa99e6b
IH
3364 if (l >= PL_regeol)
3365 sayNO;
5ff6fc6d 3366 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3367 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3368 uniflags))
1aa99e6b 3369 sayNO;
bc517b45 3370 s += ulen;
a72c7584 3371 l ++;
1aa99e6b 3372 }
5ff6fc6d 3373 }
1aa99e6b
IH
3374 locinput = l;
3375 nextchr = UCHARAT(locinput);
3376 break;
3377 }
bc517b45 3378 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3379 /* Inline the first character, for speed. */
3380 if (UCHARAT(s) != nextchr)
3381 sayNO;
24d3c4a9 3382 if (PL_regeol - locinput < ln)
d6a28714 3383 sayNO;
24d3c4a9 3384 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3385 sayNO;
24d3c4a9 3386 locinput += ln;
d6a28714
JH
3387 nextchr = UCHARAT(locinput);
3388 break;
95b24440 3389 }
d6a28714 3390 case EXACTFL:
b8c5462f
JH
3391 PL_reg_flags |= RF_tainted;
3392 /* FALL THROUGH */
95b24440 3393 case EXACTF: {
be8e71aa 3394 char * const s = STRING(scan);
24d3c4a9 3395 ln = STR_LEN(scan);
d6a28714 3396
d07ddd77
JH
3397 if (do_utf8 || UTF) {
3398 /* Either target or the pattern are utf8. */
be8e71aa 3399 const char * const l = locinput;
d07ddd77 3400 char *e = PL_regeol;
bc517b45 3401
24d3c4a9 3402 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3403 l, &e, 0, do_utf8)) {
5486206c
JH
3404 /* One more case for the sharp s:
3405 * pack("U0U*", 0xDF) =~ /ss/i,
3406 * the 0xC3 0x9F are the UTF-8
3407 * byte sequence for the U+00DF. */
e1d1eefb 3408
5486206c 3409 if (!(do_utf8 &&
e1d1eefb 3410 toLOWER(s[0]) == 's' &&
24d3c4a9 3411 ln >= 2 &&
5486206c
JH
3412 toLOWER(s[1]) == 's' &&
3413 (U8)l[0] == 0xC3 &&
3414 e - l >= 2 &&
3415 (U8)l[1] == 0x9F))
3416 sayNO;
3417 }
d07ddd77
JH
3418 locinput = e;
3419 nextchr = UCHARAT(locinput);
3420 break;
a0ed51b3 3421 }
d6a28714 3422
bc517b45
JH
3423 /* Neither the target and the pattern are utf8. */
3424
d6a28714
JH
3425 /* Inline the first character, for speed. */
3426 if (UCHARAT(s) != nextchr &&
3427 UCHARAT(s) != ((OP(scan) == EXACTF)
3428 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3429 sayNO;
24d3c4a9 3430 if (PL_regeol - locinput < ln)
b8c5462f 3431 sayNO;
24d3c4a9
DM
3432 if (ln > 1 && (OP(scan) == EXACTF
3433 ? ibcmp(s, locinput, ln)
3434 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3435 sayNO;
24d3c4a9 3436 locinput += ln;
d6a28714 3437 nextchr = UCHARAT(locinput);
a0d0e21e 3438 break;
95b24440 3439 }
b2680017
YO
3440 case BOUNDL:
3441 case NBOUNDL:
3442 PL_reg_flags |= RF_tainted;
3443 /* FALL THROUGH */
3444 case BOUND:
3445 case NBOUND:
3446 /* was last char in word? */
3447 if (do_utf8) {
3448 if (locinput == PL_bostr)
3449 ln = '\n';
3450 else {
3451 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3452
3453 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3454 }
3455 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3456 ln = isALNUM_uni(ln);
3457 LOAD_UTF8_CHARCLASS_ALNUM();
3458 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3459 }
3460 else {
3461 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3462 n = isALNUM_LC_utf8((U8*)locinput);
3463 }
3464 }
3465 else {
3466 ln = (locinput != PL_bostr) ?
3467 UCHARAT(locinput - 1) : '\n';
3468 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3469 ln = isALNUM(ln);
3470 n = isALNUM(nextchr);
3471 }
3472 else {
3473 ln = isALNUM_LC(ln);
3474 n = isALNUM_LC(nextchr);
3475 }
3476 }
3477 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3478 OP(scan) == BOUNDL))
3479 sayNO;
3480 break;
d6a28714 3481 case ANYOF:
ffc61ed2 3482 if (do_utf8) {
9e55ce06
JH
3483 STRLEN inclasslen = PL_regeol - locinput;
3484
32fc9b6a 3485 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3486 goto anyof_fail;
ffc61ed2
JH
3487 if (locinput >= PL_regeol)
3488 sayNO;
0f0076b4 3489 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3490 nextchr = UCHARAT(locinput);
e0f9d4a8 3491 break;
ffc61ed2
JH
3492 }
3493 else {
3494 if (nextchr < 0)
3495 nextchr = UCHARAT(locinput);
32fc9b6a 3496 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3497 goto anyof_fail;
ffc61ed2
JH
3498 if (!nextchr && locinput >= PL_regeol)
3499 sayNO;
3500 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3501 break;
3502 }
262b90c4 3503 anyof_fail:
e0f9d4a8
JH
3504 /* If we might have the case of the German sharp s
3505 * in a casefolding Unicode character class. */
3506
ebc501f0
JH
3507 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3508 locinput += SHARP_S_SKIP;
e0f9d4a8 3509 nextchr = UCHARAT(locinput);
ffc61ed2 3510 }
e0f9d4a8
JH
3511 else
3512 sayNO;
b8c5462f 3513 break;
20d0b1e9 3514 /* Special char classes - The defines start on line 129 or so */
d1eb3177
YO
3515 CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3516 CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
20d0b1e9 3517
d1eb3177
YO
3518 CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3519 CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
20d0b1e9 3520
d1eb3177
YO
3521 CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3522 CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
20d0b1e9 3523
a0ed51b3 3524 case CLUMP:
b7c83a7e 3525 if (locinput >= PL_regeol)
a0ed51b3 3526 sayNO;
b7c83a7e 3527 if (do_utf8) {
1a4fad37 3528 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3529 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3530 sayNO;
3531 locinput += PL_utf8skip[nextchr];
3532 while (locinput < PL_regeol &&
3533 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3534 locinput += UTF8SKIP(locinput);
3535 if (locinput > PL_regeol)
3536 sayNO;
eb08e2da
JH
3537 }
3538 else
3539 locinput++;
a0ed51b3
LW
3540 nextchr = UCHARAT(locinput);
3541 break;
81714fb9
YO
3542
3543 case NREFFL:
3544 {
3545 char *s;
ff1157ca 3546 char type;
81714fb9
YO
3547 PL_reg_flags |= RF_tainted;
3548 /* FALL THROUGH */
3549 case NREF:
3550 case NREFF:
ff1157ca 3551 type = OP(scan);
0a4db386
YO
3552 n = reg_check_named_buff_matched(rex,scan);
3553
3554 if ( n ) {
3555 type = REF + ( type - NREF );
3556 goto do_ref;
3557 } else {
81714fb9 3558 sayNO;
0a4db386
YO
3559 }
3560 /* unreached */
c8756f30 3561 case REFFL:
3280af22 3562 PL_reg_flags |= RF_tainted;
c8756f30 3563 /* FALL THROUGH */
c277df42 3564 case REF:
81714fb9 3565 case REFF:
c277df42 3566 n = ARG(scan); /* which paren pair */
81714fb9
YO
3567 type = OP(scan);
3568 do_ref:
f0ab9afb 3569 ln = PL_regoffs[n].start;
2c2d71f5 3570 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 3571 if (*PL_reglastparen < n || ln == -1)
af3f8c16 3572 sayNO; /* Do not match unless seen CLOSEn. */
f0ab9afb 3573 if (ln == PL_regoffs[n].end)
a0d0e21e 3574 break;
a0ed51b3 3575
24d3c4a9 3576 s = PL_bostr + ln;
81714fb9 3577 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3578 char *l = locinput;
f0ab9afb 3579 const char *e = PL_bostr + PL_regoffs[n].end;
a0ed51b3
LW
3580 /*
3581 * Note that we can't do the "other character" lookup trick as
3582 * in the 8-bit case (no pun intended) because in Unicode we
3583 * have to map both upper and title case to lower case.
3584 */
81714fb9 3585 if (type == REFF) {
a0ed51b3 3586 while (s < e) {
a3b680e6
AL
3587 STRLEN ulen1, ulen2;
3588 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3589 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3590
a0ed51b3
LW
3591 if (l >= PL_regeol)
3592 sayNO;
a2a2844f
JH
3593 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3594 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3595 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3596 sayNO;
a2a2844f
JH
3597 s += ulen1;
3598 l += ulen2;
a0ed51b3
LW
3599 }
3600 }
3601 locinput = l;
3602 nextchr = UCHARAT(locinput);
3603 break;
3604 }
3605
a0d0e21e 3606 /* Inline the first character, for speed. */
76e3520e 3607 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3608 (type == REF ||
3609 (UCHARAT(s) != (type == REFF
3610 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3611 sayNO;
f0ab9afb 3612 ln = PL_regoffs[n].end - ln;
24d3c4a9 3613 if (locinput + ln > PL_regeol)
4633a7c4 3614 sayNO;
81714fb9 3615 if (ln > 1 && (type == REF
24d3c4a9 3616 ? memNE(s, locinput, ln)
81714fb9 3617 : (type == REFF
24d3c4a9
DM
3618 ? ibcmp(s, locinput, ln)
3619 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3620 sayNO;
24d3c4a9 3621 locinput += ln;
76e3520e 3622 nextchr = UCHARAT(locinput);
a0d0e21e 3623 break;
81714fb9 3624 }
a0d0e21e 3625 case NOTHING:
c277df42 3626 case TAIL:
a0d0e21e
LW
3627 break;
3628 case BACK:
3629 break;
40a82448
DM
3630
3631#undef ST
3632#define ST st->u.eval
c277df42 3633 {
c277df42 3634 SV *ret;
d2f13c59 3635 REGEXP *re_sv;
6bda09f9 3636 regexp *re;
f8fc2ecf 3637 regexp_internal *rei;
1a147d38
YO
3638 regnode *startpoint;
3639
3640 case GOSTART:
e7707071
YO
3641 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3642 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 3643 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 3644 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 3645 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
3646 Perl_croak(aTHX_
3647 "Pattern subroutine nesting without pos change"
3648 " exceeded limit in regex");
6bda09f9
YO
3649 } else {
3650 nochange_depth = 0;
1a147d38 3651 }
288b8c02 3652 re_sv = rex_sv;
6bda09f9 3653 re = rex;
f8fc2ecf 3654 rei = rexi;
288b8c02 3655 (void)ReREFCNT_inc(rex_sv);
1a147d38 3656 if (OP(scan)==GOSUB) {
6bda09f9
YO
3657 startpoint = scan + ARG2L(scan);
3658 ST.close_paren = ARG(scan);
3659 } else {
f8fc2ecf 3660 startpoint = rei->program+1;
6bda09f9
YO
3661 ST.close_paren = 0;
3662 }
3663 goto eval_recurse_doit;
3664 /* NOTREACHED */
3665 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3666 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 3667 if ( ++nochange_depth > max_nochange_depth )
1a147d38 3668 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
3669 } else {
3670 nochange_depth = 0;
3671 }
8e5e9ebe 3672 {
4aabdb9b
DM
3673 /* execute the code in the {...} */
3674 dSP;
6136c704 3675 SV ** const before = SP;
4aabdb9b
DM
3676 OP_4tree * const oop = PL_op;
3677 COP * const ocurcop = PL_curcop;
3678 PAD *old_comppad;
d80618d2 3679 char *saved_regeol = PL_regeol;
4aabdb9b
DM
3680
3681 n = ARG(scan);
f8fc2ecf 3682 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
3683 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3684 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f8fc2ecf 3685 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
f0ab9afb 3686 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 3687
2bf803e2
YO
3688 if (sv_yes_mark) {
3689 SV *sv_mrk = get_sv("REGMARK", 1);
3690 sv_setsv(sv_mrk, sv_yes_mark);
3691 }
3692
8e5e9ebe
RGS
3693 CALLRUNOPS(aTHX); /* Scalar context. */
3694 SPAGAIN;
3695 if (SP == before)
075aa684 3696 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3697 else {
3698 ret = POPs;
3699 PUTBACK;
3700 }
4aabdb9b
DM
3701
3702 PL_op = oop;
3703 PAD_RESTORE_LOCAL(old_comppad);
3704 PL_curcop = ocurcop;
d80618d2 3705 PL_regeol = saved_regeol;
24d3c4a9 3706 if (!logical) {
4aabdb9b
DM
3707 /* /(?{...})/ */
3708 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3709 break;
3710 }
8e5e9ebe 3711 }
24d3c4a9
DM
3712 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3713 logical = 0;
4aabdb9b 3714 {
4f639d21
DM
3715 /* extract RE object from returned value; compiling if
3716 * necessary */
6136c704 3717 MAGIC *mg = NULL;
288b8c02 3718 REGEXP *rx = NULL;
5c35adbb
NC
3719
3720 if (SvROK(ret)) {
288b8c02 3721 SV *const sv = SvRV(ret);
5c35adbb
NC
3722
3723 if (SvTYPE(sv) == SVt_REGEXP) {
d2f13c59 3724 rx = (REGEXP*) sv;
5c35adbb
NC
3725 } else if (SvSMAGICAL(sv)) {
3726 mg = mg_find(sv, PERL_MAGIC_qr);
3727 assert(mg);
3728 }
3729 } else if (SvTYPE(ret) == SVt_REGEXP) {
d2f13c59 3730 rx = (REGEXP*) ret;
5c35adbb 3731 } else if (SvSMAGICAL(ret)) {
124ee91a
NC
3732 if (SvGMAGICAL(ret)) {
3733 /* I don't believe that there is ever qr magic
3734 here. */
3735 assert(!mg_find(ret, PERL_MAGIC_qr));
faf82a0b 3736 sv_unmagic(ret, PERL_MAGIC_qr);
124ee91a
NC
3737 }
3738 else {
faf82a0b 3739 mg = mg_find(ret, PERL_MAGIC_qr);
124ee91a
NC
3740 /* testing suggests mg only ends up non-NULL for
3741 scalars who were upgraded and compiled in the
3742 else block below. In turn, this is only
3743 triggered in the "postponed utf8 string" tests
3744 in t/op/pat.t */
3745 }
0f5d15d6 3746 }
faf82a0b 3747
0f5d15d6 3748 if (mg) {
d2f13c59 3749 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
e2560c33 3750 assert(rx);
0f5d15d6 3751 }
288b8c02 3752 if (rx) {
f0826785 3753 rx = reg_temp_copy(NULL, rx);
288b8c02 3754 }
0f5d15d6 3755 else {
c737faaf 3756 U32 pm_flags = 0;
a3b680e6 3757 const I32 osize = PL_regsize;
0f5d15d6 3758
b9ad30b4
NC
3759 if (DO_UTF8(ret)) {
3760 assert (SvUTF8(ret));
3761 } else if (SvUTF8(ret)) {
3762 /* Not doing UTF-8, despite what the SV says. Is
3763 this only if we're trapped in use 'bytes'? */
3764 /* Make a copy of the octet sequence, but without
3765 the flag on, as the compiler now honours the
3766 SvUTF8 flag on ret. */
3767 STRLEN len;
3768 const char *const p = SvPV(ret, len);
3769 ret = newSVpvn_flags(p, len, SVs_TEMP);
3770 }
288b8c02 3771 rx = CALLREGCOMP(ret, pm_flags);
9041c2e3 3772 if (!(SvFLAGS(ret)
faf82a0b 3773 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 3774 | SVs_GMG))) {
a2794585
NC
3775 /* This isn't a first class regexp. Instead, it's
3776 caching a regexp onto an existing, Perl visible
3777 scalar. */
ad64d0ec 3778 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3ce3ed55 3779 }
0f5d15d6 3780 PL_regsize = osize;
0f5d15d6 3781 }
288b8c02
NC
3782 re_sv = rx;
3783 re = (struct regexp *)SvANY(rx);
4aabdb9b 3784 }
07bc277f 3785 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
3786 re->subbeg = rex->subbeg;
3787 re->sublen = rex->sublen;
f8fc2ecf 3788 rei = RXi_GET(re);
6bda09f9 3789 DEBUG_EXECUTE_r(
efd26800 3790 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
6bda09f9
YO
3791 "Matching embedded");
3792 );
f8fc2ecf 3793 startpoint = rei->program + 1;
1a147d38 3794 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
3795 /* borrowed from regtry */
3796 if (PL_reg_start_tmpl <= re->nparens) {
3797 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3798 if(PL_reg_start_tmp)
3799 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3800 else
3801 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 3802 }
aa283a38 3803
1a147d38 3804 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 3805 /* run the pattern returned from (??{...}) */
40a82448
DM
3806 ST.cp = regcppush(0); /* Save *all* the positions. */
3807 REGCP_SET(ST.lastcp);
6bda09f9 3808
f0ab9afb 3809 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
6bda09f9 3810
0357f1fd
ML
3811 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
3812 PL_reglastparen = &re->lastparen;
3813 PL_reglastcloseparen = &re->lastcloseparen;
3814 re->lastparen = 0;
3815 re->lastcloseparen = 0;
3816
4aabdb9b 3817 PL_reginput = locinput;
ae0beba1 3818 PL_regsize = 0;
4aabdb9b
DM
3819
3820 /* XXXX This is too dramatic a measure... */
3821 PL_reg_maxiter = 0;
3822
faec1544 3823 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 3824 if (RX_UTF8(re_sv))
faec1544
DM
3825 PL_reg_flags |= RF_utf8;
3826 else
3827 PL_reg_flags &= ~RF_utf8;
3828 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3829
288b8c02 3830 ST.prev_rex = rex_sv;
faec1544 3831 ST.prev_curlyx = cur_curlyx;
288b8c02
NC
3832 SETREX(rex_sv,re_sv);
3833 rex = re;
f8fc2ecf 3834 rexi = rei;
faec1544 3835 cur_curlyx = NULL;
40a82448 3836 ST.B = next;
faec1544
DM
3837 ST.prev_eval = cur_eval;
3838 cur_eval = st;
faec1544 3839 /* now continue from first node in postoned RE */
6bda09f9 3840 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3841 /* NOTREACHED */
a0ed51b3 3842 }
24d3c4a9
DM
3843 /* logical is 1, /(?(?{...})X|Y)/ */
3844 sw = (bool)SvTRUE(ret);
3845 logical = 0;
c277df42
IZ
3846 break;
3847 }
40a82448 3848
faec1544
DM
3849 case EVAL_AB: /* cleanup after a successful (??{A})B */
3850 /* note: this is called twice; first after popping B, then A */
3851 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
3852 ReREFCNT_dec(rex_sv);
3853 SETREX(rex_sv,ST.prev_rex);
3854 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3855 rexi = RXi_GET(rex);
faec1544
DM
3856 regcpblow(ST.cp);
3857 cur_eval = ST.prev_eval;
3858 cur_curlyx = ST.prev_curlyx;
34a81e2b
B
3859
3860 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
3861 PL_reglastparen = &rex->lastparen;
3862 PL_reglastcloseparen = &rex->lastcloseparen;
34a81e2b
B
3863 /* also update PL_regoffs */
3864 PL_regoffs = rex->offs;
0357f1fd 3865
40a82448
DM
3866 /* XXXX This is too dramatic a measure... */
3867 PL_reg_maxiter = 0;
e7707071 3868 if ( nochange_depth )
4b196cd4 3869 nochange_depth--;
262b90c4 3870 sayYES;
40a82448 3871
40a82448 3872
faec1544
DM
3873 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3874 /* note: this is called twice; first after popping B, then A */
3875 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
3876 ReREFCNT_dec(rex_sv);
3877 SETREX(rex_sv,ST.prev_rex);
3878 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3879 rexi = RXi_GET(rex);
34a81e2b 3880 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
3881 PL_reglastparen = &rex->lastparen;
3882 PL_reglastcloseparen = &rex->lastcloseparen;
3883
40a82448
DM
3884 PL_reginput = locinput;
3885 REGCP_UNWIND(ST.lastcp);
3886 regcppop(rex);
faec1544
DM
3887 cur_eval = ST.prev_eval;
3888 cur_curlyx = ST.prev_curlyx;
3889 /* XXXX This is too dramatic a measure... */
3890 PL_reg_maxiter = 0;
e7707071 3891 if ( nochange_depth )
4b196cd4 3892 nochange_depth--;
40a82448 3893 sayNO_SILENT;
40a82448
DM
3894#undef ST
3895
a0d0e21e 3896 case OPEN:
c277df42 3897 n = ARG(scan); /* which paren pair */
3280af22
NIS
3898 PL_reg_start_tmp[n] = locinput;
3899 if (n > PL_regsize)
3900 PL_regsize = n;
e2e6a0f1 3901 lastopen = n;
a0d0e21e
LW
3902 break;
3903 case CLOSE:
c277df42 3904 n = ARG(scan); /* which paren pair */
f0ab9afb
NC
3905 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3906 PL_regoffs[n].end = locinput - PL_bostr;
7f69552c
YO
3907 /*if (n > PL_regsize)
3908 PL_regsize = n;*/
3b6647e0 3909 if (n > *PL_reglastparen)
3280af22 3910 *PL_reglastparen = n;
a01268b5 3911 *PL_reglastcloseparen = n;
3b6647e0 3912 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
3913 goto fake_end;
3914 }
a0d0e21e 3915 break;
e2e6a0f1
YO
3916 case ACCEPT:
3917 if (ARG(scan)){
3918 regnode *cursor;
3919 for (cursor=scan;
3920 cursor && OP(cursor)!=END;
3921 cursor=regnext(cursor))
3922 {
3923 if ( OP(cursor)==CLOSE ){
3924 n = ARG(cursor);
3925 if ( n <= lastopen ) {
f0ab9afb
NC
3926 PL_regoffs[n].start
3927 = PL_reg_start_tmp[n] - PL_bostr;
3928 PL_regoffs[n].end = locinput - PL_bostr;
e2e6a0f1
YO
3929 /*if (n > PL_regsize)
3930 PL_regsize = n;*/
3b6647e0 3931 if (n > *PL_reglastparen)
e2e6a0f1
YO
3932 *PL_reglastparen = n;
3933 *PL_reglastcloseparen = n;
3b6647e0
RB
3934 if ( n == ARG(scan) || (cur_eval &&
3935 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
3936 break;
3937 }
3938 }
3939 }
3940 }
3941 goto fake_end;
3942 /*NOTREACHED*/
c277df42
IZ
3943 case GROUPP:
3944 n = ARG(scan); /* which paren pair */
f0ab9afb 3945 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
c277df42 3946 break;
0a4db386
YO
3947 case NGROUPP:
3948 /* reg_check_named_buff_matched returns 0 for no match */
3949 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3950 break;
1a147d38 3951 case INSUBP:
0a4db386 3952 n = ARG(scan);
3b6647e0 3953 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
3954 break;
3955 case DEFINEP:
3956 sw = 0;
3957 break;
c277df42 3958 case IFTHEN:
2c2d71f5 3959 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3960 if (sw)
c277df42
IZ
3961 next = NEXTOPER(NEXTOPER(scan));
3962 else {
3963 next = scan + ARG(scan);
3964 if (OP(next) == IFTHEN) /* Fake one. */
3965 next = NEXTOPER(NEXTOPER(next));
3966 }
3967 break;
3968 case LOGICAL:
24d3c4a9 3969 logical = scan->flags;
c277df42 3970 break;
c476f425 3971
2ab05381 3972/*******************************************************************
2ab05381 3973
c476f425
DM
3974The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3975pattern, where A and B are subpatterns. (For simple A, CURLYM or
3976STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3977
c476f425 3978A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 3979
c476f425
DM
3980On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3981state, which contains the current count, initialised to -1. It also sets
3982cur_curlyx to point to this state, with any previous value saved in the
3983state block.
2ab05381 3984
c476f425
DM
3985CURLYX then jumps straight to the WHILEM op, rather than executing A,
3986since the pattern may possibly match zero times (i.e. it's a while {} loop
3987rather than a do {} while loop).
2ab05381 3988
c476f425
DM
3989Each entry to WHILEM represents a successful match of A. The count in the
3990CURLYX block is incremented, another WHILEM state is pushed, and execution
3991passes to A or B depending on greediness and the current count.
2ab05381 3992
c476f425
DM
3993For example, if matching against the string a1a2a3b (where the aN are
3994substrings that match /A/), then the match progresses as follows: (the
3995pushed states are interspersed with the bits of strings matched so far):
2ab05381 3996
c476f425
DM
3997 <CURLYX cnt=-1>
3998 <CURLYX cnt=0><WHILEM>
3999 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4000 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4001 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4002 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4003
c476f425
DM
4004(Contrast this with something like CURLYM, which maintains only a single
4005backtrack state:
2ab05381 4006
c476f425
DM
4007 <CURLYM cnt=0> a1
4008 a1 <CURLYM cnt=1> a2
4009 a1 a2 <CURLYM cnt=2> a3
4010 a1 a2 a3 <CURLYM cnt=3> b
4011)
2ab05381 4012
c476f425
DM
4013Each WHILEM state block marks a point to backtrack to upon partial failure
4014of A or B, and also contains some minor state data related to that
4015iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4016overall state, such as the count, and pointers to the A and B ops.
2ab05381 4017
c476f425
DM
4018This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4019must always point to the *current* CURLYX block, the rules are:
2ab05381 4020
c476f425
DM
4021When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4022and set cur_curlyx to point the new block.
2ab05381 4023
c476f425
DM
4024When popping the CURLYX block after a successful or unsuccessful match,
4025restore the previous cur_curlyx.
2ab05381 4026
c476f425
DM
4027When WHILEM is about to execute B, save the current cur_curlyx, and set it
4028to the outer one saved in the CURLYX block.
2ab05381 4029
c476f425
DM
4030When popping the WHILEM block after a successful or unsuccessful B match,
4031restore the previous cur_curlyx.
2ab05381 4032
c476f425
DM
4033Here's an example for the pattern (AI* BI)*BO
4034I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4035
c476f425
DM
4036cur_
4037curlyx backtrack stack
4038------ ---------------
4039NULL
4040CO <CO prev=NULL> <WO>
4041CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4042CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4043NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4044
c476f425
DM
4045At this point the pattern succeeds, and we work back down the stack to
4046clean up, restoring as we go:
95b24440 4047
c476f425
DM
4048CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4049CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4050CO <CO prev=NULL> <WO>
4051NULL
a0374537 4052
c476f425
DM
4053*******************************************************************/
4054
4055#define ST st->u.curlyx
4056
4057 case CURLYX: /* start of /A*B/ (for complex A) */
4058 {
4059 /* No need to save/restore up to this paren */
4060 I32 parenfloor = scan->flags;
4061
4062 assert(next); /* keep Coverity happy */
4063 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4064 next += ARG(next);
4065
4066 /* XXXX Probably it is better to teach regpush to support
4067 parenfloor > PL_regsize... */
4068 if (parenfloor > (I32)*PL_reglastparen)
4069 parenfloor = *PL_reglastparen; /* Pessimization... */
4070
4071 ST.prev_curlyx= cur_curlyx;
4072 cur_curlyx = st;
4073 ST.cp = PL_savestack_ix;
4074
4075 /* these fields contain the state of the current curly.
4076 * they are accessed by subsequent WHILEMs */
4077 ST.parenfloor = parenfloor;
4078 ST.min = ARG1(scan);
4079 ST.max = ARG2(scan);
4080 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4081 ST.B = next;
24d3c4a9
DM
4082 ST.minmod = minmod;
4083 minmod = 0;
c476f425
DM
4084 ST.count = -1; /* this will be updated by WHILEM */
4085 ST.lastloc = NULL; /* this will be updated by WHILEM */
4086
4087 PL_reginput = locinput;
4088 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 4089 /* NOTREACHED */
c476f425 4090 }
a0d0e21e 4091
c476f425 4092 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4093 cur_curlyx = ST.prev_curlyx;
4094 sayYES;
4095 /* NOTREACHED */
a0d0e21e 4096
c476f425
DM
4097 case CURLYX_end_fail: /* just failed to match all of A*B */
4098 regcpblow(ST.cp);
4099 cur_curlyx = ST.prev_curlyx;
4100 sayNO;
4101 /* NOTREACHED */
4633a7c4 4102
a0d0e21e 4103
c476f425
DM
4104#undef ST
4105#define ST st->u.whilem
4106
4107 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4108 {
4109 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
4110 I32 n;
4111 assert(cur_curlyx); /* keep Coverity happy */
4112 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4113 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4114 ST.cache_offset = 0;
4115 ST.cache_mask = 0;
4116
4117 PL_reginput = locinput;
4118
4119 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4120 "%*s whilem: matched %ld out of %ld..%ld\n",
4121 REPORT_CODE_OFF+depth*2, "", (long)n,
4122 (long)cur_curlyx->u.curlyx.min,
4123 (long)cur_curlyx->u.curlyx.max)
4124 );
a0d0e21e 4125
c476f425 4126 /* First just match a string of min A's. */
a0d0e21e 4127
c476f425
DM
4128 if (n < cur_curlyx->u.curlyx.min) {
4129 cur_curlyx->u.curlyx.lastloc = locinput;
4130 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4131 /* NOTREACHED */
4132 }
4133
4134 /* If degenerate A matches "", assume A done. */
4135
4136 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4137 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4138 "%*s whilem: empty match detected, trying continuation...\n",
4139 REPORT_CODE_OFF+depth*2, "")
4140 );
4141 goto do_whilem_B_max;
4142 }
4143
4144 /* super-linear cache processing */
4145
4146 if (scan->flags) {
a0d0e21e 4147
2c2d71f5 4148 if (!PL_reg_maxiter) {
c476f425
DM
4149 /* start the countdown: Postpone detection until we
4150 * know the match is not *that* much linear. */
2c2d71f5 4151 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4152 /* possible overflow for long strings and many CURLYX's */
4153 if (PL_reg_maxiter < 0)
4154 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4155 PL_reg_leftiter = PL_reg_maxiter;
4156 }
c476f425 4157
2c2d71f5 4158 if (PL_reg_leftiter-- == 0) {
c476f425 4159 /* initialise cache */
3298f257 4160 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4161 if (PL_reg_poscache) {
eb160463 4162 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4163 Renew(PL_reg_poscache, size, char);
4164 PL_reg_poscache_size = size;
4165 }
4166 Zero(PL_reg_poscache, size, char);
4167 }
4168 else {
4169 PL_reg_poscache_size = size;
a02a5408 4170 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4171 }
c476f425
DM
4172 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4173 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4174 PL_colors[4], PL_colors[5])
4175 );
2c2d71f5 4176 }
c476f425 4177
2c2d71f5 4178 if (PL_reg_leftiter < 0) {
c476f425
DM
4179 /* have we already failed at this position? */
4180 I32 offset, mask;
4181 offset = (scan->flags & 0xf) - 1
4182 + (locinput - PL_bostr) * (scan->flags>>4);
4183 mask = 1 << (offset % 8);
4184 offset /= 8;
4185 if (PL_reg_poscache[offset] & mask) {
4186 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4187 "%*s whilem: (cache) already tried at this position...\n",
4188 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4189 );
3298f257 4190 sayNO; /* cache records failure */
2c2d71f5 4191 }
c476f425
DM
4192 ST.cache_offset = offset;
4193 ST.cache_mask = mask;
2c2d71f5 4194 }
c476f425 4195 }
2c2d71f5 4196
c476f425 4197 /* Prefer B over A for minimal matching. */
a687059c 4198
c476f425
DM
4199 if (cur_curlyx->u.curlyx.minmod) {
4200 ST.save_curlyx = cur_curlyx;
4201 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4202 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4203 REGCP_SET(ST.lastcp);
4204 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4205 /* NOTREACHED */
4206 }
a0d0e21e 4207
c476f425
DM
4208 /* Prefer A over B for maximal matching. */
4209
4210 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4211 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4212 cur_curlyx->u.curlyx.lastloc = locinput;
4213 REGCP_SET(ST.lastcp);
4214 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4215 /* NOTREACHED */
4216 }
4217 goto do_whilem_B_max;
4218 }
4219 /* NOTREACHED */
4220
4221 case WHILEM_B_min: /* just matched B in a minimal match */
4222 case WHILEM_B_max: /* just matched B in a maximal match */
4223 cur_curlyx = ST.save_curlyx;
4224 sayYES;
4225 /* NOTREACHED */
4226
4227 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4228 cur_curlyx = ST.save_curlyx;
4229 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4230 cur_curlyx->u.curlyx.count--;
4231 CACHEsayNO;
4232 /* NOTREACHED */
4233
4234 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4235 REGCP_UNWIND(ST.lastcp);
4236 regcppop(rex);
4237 /* FALL THROUGH */
4238 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4239 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4240 cur_curlyx->u.curlyx.count--;
4241 CACHEsayNO;
4242 /* NOTREACHED */
4243
4244 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4245 REGCP_UNWIND(ST.lastcp);
4246 regcppop(rex); /* Restore some previous $<digit>s? */
4247 PL_reginput = locinput;
4248 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4249 "%*s whilem: failed, trying continuation...\n",
4250 REPORT_CODE_OFF+depth*2, "")
4251 );
4252 do_whilem_B_max:
4253 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4254 && ckWARN(WARN_REGEXP)
4255 && !(PL_reg_flags & RF_warned))
4256 {
4257 PL_reg_flags |= RF_warned;
4258 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4259 "Complex regular subexpression recursion",
4260 REG_INFTY - 1);
4261 }
4262
4263 /* now try B */
4264 ST.save_curlyx = cur_curlyx;
4265 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4266 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4267 /* NOTREACHED */
4268
4269 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4270 cur_curlyx = ST.save_curlyx;
4271 REGCP_UNWIND(ST.lastcp);
4272 regcppop(rex);
4273
4274 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4275 /* Maximum greed exceeded */
4276 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4277 && ckWARN(WARN_REGEXP)
4278 && !(PL_reg_flags & RF_warned))
4279 {
3280af22 4280 PL_reg_flags |= RF_warned;
c476f425
DM
4281 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4282 "%s limit (%d) exceeded",
4283 "Complex regular subexpression recursion",
4284 REG_INFTY - 1);
a0d0e21e 4285 }
c476f425 4286 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4287 CACHEsayNO;
a0d0e21e 4288 }
c476f425
DM
4289
4290 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4291 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4292 );
4293 /* Try grabbing another A and see if it helps. */
4294 PL_reginput = locinput;
4295 cur_curlyx->u.curlyx.lastloc = locinput;
4296 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4297 REGCP_SET(ST.lastcp);
4298 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 4299 /* NOTREACHED */
40a82448
DM
4300
4301#undef ST
4302#define ST st->u.branch
4303
4304 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4305 next = scan + ARG(scan);
4306 if (next == scan)
4307 next = NULL;
40a82448
DM
4308 scan = NEXTOPER(scan);
4309 /* FALL THROUGH */
c277df42 4310
40a82448
DM
4311 case BRANCH: /* /(...|A|...)/ */
4312 scan = NEXTOPER(scan); /* scan now points to inner node */
40a82448
DM
4313 ST.lastparen = *PL_reglastparen;
4314 ST.next_branch = next;
4315 REGCP_SET(ST.cp);
4316 PL_reginput = locinput;
02db2b7b 4317
40a82448 4318 /* Now go into the branch */
5d458dd8
YO
4319 if (has_cutgroup) {
4320 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4321 } else {
4322 PUSH_STATE_GOTO(BRANCH_next, scan);
4323 }
40a82448 4324 /* NOTREACHED */
5d458dd8
YO
4325 case CUTGROUP:
4326 PL_reginput = locinput;
4327 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 4328 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
4329 PUSH_STATE_GOTO(CUTGROUP_next,next);
4330 /* NOTREACHED */
4331 case CUTGROUP_next_fail:
4332 do_cutgroup = 1;
4333 no_final = 1;
4334 if (st->u.mark.mark_name)
4335 sv_commit = st->u.mark.mark_name;
4336 sayNO;
4337 /* NOTREACHED */
4338 case BRANCH_next:
4339 sayYES;
4340 /* NOTREACHED */
40a82448 4341 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4342 if (do_cutgroup) {
4343 do_cutgroup = 0;
4344 no_final = 0;
4345 }
40a82448
DM
4346 REGCP_UNWIND(ST.cp);
4347 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 4348 PL_regoffs[n].end = -1;
40a82448 4349 *PL_reglastparen = n;
0a4db386 4350 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4351 scan = ST.next_branch;
4352 /* no more branches? */
5d458dd8
YO
4353 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4354 DEBUG_EXECUTE_r({
4355 PerlIO_printf( Perl_debug_log,
4356 "%*s %sBRANCH failed...%s\n",
4357 REPORT_CODE_OFF+depth*2, "",
4358 PL_colors[4],
4359 PL_colors[5] );
4360 });
4361 sayNO_SILENT;
4362 }
40a82448
DM
4363 continue; /* execute next BRANCH[J] op */
4364 /* NOTREACHED */
4365
a0d0e21e 4366 case MINMOD:
24d3c4a9 4367 minmod = 1;
a0d0e21e 4368 break;
40a82448
DM
4369
4370#undef ST
4371#define ST st->u.curlym
4372
4373 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4374
4375 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 4376 * only a single backtracking state, no matter how many matches
40a82448
DM
4377 * there are in {m,n}. It relies on the pattern being constant
4378 * length, with no parens to influence future backrefs
4379 */
4380
4381 ST.me = scan;
dc45a647 4382 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4383
4384 /* if paren positive, emulate an OPEN/CLOSE around A */
4385 if (ST.me->flags) {
3b6647e0 4386 U32 paren = ST.me->flags;
40a82448
DM
4387 if (paren > PL_regsize)
4388 PL_regsize = paren;
3b6647e0 4389 if (paren > *PL_reglastparen)
40a82448 4390 *PL_reglastparen = paren;
c277df42 4391 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4392 }
40a82448
DM
4393 ST.A = scan;
4394 ST.B = next;
4395 ST.alen = 0;
4396 ST.count = 0;
24d3c4a9
DM
4397 ST.minmod = minmod;
4398 minmod = 0;
40a82448
DM
4399 ST.c1 = CHRTEST_UNINIT;
4400 REGCP_SET(ST.cp);
6407bf3b 4401
40a82448
DM
4402 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4403 goto curlym_do_B;
4404
4405 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4406 PL_reginput = locinput;
40a82448
DM
4407 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4408 /* NOTREACHED */
5f80c4cf 4409
40a82448
DM
4410 case CURLYM_A: /* we've just matched an A */
4411 locinput = st->locinput;
4412 nextchr = UCHARAT(locinput);
4413
4414 ST.count++;
4415 /* after first match, determine A's length: u.curlym.alen */
4416 if (ST.count == 1) {
4417 if (PL_reg_match_utf8) {
4418 char *s = locinput;
4419 while (s < PL_reginput) {
4420 ST.alen++;
4421 s += UTF8SKIP(s);
4422 }
4423 }
4424 else {
4425 ST.alen = PL_reginput - locinput;
4426 }
4427 if (ST.alen == 0)
4428 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4429 }
0cadcf80
DM
4430 DEBUG_EXECUTE_r(
4431 PerlIO_printf(Perl_debug_log,
40a82448 4432 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4433 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4434 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4435 );
4436
40a82448 4437 locinput = PL_reginput;
0a4db386
YO
4438
4439 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4440 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4441 goto fake_end;
4442
c966426a
DM
4443 {
4444 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4445 if ( max == REG_INFTY || ST.count < max )
4446 goto curlym_do_A; /* try to match another A */
4447 }
40a82448 4448 goto curlym_do_B; /* try to match B */
5f80c4cf 4449
40a82448
DM
4450 case CURLYM_A_fail: /* just failed to match an A */
4451 REGCP_UNWIND(ST.cp);
0a4db386
YO
4452
4453 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4454 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4455 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 4456 sayNO;
0cadcf80 4457
40a82448
DM
4458 curlym_do_B: /* execute the B in /A{m,n}B/ */
4459 PL_reginput = locinput;
4460 if (ST.c1 == CHRTEST_UNINIT) {
4461 /* calculate c1 and c2 for possible match of 1st char
4462 * following curly */
4463 ST.c1 = ST.c2 = CHRTEST_VOID;
4464 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4465 regnode *text_node = ST.B;
4466 if (! HAS_TEXT(text_node))
4467 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
4468 /* this used to be
4469
4470 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4471
4472 But the former is redundant in light of the latter.
4473
4474 if this changes back then the macro for
4475 IS_TEXT and friends need to change.
4476 */
4477 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 4478 {
ee9b8eae 4479
40a82448
DM
4480 ST.c1 = (U8)*STRING(text_node);
4481 ST.c2 =
ee9b8eae 4482 (IS_TEXTF(text_node))
40a82448 4483 ? PL_fold[ST.c1]
ee9b8eae 4484 : (IS_TEXTFL(text_node))
40a82448
DM
4485 ? PL_fold_locale[ST.c1]
4486 : ST.c1;
c277df42 4487 }
c277df42 4488 }
40a82448
DM
4489 }
4490
4491 DEBUG_EXECUTE_r(
4492 PerlIO_printf(Perl_debug_log,
4493 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4494 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4495 "", (IV)ST.count)
4496 );
4497 if (ST.c1 != CHRTEST_VOID
4498 && UCHARAT(PL_reginput) != ST.c1
4499 && UCHARAT(PL_reginput) != ST.c2)
4500 {
4501 /* simulate B failing */
3e901dc0
YO
4502 DEBUG_OPTIMISE_r(
4503 PerlIO_printf(Perl_debug_log,
4504 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4505 (int)(REPORT_CODE_OFF+(depth*2)),"",
4506 (IV)ST.c1,(IV)ST.c2
4507 ));
40a82448
DM
4508 state_num = CURLYM_B_fail;
4509 goto reenter_switch;
4510 }
4511
4512 if (ST.me->flags) {
4513 /* mark current A as captured */
4514 I32 paren = ST.me->flags;
4515 if (ST.count) {
f0ab9afb 4516 PL_regoffs[paren].start
40a82448 4517 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
f0ab9afb 4518 PL_regoffs[paren].end = PL_reginput - PL_bostr;
0a4db386 4519 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4520 }
40a82448 4521 else
f0ab9afb 4522 PL_regoffs[paren].end = -1;
0a4db386 4523 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4524 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4525 {
4526 if (ST.count)
4527 goto fake_end;
4528 else
4529 sayNO;
4530 }
c277df42 4531 }
0a4db386 4532
40a82448 4533 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4534 /* NOTREACHED */
40a82448
DM
4535
4536 case CURLYM_B_fail: /* just failed to match a B */
4537 REGCP_UNWIND(ST.cp);
4538 if (ST.minmod) {
84d2fa14
HS
4539 I32 max = ARG2(ST.me);
4540 if (max != REG_INFTY && ST.count == max)
40a82448
DM
4541 sayNO;
4542 goto curlym_do_A; /* try to match a further A */
4543 }
4544 /* backtrack one A */
4545 if (ST.count == ARG1(ST.me) /* min */)
4546 sayNO;
4547 ST.count--;
4548 locinput = HOPc(locinput, -ST.alen);
4549 goto curlym_do_B; /* try to match B */
4550
c255a977
DM
4551#undef ST
4552#define ST st->u.curly
40a82448 4553
c255a977
DM
4554#define CURLY_SETPAREN(paren, success) \
4555 if (paren) { \
4556 if (success) { \
f0ab9afb
NC
4557 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4558 PL_regoffs[paren].end = locinput - PL_bostr; \
0a4db386 4559 *PL_reglastcloseparen = paren; \
c255a977
DM
4560 } \
4561 else \
f0ab9afb 4562 PL_regoffs[paren].end = -1; \
c255a977
DM
4563 }
4564
4565 case STAR: /* /A*B/ where A is width 1 */
4566 ST.paren = 0;
4567 ST.min = 0;
4568 ST.max = REG_INFTY;
a0d0e21e
LW
4569 scan = NEXTOPER(scan);
4570 goto repeat;
c255a977
DM
4571 case PLUS: /* /A+B/ where A is width 1 */
4572 ST.paren = 0;
4573 ST.min = 1;
4574 ST.max = REG_INFTY;
c277df42 4575 scan = NEXTOPER(scan);
c255a977
DM
4576 goto repeat;
4577 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4578 ST.paren = scan->flags; /* Which paren to set */
4579 if (ST.paren > PL_regsize)
4580 PL_regsize = ST.paren;
3b6647e0 4581 if (ST.paren > *PL_reglastparen)
c255a977
DM
4582 *PL_reglastparen = ST.paren;
4583 ST.min = ARG1(scan); /* min to match */
4584 ST.max = ARG2(scan); /* max to match */
0a4db386 4585 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4586 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4587 ST.min=1;
4588 ST.max=1;
4589 }
c255a977
DM
4590 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4591 goto repeat;
4592 case CURLY: /* /A{m,n}B/ where A is width 1 */
4593 ST.paren = 0;
4594 ST.min = ARG1(scan); /* min to match */
4595 ST.max = ARG2(scan); /* max to match */
4596 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4597 repeat:
a0d0e21e
LW
4598 /*
4599 * Lookahead to avoid useless match attempts
4600 * when we know what character comes next.
c255a977 4601 *
5f80c4cf
JP
4602 * Used to only do .*x and .*?x, but now it allows
4603 * for )'s, ('s and (?{ ... })'s to be in the way
4604 * of the quantifier and the EXACT-like node. -- japhy
4605 */
4606
c255a977
DM
4607 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4608 sayNO;
cca55fe3 4609 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4610 U8 *s;
4611 regnode *text_node = next;
4612
3dab1dad
YO
4613 if (! HAS_TEXT(text_node))
4614 FIND_NEXT_IMPT(text_node);
5f80c4cf 4615
9e137952 4616 if (! HAS_TEXT(text_node))
c255a977 4617 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4618 else {
ee9b8eae 4619 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 4620 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4621 goto assume_ok_easy;
cca55fe3 4622 }
be8e71aa
YO
4623 else
4624 s = (U8*)STRING(text_node);
ee9b8eae
YO
4625
4626 /* Currently we only get here when
4627
4628 PL_rekind[OP(text_node)] == EXACT
4629
4630 if this changes back then the macro for IS_TEXT and
4631 friends need to change. */
5f80c4cf 4632 if (!UTF) {
c255a977 4633 ST.c2 = ST.c1 = *s;
ee9b8eae 4634 if (IS_TEXTF(text_node))
c255a977 4635 ST.c2 = PL_fold[ST.c1];
ee9b8eae 4636 else if (IS_TEXTFL(text_node))
c255a977 4637 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4638 }
5f80c4cf 4639 else { /* UTF */
ee9b8eae 4640 if (IS_TEXTF(text_node)) {
a2a2844f 4641 STRLEN ulen1, ulen2;
89ebb4a3
JH
4642 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4643 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4644
4645 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4646 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4647#ifdef EBCDIC
4648 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4649 ckWARN(WARN_UTF8) ?
4650 0 : UTF8_ALLOW_ANY);
4651 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4652 ckWARN(WARN_UTF8) ?
4653 0 : UTF8_ALLOW_ANY);
4654#else
c255a977 4655 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4656 uniflags);
c255a977 4657 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4658 uniflags);
4659#endif
5f80c4cf
JP
4660 }
4661 else {
c255a977 4662 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4663 uniflags);
5f80c4cf 4664 }
1aa99e6b
IH
4665 }
4666 }
bbce6d69 4667 }
a0d0e21e 4668 else
c255a977 4669 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4670 assume_ok_easy:
c255a977
DM
4671
4672 ST.A = scan;
4673 ST.B = next;
3280af22 4674 PL_reginput = locinput;
24d3c4a9
DM
4675 if (minmod) {
4676 minmod = 0;
e2e6a0f1 4677 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 4678 sayNO;
c255a977 4679 ST.count = ST.min;
a0ed51b3 4680 locinput = PL_reginput;
c255a977
DM
4681 REGCP_SET(ST.cp);
4682 if (ST.c1 == CHRTEST_VOID)
4683 goto curly_try_B_min;
4684
4685 ST.oldloc = locinput;
4686
4687 /* set ST.maxpos to the furthest point along the
4688 * string that could possibly match */
4689 if (ST.max == REG_INFTY) {
4690 ST.maxpos = PL_regeol - 1;
4691 if (do_utf8)
4692 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4693 ST.maxpos--;
4694 }
4695 else if (do_utf8) {
4696 int m = ST.max - ST.min;
4697 for (ST.maxpos = locinput;
4698 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4699 ST.maxpos += UTF8SKIP(ST.maxpos);
4700 }
4701 else {
4702 ST.maxpos = locinput + ST.max - ST.min;
4703 if (ST.maxpos >= PL_regeol)
4704 ST.maxpos = PL_regeol - 1;
4705 }
4706 goto curly_try_B_min_known;
4707
4708 }
4709 else {
e2e6a0f1 4710 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
4711 locinput = PL_reginput;
4712 if (ST.count < ST.min)
4713 sayNO;
4714 if ((ST.count > ST.min)
4715 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4716 {
4717 /* A{m,n} must come at the end of the string, there's
4718 * no point in backing off ... */
4719 ST.min = ST.count;
4720 /* ...except that $ and \Z can match before *and* after
4721 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4722 We may back off by one in this case. */
4723 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4724 ST.min--;
4725 }
4726 REGCP_SET(ST.cp);
4727 goto curly_try_B_max;
4728 }
4729 /* NOTREACHED */
4730
4731
4732 case CURLY_B_min_known_fail:
4733 /* failed to find B in a non-greedy match where c1,c2 valid */
4734 if (ST.paren && ST.count)
f0ab9afb 4735 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4736
4737 PL_reginput = locinput; /* Could be reset... */
4738 REGCP_UNWIND(ST.cp);
4739 /* Couldn't or didn't -- move forward. */
4740 ST.oldloc = locinput;
4741 if (do_utf8)
4742 locinput += UTF8SKIP(locinput);
4743 else
4744 locinput++;
4745 ST.count++;
4746 curly_try_B_min_known:
4747 /* find the next place where 'B' could work, then call B */
4748 {
4749 int n;
4750 if (do_utf8) {
4751 n = (ST.oldloc == locinput) ? 0 : 1;
4752 if (ST.c1 == ST.c2) {
4753 STRLEN len;
4754 /* set n to utf8_distance(oldloc, locinput) */
4755 while (locinput <= ST.maxpos &&
4756 utf8n_to_uvchr((U8*)locinput,
4757 UTF8_MAXBYTES, &len,
4758 uniflags) != (UV)ST.c1) {
4759 locinput += len;
4760 n++;
4761 }
1aa99e6b
IH
4762 }
4763 else {
c255a977
DM
4764 /* set n to utf8_distance(oldloc, locinput) */
4765 while (locinput <= ST.maxpos) {
4766 STRLEN len;
4767 const UV c = utf8n_to_uvchr((U8*)locinput,
4768 UTF8_MAXBYTES, &len,
4769 uniflags);
4770 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4771 break;
4772 locinput += len;
4773 n++;
1aa99e6b 4774 }
0fe9bf95
IZ
4775 }
4776 }
c255a977
DM
4777 else {
4778 if (ST.c1 == ST.c2) {
4779 while (locinput <= ST.maxpos &&
4780 UCHARAT(locinput) != ST.c1)
4781 locinput++;
bbce6d69 4782 }
c255a977
DM
4783 else {
4784 while (locinput <= ST.maxpos
4785 && UCHARAT(locinput) != ST.c1
4786 && UCHARAT(locinput) != ST.c2)
4787 locinput++;
a0ed51b3 4788 }
c255a977
DM
4789 n = locinput - ST.oldloc;
4790 }
4791 if (locinput > ST.maxpos)
4792 sayNO;
4793 /* PL_reginput == oldloc now */
4794 if (n) {
4795 ST.count += n;
e2e6a0f1 4796 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 4797 sayNO;
a0d0e21e 4798 }
c255a977
DM
4799 PL_reginput = locinput;
4800 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4801 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4802 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4803 goto fake_end;
4804 }
c255a977 4805 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4806 }
c255a977
DM
4807 /* NOTREACHED */
4808
4809
4810 case CURLY_B_min_fail:
4811 /* failed to find B in a non-greedy match where c1,c2 invalid */
4812 if (ST.paren && ST.count)
f0ab9afb 4813 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4814
4815 REGCP_UNWIND(ST.cp);
4816 /* failed -- move forward one */
4817 PL_reginput = locinput;
e2e6a0f1 4818 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 4819 ST.count++;
a0ed51b3 4820 locinput = PL_reginput;
c255a977
DM
4821 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4822 ST.count > 0)) /* count overflow ? */
15272685 4823 {
c255a977
DM
4824 curly_try_B_min:
4825 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4826 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4827 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4828 goto fake_end;
4829 }
c255a977 4830 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4831 }
4832 }
4633a7c4 4833 sayNO;
c255a977
DM
4834 /* NOTREACHED */
4835
4836
4837 curly_try_B_max:
4838 /* a successful greedy match: now try to match B */
40d049e4 4839 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4840 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
4841 goto fake_end;
4842 }
c255a977
DM
4843 {
4844 UV c = 0;
4845 if (ST.c1 != CHRTEST_VOID)
4846 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4847 UTF8_MAXBYTES, 0, uniflags)
466787eb 4848 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4849 /* If it could work, try it. */
4850 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4851 CURLY_SETPAREN(ST.paren, ST.count);
4852 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4853 /* NOTREACHED */
4854 }
4855 }
4856 /* FALL THROUGH */
4857 case CURLY_B_max_fail:
4858 /* failed to find B in a greedy match */
4859 if (ST.paren && ST.count)
f0ab9afb 4860 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4861
4862 REGCP_UNWIND(ST.cp);
4863 /* back up. */
4864 if (--ST.count < ST.min)
4865 sayNO;
4866 PL_reginput = locinput = HOPc(locinput, -1);
4867 goto curly_try_B_max;
4868
4869#undef ST
4870
a0d0e21e 4871 case END:
6bda09f9 4872 fake_end:
faec1544
DM
4873 if (cur_eval) {
4874 /* we've just finished A in /(??{A})B/; now continue with B */
4875 I32 tmpix;
faec1544
DM
4876 st->u.eval.toggle_reg_flags
4877 = cur_eval->u.eval.toggle_reg_flags;
4878 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4879
288b8c02
NC
4880 st->u.eval.prev_rex = rex_sv; /* inner */
4881 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
4882 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4883 rexi = RXi_GET(rex);
faec1544 4884 cur_curlyx = cur_eval->u.eval.prev_curlyx;
288b8c02 4885 ReREFCNT_inc(rex_sv);
faec1544 4886 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
34a81e2b
B
4887
4888 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4889 PL_reglastparen = &rex->lastparen;
4890 PL_reglastcloseparen = &rex->lastcloseparen;
4891
faec1544
DM
4892 REGCP_SET(st->u.eval.lastcp);
4893 PL_reginput = locinput;
4894
4895 /* Restore parens of the outer rex without popping the
4896 * savestack */
4897 tmpix = PL_savestack_ix;
4898 PL_savestack_ix = cur_eval->u.eval.lastcp;
4899 regcppop(rex);
4900 PL_savestack_ix = tmpix;
4901
4902 st->u.eval.prev_eval = cur_eval;
4903 cur_eval = cur_eval->u.eval.prev_eval;
4904 DEBUG_EXECUTE_r(
2a49f0f5
JH
4905 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4906 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
4907 if ( nochange_depth )
4908 nochange_depth--;
4909
4910 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
4911 st->u.eval.prev_eval->u.eval.B); /* match B */
4912 }
4913
3b0527fe 4914 if (locinput < reginfo->till) {
a3621e74 4915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4916 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4917 PL_colors[4],
4918 (long)(locinput - PL_reg_starttry),
3b0527fe 4919 (long)(reginfo->till - PL_reg_starttry),
7821416a 4920 PL_colors[5]));
58e23c8d 4921
262b90c4 4922 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4923 }
4924 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4925 sayYES; /* Success! */
dad79028
DM
4926
4927 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4928 DEBUG_EXECUTE_r(
4929 PerlIO_printf(Perl_debug_log,
4930 "%*s %ssubpattern success...%s\n",
5bc10b2c 4931 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4932 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4933 sayYES; /* Success! */
dad79028 4934
40a82448
DM
4935#undef ST
4936#define ST st->u.ifmatch
4937
4938 case SUSPEND: /* (?>A) */
4939 ST.wanted = 1;
9fe1d20c 4940 PL_reginput = locinput;
9041c2e3 4941 goto do_ifmatch;
dad79028 4942
40a82448
DM
4943 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4944 ST.wanted = 0;
dad79028
DM
4945 goto ifmatch_trivial_fail_test;
4946
40a82448
DM
4947 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4948 ST.wanted = 1;
dad79028 4949 ifmatch_trivial_fail_test:
a0ed51b3 4950 if (scan->flags) {
52657f30 4951 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4952 if (!s) {
4953 /* trivial fail */
24d3c4a9
DM
4954 if (logical) {
4955 logical = 0;
4956 sw = 1 - (bool)ST.wanted;
dad79028 4957 }
40a82448 4958 else if (ST.wanted)
dad79028
DM
4959 sayNO;
4960 next = scan + ARG(scan);
4961 if (next == scan)
4962 next = NULL;
4963 break;
4964 }
efb30f32 4965 PL_reginput = s;
a0ed51b3
LW
4966 }
4967 else
4968 PL_reginput = locinput;
4969
c277df42 4970 do_ifmatch:
40a82448 4971 ST.me = scan;
24d3c4a9 4972 ST.logical = logical;
24d786f4
YO
4973 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
4974
40a82448
DM
4975 /* execute body of (?...A) */
4976 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4977 /* NOTREACHED */
4978
4979 case IFMATCH_A_fail: /* body of (?...A) failed */
4980 ST.wanted = !ST.wanted;
4981 /* FALL THROUGH */
4982
4983 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
4984 if (ST.logical) {
4985 sw = (bool)ST.wanted;
40a82448
DM
4986 }
4987 else if (!ST.wanted)
4988 sayNO;
4989
4990 if (OP(ST.me) == SUSPEND)
4991 locinput = PL_reginput;
4992 else {
4993 locinput = PL_reginput = st->locinput;
4994 nextchr = UCHARAT(locinput);
4995 }
4996 scan = ST.me + ARG(ST.me);
4997 if (scan == ST.me)
4998 scan = NULL;
4999 continue; /* execute B */
5000
5001#undef ST
dad79028 5002
c277df42 5003 case LONGJMP:
c277df42
IZ
5004 next = scan + ARG(scan);
5005 if (next == scan)
5006 next = NULL;
a0d0e21e 5007 break;
54612592 5008 case COMMIT:
e2e6a0f1
YO
5009 reginfo->cutpoint = PL_regeol;
5010 /* FALLTHROUGH */
5d458dd8 5011 case PRUNE:
24b23f37 5012 PL_reginput = locinput;
e2e6a0f1 5013 if (!scan->flags)
ad64d0ec 5014 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592
YO
5015 PUSH_STATE_GOTO(COMMIT_next,next);
5016 /* NOTREACHED */
5017 case COMMIT_next_fail:
5018 no_final = 1;
5019 /* FALLTHROUGH */
7f69552c
YO
5020 case OPFAIL:
5021 sayNO;
e2e6a0f1
YO
5022 /* NOTREACHED */
5023
5024#define ST st->u.mark
5025 case MARKPOINT:
5026 ST.prev_mark = mark_state;
5d458dd8 5027 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5028 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5029 mark_state = st;
5030 ST.mark_loc = PL_reginput = locinput;
5031 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5032 /* NOTREACHED */
5033 case MARKPOINT_next:
5034 mark_state = ST.prev_mark;
5035 sayYES;
5036 /* NOTREACHED */
5037 case MARKPOINT_next_fail:
5d458dd8 5038 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5039 {
5040 if (ST.mark_loc > startpoint)
5041 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5042 popmark = NULL; /* we found our mark */
5043 sv_commit = ST.mark_name;
5044
5045 DEBUG_EXECUTE_r({
5d458dd8 5046 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5047 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5048 REPORT_CODE_OFF+depth*2, "",
be2597df 5049 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5050 });
5051 }
5052 mark_state = ST.prev_mark;
5d458dd8
YO
5053 sv_yes_mark = mark_state ?
5054 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
5055 sayNO;
5056 /* NOTREACHED */
5d458dd8
YO
5057 case SKIP:
5058 PL_reginput = locinput;
5059 if (scan->flags) {
2bf803e2 5060 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5061 ST.mark_name = NULL;
e2e6a0f1 5062 ST.mark_loc = locinput;
5d458dd8
YO
5063 PUSH_STATE_GOTO(SKIP_next,next);
5064 } else {
2bf803e2 5065 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5066 otherwise do nothing. Meaning we need to scan
5067 */
5068 regmatch_state *cur = mark_state;
ad64d0ec 5069 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5070
5071 while (cur) {
5072 if ( sv_eq( cur->u.mark.mark_name,
5073 find ) )
5074 {
5075 ST.mark_name = find;
5076 PUSH_STATE_GOTO( SKIP_next, next );
5077 }
5078 cur = cur->u.mark.prev_mark;
5079 }
e2e6a0f1 5080 }
2bf803e2 5081 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5082 break;
5083 case SKIP_next_fail:
5084 if (ST.mark_name) {
5085 /* (*CUT:NAME) - Set up to search for the name as we
5086 collapse the stack*/
5087 popmark = ST.mark_name;
5088 } else {
5089 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5090 if (ST.mark_loc > startpoint)
5091 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5092 /* but we set sv_commit to latest mark_name if there
5093 is one so they can test to see how things lead to this
5094 cut */
5095 if (mark_state)
5096 sv_commit=mark_state->u.mark.mark_name;
5097 }
e2e6a0f1
YO
5098 no_final = 1;
5099 sayNO;
5100 /* NOTREACHED */
5101#undef ST
32e6a07c
YO
5102 case FOLDCHAR:
5103 n = ARG(scan);
81d4fa0f 5104 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
e64b1bd1
YO
5105 locinput += ln;
5106 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5107 sayNO;
5108 } else {
5109 U8 folded[UTF8_MAXBYTES_CASE+1];
5110 STRLEN foldlen;
5111 const char * const l = locinput;
5112 char *e = PL_regeol;
5113 to_uni_fold(n, folded, &foldlen);
5114
59fe32ea 5115 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
e64b1bd1 5116 l, &e, 0, do_utf8)) {
32e6a07c 5117 sayNO;
e64b1bd1
YO
5118 }
5119 locinput = e;
32e6a07c
YO
5120 }
5121 nextchr = UCHARAT(locinput);
5122 break;
e1d1eefb
YO
5123 case LNBREAK:
5124 if ((n=is_LNBREAK(locinput,do_utf8))) {
5125 locinput += n;
5126 nextchr = UCHARAT(locinput);
5127 } else
5128 sayNO;
5129 break;
5130
5131#define CASE_CLASS(nAmE) \
5132 case nAmE: \
5133 if ((n=is_##nAmE(locinput,do_utf8))) { \
5134 locinput += n; \
5135 nextchr = UCHARAT(locinput); \
5136 } else \
5137 sayNO; \
5138 break; \
5139 case N##nAmE: \
5140 if ((n=is_##nAmE(locinput,do_utf8))) { \
5141 sayNO; \
5142 } else { \
5143 locinput += UTF8SKIP(locinput); \
5144 nextchr = UCHARAT(locinput); \
5145 } \
5146 break
5147
5148 CASE_CLASS(VERTWS);
5149 CASE_CLASS(HORIZWS);
5150#undef CASE_CLASS
5151
a0d0e21e 5152 default:
b900a521 5153 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5154 PTR2UV(scan), OP(scan));
cea2e8a9 5155 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5156
5157 } /* end switch */
95b24440 5158
5d458dd8
YO
5159 /* switch break jumps here */
5160 scan = next; /* prepare to execute the next op and ... */
5161 continue; /* ... jump back to the top, reusing st */
95b24440
DM
5162 /* NOTREACHED */
5163
40a82448
DM
5164 push_yes_state:
5165 /* push a state that backtracks on success */
5166 st->u.yes.prev_yes_state = yes_state;
5167 yes_state = st;
5168 /* FALL THROUGH */
5169 push_state:
5170 /* push a new regex state, then continue at scan */
5171 {
5172 regmatch_state *newst;
5173
24b23f37
YO
5174 DEBUG_STACK_r({
5175 regmatch_state *cur = st;
5176 regmatch_state *curyes = yes_state;
5177 int curd = depth;
5178 regmatch_slab *slab = PL_regmatch_slab;
5179 for (;curd > -1;cur--,curd--) {
5180 if (cur < SLAB_FIRST(slab)) {
5181 slab = slab->prev;
5182 cur = SLAB_LAST(slab);
5183 }
5184 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5185 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5186 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5187 (curyes == cur) ? "yes" : ""
5188 );
5189 if (curyes == cur)
5190 curyes = cur->u.yes.prev_yes_state;
5191 }
5192 } else
5193 DEBUG_STATE_pp("push")
5194 );
40a82448 5195 depth++;
40a82448
DM
5196 st->locinput = locinput;
5197 newst = st+1;
5198 if (newst > SLAB_LAST(PL_regmatch_slab))
5199 newst = S_push_slab(aTHX);
5200 PL_regmatch_state = newst;
786e8c11 5201
40a82448
DM
5202 locinput = PL_reginput;
5203 nextchr = UCHARAT(locinput);
5204 st = newst;
5205 continue;
5206 /* NOTREACHED */
5207 }
a0d0e21e 5208 }
a687059c 5209
a0d0e21e
LW
5210 /*
5211 * We get here only if there's trouble -- normally "case END" is
5212 * the terminating point.
5213 */
cea2e8a9 5214 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5215 /*NOTREACHED*/
4633a7c4
LW
5216 sayNO;
5217
262b90c4 5218yes:
77cb431f
DM
5219 if (yes_state) {
5220 /* we have successfully completed a subexpression, but we must now
5221 * pop to the state marked by yes_state and continue from there */
77cb431f 5222 assert(st != yes_state);
5bc10b2c
DM
5223#ifdef DEBUGGING
5224 while (st != yes_state) {
5225 st--;
5226 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5227 PL_regmatch_slab = PL_regmatch_slab->prev;
5228 st = SLAB_LAST(PL_regmatch_slab);
5229 }
e2e6a0f1 5230 DEBUG_STATE_r({
54612592
YO
5231 if (no_final) {
5232 DEBUG_STATE_pp("pop (no final)");
5233 } else {
5234 DEBUG_STATE_pp("pop (yes)");
5235 }
e2e6a0f1 5236 });
5bc10b2c
DM
5237 depth--;
5238 }
5239#else
77cb431f
DM
5240 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5241 || yes_state > SLAB_LAST(PL_regmatch_slab))
5242 {
5243 /* not in this slab, pop slab */
5244 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5245 PL_regmatch_slab = PL_regmatch_slab->prev;
5246 st = SLAB_LAST(PL_regmatch_slab);
5247 }
5248 depth -= (st - yes_state);
5bc10b2c 5249#endif
77cb431f
DM
5250 st = yes_state;
5251 yes_state = st->u.yes.prev_yes_state;
5252 PL_regmatch_state = st;
24b23f37 5253
5d458dd8
YO
5254 if (no_final) {
5255 locinput= st->locinput;
5256 nextchr = UCHARAT(locinput);
5257 }
54612592 5258 state_num = st->resume_state + no_final;
24d3c4a9 5259 goto reenter_switch;
77cb431f
DM
5260 }
5261
a3621e74 5262 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5263 PL_colors[4], PL_colors[5]));
02db2b7b 5264
19b95bf0
DM
5265 if (PL_reg_eval_set) {
5266 /* each successfully executed (?{...}) block does the equivalent of
5267 * local $^R = do {...}
5268 * When popping the save stack, all these locals would be undone;
5269 * bypass this by setting the outermost saved $^R to the latest
5270 * value */
5271 if (oreplsv != GvSV(PL_replgv))
5272 sv_setsv(oreplsv, GvSV(PL_replgv));
5273 }
95b24440 5274 result = 1;
aa283a38 5275 goto final_exit;
4633a7c4
LW
5276
5277no:
a3621e74 5278 DEBUG_EXECUTE_r(
7821416a 5279 PerlIO_printf(Perl_debug_log,
786e8c11 5280 "%*s %sfailed...%s\n",
5bc10b2c 5281 REPORT_CODE_OFF+depth*2, "",
786e8c11 5282 PL_colors[4], PL_colors[5])
7821416a 5283 );
aa283a38 5284
262b90c4 5285no_silent:
54612592
YO
5286 if (no_final) {
5287 if (yes_state) {
5288 goto yes;
5289 } else {
5290 goto final_exit;
5291 }
5292 }
aa283a38
DM
5293 if (depth) {
5294 /* there's a previous state to backtrack to */
40a82448
DM
5295 st--;
5296 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5297 PL_regmatch_slab = PL_regmatch_slab->prev;
5298 st = SLAB_LAST(PL_regmatch_slab);
5299 }
5300 PL_regmatch_state = st;
40a82448
DM
5301 locinput= st->locinput;
5302 nextchr = UCHARAT(locinput);
5303
5bc10b2c
DM
5304 DEBUG_STATE_pp("pop");
5305 depth--;
262b90c4
DM
5306 if (yes_state == st)
5307 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5308
24d3c4a9
DM
5309 state_num = st->resume_state + 1; /* failure = success + 1 */
5310 goto reenter_switch;
95b24440 5311 }
24d3c4a9 5312 result = 0;
aa283a38 5313
262b90c4 5314 final_exit:
bbe252da 5315 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5316 SV *sv_err = get_sv("REGERROR", 1);
5317 SV *sv_mrk = get_sv("REGMARK", 1);
5318 if (result) {
e2e6a0f1 5319 sv_commit = &PL_sv_no;
5d458dd8
YO
5320 if (!sv_yes_mark)
5321 sv_yes_mark = &PL_sv_yes;
5322 } else {
5323 if (!sv_commit)
5324 sv_commit = &PL_sv_yes;
5325 sv_yes_mark = &PL_sv_no;
5326 }
5327 sv_setsv(sv_err, sv_commit);
5328 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5329 }
19b95bf0 5330
2f554ef7
DM
5331 /* clean up; in particular, free all slabs above current one */
5332 LEAVE_SCOPE(oldsave);
5d9a96ca 5333
95b24440 5334 return result;
a687059c
LW
5335}
5336
5337/*
5338 - regrepeat - repeatedly match something simple, report how many
5339 */
5340/*
5341 * [This routine now assumes that it will only match on things of length 1.
5342 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5343 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5344 */
76e3520e 5345STATIC I32
e2e6a0f1 5346S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5347{
27da23d5 5348 dVAR;
a0d0e21e 5349 register char *scan;
a0d0e21e 5350 register I32 c;
3280af22 5351 register char *loceol = PL_regeol;
a0ed51b3 5352 register I32 hardcount = 0;
53c4c00c 5353 register bool do_utf8 = PL_reg_match_utf8;
4f55667c
SP
5354#ifndef DEBUGGING
5355 PERL_UNUSED_ARG(depth);
5356#endif
a0d0e21e 5357
7918f24d
NC
5358 PERL_ARGS_ASSERT_REGREPEAT;
5359
3280af22 5360 scan = PL_reginput;
faf11cac
HS
5361 if (max == REG_INFTY)
5362 max = I32_MAX;
5363 else if (max < loceol - scan)
7f596f4c 5364 loceol = scan + max;
a0d0e21e 5365 switch (OP(p)) {
22c35a8c 5366 case REG_ANY:
1aa99e6b 5367 if (do_utf8) {
ffc61ed2 5368 loceol = PL_regeol;
1aa99e6b 5369 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5370 scan += UTF8SKIP(scan);
5371 hardcount++;
5372 }
5373 } else {
5374 while (scan < loceol && *scan != '\n')
5375 scan++;
a0ed51b3
LW
5376 }
5377 break;
ffc61ed2 5378 case SANY:
def8e4ea
JH
5379 if (do_utf8) {
5380 loceol = PL_regeol;
a0804c9e 5381 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5382 scan += UTF8SKIP(scan);
5383 hardcount++;
5384 }
5385 }
5386 else
5387 scan = loceol;
a0ed51b3 5388 break;
f33976b4
DB
5389 case CANY:
5390 scan = loceol;
5391 break;
090f7165
JH
5392 case EXACT: /* length of string is 1 */
5393 c = (U8)*STRING(p);
5394 while (scan < loceol && UCHARAT(scan) == c)
5395 scan++;
bbce6d69 5396 break;
5397 case EXACTF: /* length of string is 1 */
cd439c50 5398 c = (U8)*STRING(p);
bbce6d69 5399 while (scan < loceol &&
22c35a8c 5400 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 5401 scan++;
5402 break;
5403 case EXACTFL: /* length of string is 1 */
3280af22 5404 PL_reg_flags |= RF_tainted;
cd439c50 5405 c = (U8)*STRING(p);
bbce6d69 5406 while (scan < loceol &&
22c35a8c 5407 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
5408 scan++;
5409 break;
5410 case ANYOF:
ffc61ed2
JH
5411 if (do_utf8) {
5412 loceol = PL_regeol;
cfc92286 5413 while (hardcount < max && scan < loceol &&
32fc9b6a 5414 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
5415 scan += UTF8SKIP(scan);
5416 hardcount++;
5417 }
5418 } else {
32fc9b6a 5419 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
5420 scan++;
5421 }
a0d0e21e
LW
5422 break;
5423 case ALNUM:
1aa99e6b 5424 if (do_utf8) {
ffc61ed2 5425 loceol = PL_regeol;
1a4fad37 5426 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5427 while (hardcount < max && scan < loceol &&
3568d838 5428 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5429 scan += UTF8SKIP(scan);
5430 hardcount++;
5431 }
5432 } else {
5433 while (scan < loceol && isALNUM(*scan))
5434 scan++;
a0ed51b3
LW
5435 }
5436 break;
bbce6d69 5437 case ALNUML:
3280af22 5438 PL_reg_flags |= RF_tainted;
1aa99e6b 5439 if (do_utf8) {
ffc61ed2 5440 loceol = PL_regeol;
1aa99e6b
IH
5441 while (hardcount < max && scan < loceol &&
5442 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5443 scan += UTF8SKIP(scan);
5444 hardcount++;
5445 }
5446 } else {
5447 while (scan < loceol && isALNUM_LC(*scan))
5448 scan++;
a0ed51b3
LW
5449 }
5450 break;
a0d0e21e 5451 case NALNUM:
1aa99e6b 5452 if (do_utf8) {
ffc61ed2 5453 loceol = PL_regeol;
1a4fad37 5454 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5455 while (hardcount < max && scan < loceol &&
3568d838 5456 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5457 scan += UTF8SKIP(scan);
5458 hardcount++;
5459 }
5460 } else {
5461 while (scan < loceol && !isALNUM(*scan))
5462 scan++;
a0ed51b3
LW
5463 }
5464 break;
bbce6d69 5465 case NALNUML:
3280af22 5466 PL_reg_flags |= RF_tainted;
1aa99e6b 5467 if (do_utf8) {
ffc61ed2 5468 loceol = PL_regeol;
1aa99e6b
IH
5469 while (hardcount < max && scan < loceol &&
5470 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5471 scan += UTF8SKIP(scan);
5472 hardcount++;
5473 }
5474 } else {
5475 while (scan < loceol && !isALNUM_LC(*scan))
5476 scan++;
a0ed51b3
LW
5477 }
5478 break;
a0d0e21e 5479 case SPACE:
1aa99e6b 5480 if (do_utf8) {
ffc61ed2 5481 loceol = PL_regeol;
1a4fad37 5482 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5483 while (hardcount < max && scan < loceol &&
3568d838
JH
5484 (*scan == ' ' ||
5485 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5486 scan += UTF8SKIP(scan);
5487 hardcount++;
5488 }
5489 } else {
5490 while (scan < loceol && isSPACE(*scan))
5491 scan++;
a0ed51b3
LW
5492 }
5493 break;
bbce6d69 5494 case SPACEL:
3280af22 5495 PL_reg_flags |= RF_tainted;
1aa99e6b 5496 if (do_utf8) {
ffc61ed2 5497 loceol = PL_regeol;
1aa99e6b 5498 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5499 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5500 scan += UTF8SKIP(scan);
5501 hardcount++;
5502 }
5503 } else {
5504 while (scan < loceol && isSPACE_LC(*scan))
5505 scan++;
a0ed51b3
LW
5506 }
5507 break;
a0d0e21e 5508 case NSPACE:
1aa99e6b 5509 if (do_utf8) {
ffc61ed2 5510 loceol = PL_regeol;
1a4fad37 5511 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5512 while (hardcount < max && scan < loceol &&
3568d838
JH
5513 !(*scan == ' ' ||
5514 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5515 scan += UTF8SKIP(scan);
5516 hardcount++;
5517 }
5518 } else {
5519 while (scan < loceol && !isSPACE(*scan))
5520 scan++;
a0ed51b3 5521 }
0008a298 5522 break;
bbce6d69 5523 case NSPACEL:
3280af22 5524 PL_reg_flags |= RF_tainted;
1aa99e6b 5525 if (do_utf8) {
ffc61ed2 5526 loceol = PL_regeol;
1aa99e6b 5527 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5528 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5529 scan += UTF8SKIP(scan);
5530 hardcount++;
5531 }
5532 } else {
5533 while (scan < loceol && !isSPACE_LC(*scan))
5534 scan++;
a0ed51b3
LW
5535 }
5536 break;
a0d0e21e 5537 case DIGIT:
1aa99e6b 5538 if (do_utf8) {
ffc61ed2 5539 loceol = PL_regeol;
1a4fad37 5540 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5541 while (hardcount < max && scan < loceol &&
3568d838 5542 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5543 scan += UTF8SKIP(scan);
5544 hardcount++;
5545 }
5546 } else {
5547 while (scan < loceol && isDIGIT(*scan))
5548 scan++;
a0ed51b3
LW
5549 }
5550 break;
a0d0e21e 5551 case NDIGIT:
1aa99e6b 5552 if (do_utf8) {
ffc61ed2 5553 loceol = PL_regeol;
1a4fad37 5554 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5555 while (hardcount < max && scan < loceol &&
3568d838 5556 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5557 scan += UTF8SKIP(scan);
5558 hardcount++;
5559 }
5560 } else {
5561 while (scan < loceol && !isDIGIT(*scan))
5562 scan++;
a0ed51b3 5563 }
e1d1eefb
YO
5564 case LNBREAK:
5565 if (do_utf8) {
5566 loceol = PL_regeol;
5567 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5568 scan += c;
5569 hardcount++;
5570 }
5571 } else {
5572 /*
5573 LNBREAK can match two latin chars, which is ok,
5574 because we have a null terminated string, but we
5575 have to use hardcount in this situation
5576 */
5577 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5578 scan+=c;
5579 hardcount++;
5580 }
5581 }
5582 break;
5583 case HORIZWS:
5584 if (do_utf8) {
5585 loceol = PL_regeol;
5586 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5587 scan += c;
5588 hardcount++;
5589 }
5590 } else {
5591 while (scan < loceol && is_HORIZWS_latin1(scan))
5592 scan++;
5593 }
a0ed51b3 5594 break;
e1d1eefb
YO
5595 case NHORIZWS:
5596 if (do_utf8) {
5597 loceol = PL_regeol;
5598 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5599 scan += UTF8SKIP(scan);
5600 hardcount++;
5601 }
5602 } else {
5603 while (scan < loceol && !is_HORIZWS_latin1(scan))
5604 scan++;
5605
5606 }
5607 break;
5608 case VERTWS:
5609 if (do_utf8) {
5610 loceol = PL_regeol;
5611 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5612 scan += c;
5613 hardcount++;
5614 }
5615 } else {
5616 while (scan < loceol && is_VERTWS_latin1(scan))
5617 scan++;
5618
5619 }
5620 break;
5621 case NVERTWS:
5622 if (do_utf8) {
5623 loceol = PL_regeol;
5624 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5625 scan += UTF8SKIP(scan);
5626 hardcount++;
5627 }
5628 } else {
5629 while (scan < loceol && !is_VERTWS_latin1(scan))
5630 scan++;
5631
5632 }
5633 break;
5634
a0d0e21e
LW
5635 default: /* Called on something of 0 width. */
5636 break; /* So match right here or not at all. */
5637 }
a687059c 5638
a0ed51b3
LW
5639 if (hardcount)
5640 c = hardcount;
5641 else
5642 c = scan - PL_reginput;
3280af22 5643 PL_reginput = scan;
a687059c 5644
a3621e74 5645 DEBUG_r({
e68ec53f 5646 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5647 DEBUG_EXECUTE_r({
e68ec53f
YO
5648 SV * const prop = sv_newmortal();
5649 regprop(prog, prop, p);
5650 PerlIO_printf(Perl_debug_log,
be8e71aa 5651 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 5652 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5653 });
be8e71aa 5654 });
9041c2e3 5655
a0d0e21e 5656 return(c);
a687059c
LW
5657}
5658
c277df42 5659
be8e71aa 5660#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5661/*
ffc61ed2
JH
5662- regclass_swash - prepare the utf8 swash
5663*/
5664
5665SV *
32fc9b6a 5666Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5667{
97aff369 5668 dVAR;
9e55ce06
JH
5669 SV *sw = NULL;
5670 SV *si = NULL;
5671 SV *alt = NULL;
f8fc2ecf
YO
5672 RXi_GET_DECL(prog,progi);
5673 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 5674
7918f24d
NC
5675 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5676
4f639d21 5677 if (data && data->count) {
a3b680e6 5678 const U32 n = ARG(node);
ffc61ed2 5679
4f639d21 5680 if (data->what[n] == 's') {
ad64d0ec
NC
5681 SV * const rv = MUTABLE_SV(data->data[n]);
5682 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 5683 SV **const ary = AvARRAY(av);
9e55ce06 5684 SV **a, **b;
9041c2e3 5685
711a919c 5686 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5687 * documentation of these array elements. */
5688
b11f357e 5689 si = *ary;
fe5bfecd
JH
5690 a = SvROK(ary[1]) ? &ary[1] : NULL;
5691 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
b11f357e 5692
ffc61ed2
JH
5693 if (a)
5694 sw = *a;
5695 else if (si && doinit) {
5696 sw = swash_init("utf8", "", si, 1, 0);
5697 (void)av_store(av, 1, sw);
5698 }
9e55ce06
JH
5699 if (b)
5700 alt = *b;
ffc61ed2
JH
5701 }
5702 }
5703
9e55ce06
JH
5704 if (listsvp)
5705 *listsvp = si;
5706 if (altsvp)
5707 *altsvp = alt;
ffc61ed2
JH
5708
5709 return sw;
5710}
76234dfb 5711#endif
ffc61ed2
JH
5712
5713/*
ba7b4546 5714 - reginclass - determine if a character falls into a character class
832705d4
JH
5715
5716 The n is the ANYOF regnode, the p is the target string, lenp
5717 is pointer to the maximum length of how far to go in the p
5718 (if the lenp is zero, UTF8SKIP(p) is used),
5719 do_utf8 tells whether the target string is in UTF-8.
5720
bbce6d69 5721 */
5722
76e3520e 5723STATIC bool
32fc9b6a 5724S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5725{
27da23d5 5726 dVAR;
a3b680e6 5727 const char flags = ANYOF_FLAGS(n);
bbce6d69 5728 bool match = FALSE;
cc07378b 5729 UV c = *p;
ae9ddab8 5730 STRLEN len = 0;
9e55ce06 5731 STRLEN plen;
1aa99e6b 5732
7918f24d
NC
5733 PERL_ARGS_ASSERT_REGINCLASS;
5734
19f67299
TS
5735 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5736 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5737 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5738 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5739 if (len == (STRLEN)-1)
5740 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5741 }
bbce6d69 5742
0f0076b4 5743 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5744 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5745 if (lenp)
5746 *lenp = 0;
ffc61ed2 5747 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5748 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5749 match = TRUE;
bbce6d69 5750 }
3568d838 5751 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5752 match = TRUE;
ffc61ed2 5753 if (!match) {
9e55ce06 5754 AV *av;
32fc9b6a 5755 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5756
5757 if (sw) {
3f0c5693
KW
5758 U8 * utf8_p;
5759 if (do_utf8) {
5760 utf8_p = (U8 *) p;
5761 } else {
5762 STRLEN len = 1;
5763 utf8_p = bytes_to_utf8(p, &len);
5764 }
5765 if (swash_fetch(sw, utf8_p, 1))
ffc61ed2
JH
5766 match = TRUE;
5767 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5768 if (!match && lenp && av) {
5769 I32 i;
9e55ce06 5770 for (i = 0; i <= av_len(av); i++) {
890ce7af 5771 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5772 STRLEN len;
890ce7af 5773 const char * const s = SvPV_const(sv, len);
3f0c5693 5774 if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
9e55ce06
JH
5775 *lenp = len;
5776 match = TRUE;
5777 break;
5778 }
5779 }
5780 }
5781 if (!match) {
89ebb4a3 5782 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43 5783
3f0c5693
KW
5784 STRLEN tmplen;
5785 to_utf8_fold(utf8_p, tmpbuf, &tmplen);
5786 if (swash_fetch(sw, tmpbuf, 1))
9e55ce06
JH
5787 match = TRUE;
5788 }
ffc61ed2 5789 }
b3a04dd3
KW
5790
5791 /* If we allocated a string above, free it */
5792 if (! do_utf8) Safefree(utf8_p);
ffc61ed2 5793 }
bbce6d69 5794 }
9e55ce06 5795 if (match && lenp && *lenp == 0)
0f0076b4 5796 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5797 }
1aa99e6b 5798 if (!match && c < 256) {
ffc61ed2
JH
5799 if (ANYOF_BITMAP_TEST(n, c))
5800 match = TRUE;
5801 else if (flags & ANYOF_FOLD) {
eb160463 5802 U8 f;
a0ed51b3 5803
ffc61ed2
JH
5804 if (flags & ANYOF_LOCALE) {
5805 PL_reg_flags |= RF_tainted;
5806 f = PL_fold_locale[c];
5807 }
5808 else
5809 f = PL_fold[c];
5810 if (f != c && ANYOF_BITMAP_TEST(n, f))
5811 match = TRUE;
5812 }
5813
5814 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5815 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5816 if (
5817 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5818 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5819 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5820 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5821 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5822 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5823 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5824 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5825 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5826 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5827 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5828 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5829 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5830 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5831 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5832 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5833 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5834 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5835 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5836 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5837 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5838 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5839 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5840 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5841 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5842 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5843 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5844 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5845 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5846 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5847 ) /* How's that for a conditional? */
5848 {
5849 match = TRUE;
5850 }
a0ed51b3 5851 }
a0ed51b3
LW
5852 }
5853
a0ed51b3
LW
5854 return (flags & ANYOF_INVERT) ? !match : match;
5855}
161b471a 5856
dfe13c55 5857STATIC U8 *
0ce71af7 5858S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5859{
97aff369 5860 dVAR;
7918f24d
NC
5861
5862 PERL_ARGS_ASSERT_REGHOP3;
5863
a0ed51b3 5864 if (off >= 0) {
1aa99e6b 5865 while (off-- && s < lim) {
ffc61ed2 5866 /* XXX could check well-formedness here */
a0ed51b3 5867 s += UTF8SKIP(s);
ffc61ed2 5868 }
a0ed51b3
LW
5869 }
5870 else {
1de06328
YO
5871 while (off++ && s > lim) {
5872 s--;
5873 if (UTF8_IS_CONTINUED(*s)) {
5874 while (s > lim && UTF8_IS_CONTINUATION(*s))
5875 s--;
a0ed51b3 5876 }
1de06328 5877 /* XXX could check well-formedness here */
a0ed51b3
LW
5878 }
5879 }
5880 return s;
5881}
161b471a 5882
f9f4320a
YO
5883#ifdef XXX_dmq
5884/* there are a bunch of places where we use two reghop3's that should
5885 be replaced with this routine. but since thats not done yet
5886 we ifdef it out - dmq
5887*/
dfe13c55 5888STATIC U8 *
1de06328
YO
5889S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5890{
5891 dVAR;
7918f24d
NC
5892
5893 PERL_ARGS_ASSERT_REGHOP4;
5894
1de06328
YO
5895 if (off >= 0) {
5896 while (off-- && s < rlim) {
5897 /* XXX could check well-formedness here */
5898 s += UTF8SKIP(s);
5899 }
5900 }
5901 else {
5902 while (off++ && s > llim) {
5903 s--;
5904 if (UTF8_IS_CONTINUED(*s)) {
5905 while (s > llim && UTF8_IS_CONTINUATION(*s))
5906 s--;
5907 }
5908 /* XXX could check well-formedness here */
5909 }
5910 }
5911 return s;
5912}
f9f4320a 5913#endif
1de06328
YO
5914
5915STATIC U8 *
0ce71af7 5916S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5917{
97aff369 5918 dVAR;
7918f24d
NC
5919
5920 PERL_ARGS_ASSERT_REGHOPMAYBE3;
5921
a0ed51b3 5922 if (off >= 0) {
1aa99e6b 5923 while (off-- && s < lim) {
ffc61ed2 5924 /* XXX could check well-formedness here */
a0ed51b3 5925 s += UTF8SKIP(s);
ffc61ed2 5926 }
a0ed51b3 5927 if (off >= 0)
3dab1dad 5928 return NULL;
a0ed51b3
LW
5929 }
5930 else {
1de06328
YO
5931 while (off++ && s > lim) {
5932 s--;
5933 if (UTF8_IS_CONTINUED(*s)) {
5934 while (s > lim && UTF8_IS_CONTINUATION(*s))
5935 s--;
a0ed51b3 5936 }
1de06328 5937 /* XXX could check well-formedness here */
a0ed51b3
LW
5938 }
5939 if (off <= 0)
3dab1dad 5940 return NULL;
a0ed51b3
LW
5941 }
5942 return s;
5943}
51371543 5944
51371543 5945static void
acfe0abc 5946restore_pos(pTHX_ void *arg)
51371543 5947{
97aff369 5948 dVAR;
097eb12c 5949 regexp * const rex = (regexp *)arg;
51371543
GS
5950 if (PL_reg_eval_set) {
5951 if (PL_reg_oldsaved) {
4f639d21
DM
5952 rex->subbeg = PL_reg_oldsaved;
5953 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5954#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5955 rex->saved_copy = PL_nrs;
ed252734 5956#endif
07bc277f 5957 RXp_MATCH_COPIED_on(rex);
51371543
GS
5958 }
5959 PL_reg_magic->mg_len = PL_reg_oldpos;
5960 PL_reg_eval_set = 0;
5961 PL_curpm = PL_reg_oldcurpm;
5962 }
5963}
33b8afdf
JH
5964
5965STATIC void
5966S_to_utf8_substr(pTHX_ register regexp *prog)
5967{
a1cac82e 5968 int i = 1;
7918f24d
NC
5969
5970 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
5971
a1cac82e
NC
5972 do {
5973 if (prog->substrs->data[i].substr
5974 && !prog->substrs->data[i].utf8_substr) {
5975 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5976 prog->substrs->data[i].utf8_substr = sv;
5977 sv_utf8_upgrade(sv);
610460f9
NC
5978 if (SvVALID(prog->substrs->data[i].substr)) {
5979 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5980 if (flags & FBMcf_TAIL) {
5981 /* Trim the trailing \n that fbm_compile added last
5982 time. */
5983 SvCUR_set(sv, SvCUR(sv) - 1);
5984 /* Whilst this makes the SV technically "invalid" (as its
5985 buffer is no longer followed by "\0") when fbm_compile()
5986 adds the "\n" back, a "\0" is restored. */
5987 }
5988 fbm_compile(sv, flags);
5989 }
a1cac82e
NC
5990 if (prog->substrs->data[i].substr == prog->check_substr)
5991 prog->check_utf8 = sv;
5992 }
5993 } while (i--);
33b8afdf
JH
5994}
5995
5996STATIC void
5997S_to_byte_substr(pTHX_ register regexp *prog)
5998{
97aff369 5999 dVAR;
a1cac82e 6000 int i = 1;
7918f24d
NC
6001
6002 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6003
a1cac82e
NC
6004 do {
6005 if (prog->substrs->data[i].utf8_substr
6006 && !prog->substrs->data[i].substr) {
6007 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6008 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9
NC
6009 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6010 const U8 flags
6011 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6012 if (flags & FBMcf_TAIL) {
6013 /* Trim the trailing \n that fbm_compile added last
6014 time. */
6015 SvCUR_set(sv, SvCUR(sv) - 1);
6016 }
6017 fbm_compile(sv, flags);
6018 }
a1cac82e
NC
6019 } else {
6020 SvREFCNT_dec(sv);
6021 sv = &PL_sv_undef;
6022 }
6023 prog->substrs->data[i].substr = sv;
6024 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6025 prog->check_substr = sv;
33b8afdf 6026 }
a1cac82e 6027 } while (i--);
33b8afdf 6028}
66610fdd
RGS
6029
6030/*
6031 * Local variables:
6032 * c-indentation-style: bsd
6033 * c-basic-offset: 4
6034 * indent-tabs-mode: t
6035 * End:
6036 *
37442d52
RGS
6037 * ex: set ts=8 sts=4 sw=4 noet:
6038 */