This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #56526] m/a{1,0}/ compiles but doesn't match a literal string
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
60 **** by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
faec1544 81
ab3bbdeb 82#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
e7409c1b 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
126/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
127#define JUMPABLE(rn) ( \
128 OP(rn) == OPEN || \
129 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == EVAL || \
cca55fe3
JP
131 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
132 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 133 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 134 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 135)
ee9b8eae 136#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 137
ee9b8eae
YO
138#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
139
140#if 0
141/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
142 we don't need this definition. */
143#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
144#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
145#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
146
147#else
148/* ... so we use this as its faster. */
149#define IS_TEXT(rn) ( OP(rn)==EXACT )
150#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
151#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
152
153#endif
e2d8ce26 154
a84d97b6
HS
155/*
156 Search for mandatory following text node; for lookahead, the text must
157 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158*/
cca55fe3 159#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
160 while (JUMPABLE(rn)) { \
161 const OPCODE type = OP(rn); \
162 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 163 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 164 else if (type == PLUS) \
cca55fe3 165 rn = NEXTOPER(rn); \
3dab1dad 166 else if (type == IFMATCH) \
a84d97b6 167 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 168 else rn += NEXT_OFF(rn); \
3dab1dad 169 } \
5f80c4cf 170} STMT_END
74750237 171
c476f425 172
acfe0abc 173static void restore_pos(pTHX_ void *arg);
51371543 174
76e3520e 175STATIC CHECKPOINT
cea2e8a9 176S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 177{
97aff369 178 dVAR;
a3b680e6 179 const int retval = PL_savestack_ix;
b1ce53c5 180#define REGCP_PAREN_ELEMS 4
a3b680e6 181 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 182 int p;
40a82448 183 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 184
e49a9654
IH
185 if (paren_elems_to_push < 0)
186 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187
f0ab9afb 188#define REGCP_OTHER_ELEMS 7
4b3c1a47 189 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 190
3280af22 191 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 192/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
193 SSPUSHINT(PL_regoffs[p].end);
194 SSPUSHINT(PL_regoffs[p].start);
3280af22 195 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 196 SSPUSHINT(p);
e7707071 197 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 198 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 199 (UV)p, (IV)PL_regoffs[p].start,
40a82448 200 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 201 (IV)PL_regoffs[p].end
40a82448 202 ));
a0d0e21e 203 }
b1ce53c5 204/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 205 SSPUSHPTR(PL_regoffs);
3280af22
NIS
206 SSPUSHINT(PL_regsize);
207 SSPUSHINT(*PL_reglastparen);
a01268b5 208 SSPUSHINT(*PL_reglastcloseparen);
3280af22 209 SSPUSHPTR(PL_reginput);
41123dfd
JH
210#define REGCP_FRAME_ELEMS 2
211/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
212 * are needed for the regexp context stack bookkeeping. */
213 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 214 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 215
a0d0e21e
LW
216 return retval;
217}
218
c277df42 219/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
220#define REGCP_SET(cp) \
221 DEBUG_STATE_r( \
ab3bbdeb 222 PerlIO_printf(Perl_debug_log, \
e4f74956 223 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
224 (IV)PL_savestack_ix)); \
225 cp = PL_savestack_ix
c3464db5 226
ab3bbdeb 227#define REGCP_UNWIND(cp) \
e4f74956 228 DEBUG_STATE_r( \
ab3bbdeb 229 if (cp != PL_savestack_ix) \
e4f74956
YO
230 PerlIO_printf(Perl_debug_log, \
231 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
232 (IV)(cp), (IV)PL_savestack_ix)); \
233 regcpblow(cp)
c277df42 234
76e3520e 235STATIC char *
097eb12c 236S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 237{
97aff369 238 dVAR;
3b6647e0 239 U32 i;
a0d0e21e 240 char *input;
a3621e74
YO
241 GET_RE_DEBUG_FLAGS_DECL;
242
7918f24d
NC
243 PERL_ARGS_ASSERT_REGCPPOP;
244
b1ce53c5 245 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 246 i = SSPOPINT;
b1ce53c5
JH
247 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
248 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 249 input = (char *) SSPOPPTR;
a01268b5 250 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
251 *PL_reglastparen = SSPOPINT;
252 PL_regsize = SSPOPINT;
f0ab9afb 253 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 254
6bda09f9 255
b1ce53c5 256 /* Now restore the parentheses context. */
41123dfd
JH
257 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
258 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 259 I32 tmps;
097eb12c 260 U32 paren = (U32)SSPOPINT;
3280af22 261 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 262 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 263 tmps = SSPOPINT;
3280af22 264 if (paren <= *PL_reglastparen)
f0ab9afb 265 PL_regoffs[paren].end = tmps;
e7707071 266 DEBUG_BUFFERS_r(
c3464db5 267 PerlIO_printf(Perl_debug_log,
b900a521 268 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 269 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 270 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 271 (IV)PL_regoffs[paren].end,
3280af22 272 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 273 );
a0d0e21e 274 }
e7707071 275 DEBUG_BUFFERS_r(
bb7a0f54 276 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 277 PerlIO_printf(Perl_debug_log,
faccc32b 278 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 279 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
280 }
281 );
daf18116 282#if 1
dafc8851
JH
283 /* It would seem that the similar code in regtry()
284 * already takes care of this, and in fact it is in
285 * a better location to since this code can #if 0-ed out
286 * but the code in regtry() is needed or otherwise tests
287 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
288 * (as of patchlevel 7877) will fail. Then again,
289 * this code seems to be necessary or otherwise
225593e1
DM
290 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
291 * --jhi updated by dapm */
3b6647e0 292 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 293 if (i > PL_regsize)
f0ab9afb
NC
294 PL_regoffs[i].start = -1;
295 PL_regoffs[i].end = -1;
a0d0e21e 296 }
dafc8851 297#endif
a0d0e21e
LW
298 return input;
299}
300
02db2b7b 301#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 302
a687059c 303/*
e50aee73 304 * pregexec and friends
a687059c
LW
305 */
306
76234dfb 307#ifndef PERL_IN_XSUB_RE
a687059c 308/*
c277df42 309 - pregexec - match a regexp against a string
a687059c 310 */
c277df42 311I32
49d7dfbc 312Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 313 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
314/* strend: pointer to null at end of string */
315/* strbeg: real beginning of string */
316/* minend: end of match must be >=minend after stringarg. */
317/* nosave: For optimizations. */
318{
7918f24d
NC
319 PERL_ARGS_ASSERT_PREGEXEC;
320
c277df42 321 return
9041c2e3 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
323 nosave ? 0 : REXEC_COPY_STR);
324}
76234dfb 325#endif
22e551b9 326
9041c2e3 327/*
cad2e5aa
JH
328 * Need to implement the following flags for reg_anch:
329 *
330 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
331 * USE_INTUIT_ML
332 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
333 * INTUIT_AUTORITATIVE_ML
334 * INTUIT_ONCE_NOML - Intuit can match in one location only.
335 * INTUIT_ONCE_ML
336 *
337 * Another flag for this function: SECOND_TIME (so that float substrs
338 * with giant delta may be not rechecked).
339 */
340
341/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
342
3f7c398e 343/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
344 Otherwise, only SvCUR(sv) is used to get strbeg. */
345
346/* XXXX We assume that strpos is strbeg unless sv. */
347
6eb5f6b9
JH
348/* XXXX Some places assume that there is a fixed substring.
349 An update may be needed if optimizer marks as "INTUITable"
350 RExen without fixed substrings. Similarly, it is assumed that
351 lengths of all the strings are no more than minlen, thus they
352 cannot come from lookahead.
40d049e4
YO
353 (Or minlen should take into account lookahead.)
354 NOTE: Some of this comment is not correct. minlen does now take account
355 of lookahead/behind. Further research is required. -- demerphq
356
357*/
6eb5f6b9 358
2c2d71f5
JH
359/* A failure to find a constant substring means that there is no need to make
360 an expensive call to REx engine, thus we celebrate a failure. Similarly,
361 finding a substring too deep into the string means that less calls to
30944b6d
IZ
362 regtry() should be needed.
363
364 REx compiler's optimizer found 4 possible hints:
365 a) Anchored substring;
366 b) Fixed substring;
367 c) Whether we are anchored (beginning-of-line or \G);
368 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 369 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
370 string which does not contradict any of them.
371 */
2c2d71f5 372
6eb5f6b9
JH
373/* Most of decisions we do here should have been done at compile time.
374 The nodes of the REx which we used for the search should have been
375 deleted from the finite automaton. */
376
cad2e5aa 377char *
288b8c02 378Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 379 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 380{
97aff369 381 dVAR;
288b8c02 382 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 383 register I32 start_shift = 0;
cad2e5aa 384 /* Should be nonnegative! */
b7953727 385 register I32 end_shift = 0;
2c2d71f5
JH
386 register char *s;
387 register SV *check;
a1933d95 388 char *strbeg;
cad2e5aa 389 char *t;
1de06328 390 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 391 I32 ml_anch;
bd61b366
SS
392 register char *other_last = NULL; /* other substr checked before this */
393 char *check_at = NULL; /* check substr found at this pos */
bbe252da 394 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 395 RXi_GET_DECL(prog,progi);
30944b6d 396#ifdef DEBUGGING
890ce7af 397 const char * const i_strpos = strpos;
30944b6d 398#endif
a3621e74
YO
399 GET_RE_DEBUG_FLAGS_DECL;
400
7918f24d
NC
401 PERL_ARGS_ASSERT_RE_INTUIT_START;
402
288b8c02 403 RX_MATCH_UTF8_set(rx,do_utf8);
cad2e5aa 404
3c8556c3 405 if (RX_UTF8(rx)) {
b8d68ded
JH
406 PL_reg_flags |= RF_utf8;
407 }
ab3bbdeb 408 DEBUG_EXECUTE_r(
efd26800 409 debug_start_match(rx, do_utf8, strpos, strend,
1de06328
YO
410 sv ? "Guessing start of match in sv for"
411 : "Guessing start of match in string for");
2a782b5b 412 );
cad2e5aa 413
c344f387
JH
414 /* CHR_DIST() would be more correct here but it makes things slow. */
415 if (prog->minlen > strend - strpos) {
a3621e74 416 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 417 "String too short... [re_intuit_start]\n"));
cad2e5aa 418 goto fail;
2c2d71f5 419 }
1de06328 420
a1933d95 421 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 422 PL_regeol = strend;
33b8afdf
JH
423 if (do_utf8) {
424 if (!prog->check_utf8 && prog->check_substr)
425 to_utf8_substr(prog);
426 check = prog->check_utf8;
427 } else {
428 if (!prog->check_substr && prog->check_utf8)
429 to_byte_substr(prog);
430 check = prog->check_substr;
431 }
1de06328 432 if (check == &PL_sv_undef) {
a3621e74 433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 434 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
435 goto fail;
436 }
bbe252da
YO
437 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
438 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
439 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 440 && !multiline ) ); /* Check after \n? */
cad2e5aa 441
7e25d62c 442 if (!ml_anch) {
bbe252da
YO
443 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
444 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 445 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
446 && sv && !SvROK(sv)
447 && (strpos != strbeg)) {
a3621e74 448 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
449 goto fail;
450 }
451 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 452 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 453 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
454 I32 slen;
455
1aa99e6b 456 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 457
653099ff
GS
458 if (SvTAIL(check)) {
459 slen = SvCUR(check); /* >= 1 */
cad2e5aa 460
9041c2e3 461 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 462 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 463 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 464 goto fail_finish;
cad2e5aa
JH
465 }
466 /* Now should match s[0..slen-2] */
467 slen--;
3f7c398e 468 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 469 || (slen > 1
3f7c398e 470 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 471 report_neq:
a3621e74 472 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
473 goto fail_finish;
474 }
cad2e5aa 475 }
3f7c398e 476 else if (*SvPVX_const(check) != *s
653099ff 477 || ((slen = SvCUR(check)) > 1
3f7c398e 478 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 479 goto report_neq;
c315bfe8 480 check_at = s;
2c2d71f5 481 goto success_at_start;
7e25d62c 482 }
cad2e5aa 483 }
2c2d71f5 484 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 485 s = strpos;
2c2d71f5 486 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
487 end_shift = prog->check_end_shift;
488
2c2d71f5 489 if (!ml_anch) {
a3b680e6 490 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 491 - (SvTAIL(check) != 0);
a3b680e6 492 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
493
494 if (end_shift < eshift)
495 end_shift = eshift;
496 }
cad2e5aa 497 }
2c2d71f5 498 else { /* Can match at random position */
cad2e5aa
JH
499 ml_anch = 0;
500 s = strpos;
1de06328
YO
501 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
502 end_shift = prog->check_end_shift;
503
504 /* end shift should be non negative here */
cad2e5aa
JH
505 }
506
bcdf7404 507#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 508 if (end_shift < 0)
1de06328 509 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 510 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
511#endif
512
2c2d71f5
JH
513 restart:
514 /* Find a possible match in the region s..strend by looking for
515 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
516
517 {
518 I32 srch_start_shift = start_shift;
519 I32 srch_end_shift = end_shift;
520 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
521 srch_end_shift -= ((strbeg - s) - srch_start_shift);
522 srch_start_shift = strbeg - s;
523 }
6bda09f9 524 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
525 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
526 (IV)prog->check_offset_min,
527 (IV)srch_start_shift,
528 (IV)srch_end_shift,
529 (IV)prog->check_end_shift);
530 });
531
cad2e5aa 532 if (flags & REXEC_SCREAM) {
cad2e5aa 533 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 534 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 535
2c2d71f5
JH
536 if (PL_screamfirst[BmRARE(check)] >= 0
537 || ( BmRARE(check) == '\n'
85c508c3 538 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 539 && SvTAIL(check) ))
9041c2e3 540 s = screaminstr(sv, check,
1de06328 541 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 542 else
2c2d71f5 543 goto fail_finish;
4addbd3b 544 /* we may be pointing at the wrong string */
07bc277f 545 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 546 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
547 if (data)
548 *data->scream_olds = s;
549 }
1de06328
YO
550 else {
551 U8* start_point;
552 U8* end_point;
bbe252da 553 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
554 start_point= (U8*)(s + srch_start_shift);
555 end_point= (U8*)(strend - srch_end_shift);
556 } else {
557 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
558 end_point= HOP3(strend, -srch_end_shift, strbeg);
559 }
6bda09f9 560 DEBUG_OPTIMISE_MORE_r({
56570a2c 561 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 562 (int)(end_point - start_point),
fc8cd66c 563 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
564 start_point);
565 });
566
567 s = fbm_instr( start_point, end_point,
7fba1cd6 568 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
569 }
570 }
cad2e5aa
JH
571 /* Update the count-of-usability, remove useless subpatterns,
572 unshift s. */
2c2d71f5 573
ab3bbdeb
YO
574 DEBUG_EXECUTE_r({
575 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
576 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
577 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 578 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
579 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
580 ? "anchored" : "floating"),
581 quoted,
582 RE_SV_TAIL(check),
583 (s ? " at offset " : "...\n") );
584 });
2c2d71f5
JH
585
586 if (!s)
587 goto fail_finish;
2c2d71f5 588 /* Finish the diagnostic message */
a3621e74 589 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 590
1de06328
YO
591 /* XXX dmq: first branch is for positive lookbehind...
592 Our check string is offset from the beginning of the pattern.
593 So we need to do any stclass tests offset forward from that
594 point. I think. :-(
595 */
596
597
598
599 check_at=s;
600
601
2c2d71f5
JH
602 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
603 Start with the other substr.
604 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 605 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
606 *always* match. Probably should be marked during compile...
607 Probably it is right to do no SCREAM here...
608 */
609
1de06328
YO
610 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
611 : (prog->float_substr && prog->anchored_substr))
612 {
30944b6d 613 /* Take into account the "other" substring. */
2c2d71f5
JH
614 /* XXXX May be hopelessly wrong for UTF... */
615 if (!other_last)
6eb5f6b9 616 other_last = strpos;
33b8afdf 617 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
618 do_other_anchored:
619 {
890ce7af
AL
620 char * const last = HOP3c(s, -start_shift, strbeg);
621 char *last1, *last2;
be8e71aa 622 char * const saved_s = s;
33b8afdf 623 SV* must;
2c2d71f5 624
2c2d71f5
JH
625 t = s - prog->check_offset_max;
626 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 627 && (!do_utf8
0ce71af7 628 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 629 && t > strpos)))
6f207bd3 630 NOOP;
2c2d71f5
JH
631 else
632 t = strpos;
1aa99e6b 633 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
634 if (t < other_last) /* These positions already checked */
635 t = other_last;
1aa99e6b 636 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
637 if (last < last1)
638 last1 = last;
1de06328
YO
639 /* XXXX It is not documented what units *_offsets are in.
640 We assume bytes, but this is clearly wrong.
641 Meaning this code needs to be carefully reviewed for errors.
642 dmq.
643 */
644
2c2d71f5 645 /* On end-of-str: see comment below. */
33b8afdf
JH
646 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
647 if (must == &PL_sv_undef) {
648 s = (char*)NULL;
1de06328 649 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
650 }
651 else
652 s = fbm_instr(
653 (unsigned char*)t,
654 HOP3(HOP3(last1, prog->anchored_offset, strend)
655 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
656 must,
7fba1cd6 657 multiline ? FBMrf_MULTILINE : 0
33b8afdf 658 );
ab3bbdeb
YO
659 DEBUG_EXECUTE_r({
660 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
661 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
662 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 663 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
664 quoted, RE_SV_TAIL(must));
665 });
666
667
2c2d71f5
JH
668 if (!s) {
669 if (last1 >= last2) {
a3621e74 670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
671 ", giving up...\n"));
672 goto fail_finish;
673 }
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 675 ", trying floating at offset %ld...\n",
be8e71aa 676 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
677 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
678 s = HOP3c(last, 1, strend);
2c2d71f5
JH
679 goto restart;
680 }
681 else {
a3621e74 682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 683 (long)(s - i_strpos)));
1aa99e6b
IH
684 t = HOP3c(s, -prog->anchored_offset, strbeg);
685 other_last = HOP3c(s, 1, strend);
be8e71aa 686 s = saved_s;
2c2d71f5
JH
687 if (t == strpos)
688 goto try_at_start;
2c2d71f5
JH
689 goto try_at_offset;
690 }
30944b6d 691 }
2c2d71f5
JH
692 }
693 else { /* Take into account the floating substring. */
33b8afdf 694 char *last, *last1;
be8e71aa 695 char * const saved_s = s;
33b8afdf
JH
696 SV* must;
697
698 t = HOP3c(s, -start_shift, strbeg);
699 last1 = last =
700 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
701 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
702 last = HOP3c(t, prog->float_max_offset, strend);
703 s = HOP3c(t, prog->float_min_offset, strend);
704 if (s < other_last)
705 s = other_last;
2c2d71f5 706 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
707 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
708 /* fbm_instr() takes into account exact value of end-of-str
709 if the check is SvTAIL(ed). Since false positives are OK,
710 and end-of-str is not later than strend we are OK. */
711 if (must == &PL_sv_undef) {
712 s = (char*)NULL;
1de06328 713 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
714 }
715 else
2c2d71f5 716 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
717 (unsigned char*)last + SvCUR(must)
718 - (SvTAIL(must)!=0),
7fba1cd6 719 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
720 DEBUG_EXECUTE_r({
721 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
722 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
723 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 724 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
725 quoted, RE_SV_TAIL(must));
726 });
33b8afdf
JH
727 if (!s) {
728 if (last1 == last) {
a3621e74 729 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
730 ", giving up...\n"));
731 goto fail_finish;
2c2d71f5 732 }
a3621e74 733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 734 ", trying anchored starting at offset %ld...\n",
be8e71aa 735 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
736 other_last = last;
737 s = HOP3c(t, 1, strend);
738 goto restart;
739 }
740 else {
a3621e74 741 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
742 (long)(s - i_strpos)));
743 other_last = s; /* Fix this later. --Hugo */
be8e71aa 744 s = saved_s;
33b8afdf
JH
745 if (t == strpos)
746 goto try_at_start;
747 goto try_at_offset;
748 }
2c2d71f5 749 }
cad2e5aa 750 }
2c2d71f5 751
1de06328 752
9ef43ace 753 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 754
6bda09f9 755 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
756 PerlIO_printf(Perl_debug_log,
757 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
758 (IV)prog->check_offset_min,
759 (IV)prog->check_offset_max,
760 (IV)(s-strpos),
761 (IV)(t-strpos),
762 (IV)(t-s),
763 (IV)(strend-strpos)
764 )
765 );
766
2c2d71f5 767 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 768 && (!do_utf8
9ef43ace 769 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
770 && t > strpos)))
771 {
2c2d71f5
JH
772 /* Fixed substring is found far enough so that the match
773 cannot start at strpos. */
774 try_at_offset:
cad2e5aa 775 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
776 /* Eventually fbm_*() should handle this, but often
777 anchored_offset is not 0, so this check will not be wasted. */
778 /* XXXX In the code below we prefer to look for "^" even in
779 presence of anchored substrings. And we search even
780 beyond the found float position. These pessimizations
781 are historical artefacts only. */
782 find_anchor:
2c2d71f5 783 while (t < strend - prog->minlen) {
cad2e5aa 784 if (*t == '\n') {
4ee3650e 785 if (t < check_at - prog->check_offset_min) {
33b8afdf 786 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
787 /* Since we moved from the found position,
788 we definitely contradict the found anchored
30944b6d
IZ
789 substr. Due to the above check we do not
790 contradict "check" substr.
791 Thus we can arrive here only if check substr
792 is float. Redo checking for "other"=="fixed".
793 */
9041c2e3 794 strpos = t + 1;
a3621e74 795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 796 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
797 goto do_other_anchored;
798 }
4ee3650e
GS
799 /* We don't contradict the found floating substring. */
800 /* XXXX Why not check for STCLASS? */
cad2e5aa 801 s = t + 1;
a3621e74 802 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 803 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
804 goto set_useful;
805 }
4ee3650e
GS
806 /* Position contradicts check-string */
807 /* XXXX probably better to look for check-string
808 than for "\n", so one should lower the limit for t? */
a3621e74 809 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 810 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 811 other_last = strpos = s = t + 1;
cad2e5aa
JH
812 goto restart;
813 }
814 t++;
815 }
a3621e74 816 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 817 PL_colors[0], PL_colors[1]));
2c2d71f5 818 goto fail_finish;
cad2e5aa 819 }
f5952150 820 else {
a3621e74 821 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 822 PL_colors[0], PL_colors[1]));
f5952150 823 }
cad2e5aa
JH
824 s = t;
825 set_useful:
33b8afdf 826 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
827 }
828 else {
f5952150 829 /* The found string does not prohibit matching at strpos,
2c2d71f5 830 - no optimization of calling REx engine can be performed,
f5952150
GS
831 unless it was an MBOL and we are not after MBOL,
832 or a future STCLASS check will fail this. */
2c2d71f5
JH
833 try_at_start:
834 /* Even in this situation we may use MBOL flag if strpos is offset
835 wrt the start of the string. */
05b4157f 836 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 837 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 838 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 839 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 840 {
cad2e5aa
JH
841 t = strpos;
842 goto find_anchor;
843 }
a3621e74 844 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 845 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 846 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 847 );
2c2d71f5 848 success_at_start:
bbe252da 849 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
850 && (do_utf8 ? (
851 prog->check_utf8 /* Could be deleted already */
852 && --BmUSEFUL(prog->check_utf8) < 0
853 && (prog->check_utf8 == prog->float_utf8)
854 ) : (
855 prog->check_substr /* Could be deleted already */
856 && --BmUSEFUL(prog->check_substr) < 0
857 && (prog->check_substr == prog->float_substr)
858 )))
66e933ab 859 {
cad2e5aa 860 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 861 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
862 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
863 if (do_utf8 ? prog->check_substr : prog->check_utf8)
864 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
865 prog->check_substr = prog->check_utf8 = NULL; /* disable */
866 prog->float_substr = prog->float_utf8 = NULL; /* clear */
867 check = NULL; /* abort */
cad2e5aa 868 s = strpos;
3cf5c195
IZ
869 /* XXXX This is a remnant of the old implementation. It
870 looks wasteful, since now INTUIT can use many
6eb5f6b9 871 other heuristics. */
bbe252da 872 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
873 }
874 else
875 s = strpos;
876 }
877
6eb5f6b9
JH
878 /* Last resort... */
879 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
880 /* trie stclasses are too expensive to use here, we are better off to
881 leave it to regmatch itself */
f8fc2ecf 882 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
883 /* minlen == 0 is possible if regstclass is \b or \B,
884 and the fixed substr is ''$.
885 Since minlen is already taken into account, s+1 is before strend;
886 accidentally, minlen >= 1 guaranties no false positives at s + 1
887 even for \b or \B. But (minlen? 1 : 0) below assumes that
888 regstclass does not come from lookahead... */
889 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
890 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
891 const U8* const str = (U8*)STRING(progi->regstclass);
892 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
893 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 894 : 1);
1de06328
YO
895 char * endpos;
896 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
897 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
898 else if (prog->float_substr || prog->float_utf8)
899 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
900 else
901 endpos= strend;
902
70685ca0
JH
903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
904 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 905
6eb5f6b9 906 t = s;
f8fc2ecf 907 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
908 if (!s) {
909#ifdef DEBUGGING
cbbf8932 910 const char *what = NULL;
6eb5f6b9
JH
911#endif
912 if (endpos == strend) {
a3621e74 913 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
914 "Could not match STCLASS...\n") );
915 goto fail;
916 }
a3621e74 917 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 918 "This position contradicts STCLASS...\n") );
bbe252da 919 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 920 goto fail;
6eb5f6b9 921 /* Contradict one of substrings */
33b8afdf
JH
922 if (prog->anchored_substr || prog->anchored_utf8) {
923 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 924 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 925 hop_and_restart:
1aa99e6b 926 s = HOP3c(t, 1, strend);
66e933ab
GS
927 if (s + start_shift + end_shift > strend) {
928 /* XXXX Should be taken into account earlier? */
a3621e74 929 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
930 "Could not match STCLASS...\n") );
931 goto fail;
932 }
5e39e1e5
HS
933 if (!check)
934 goto giveup;
a3621e74 935 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 936 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
937 what, (long)(s + start_shift - i_strpos)) );
938 goto restart;
939 }
66e933ab 940 /* Have both, check_string is floating */
6eb5f6b9
JH
941 if (t + start_shift >= check_at) /* Contradicts floating=check */
942 goto retry_floating_check;
943 /* Recheck anchored substring, but not floating... */
9041c2e3 944 s = check_at;
5e39e1e5
HS
945 if (!check)
946 goto giveup;
a3621e74 947 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 948 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
949 (long)(other_last - i_strpos)) );
950 goto do_other_anchored;
951 }
60e71179
GS
952 /* Another way we could have checked stclass at the
953 current position only: */
954 if (ml_anch) {
955 s = t = t + 1;
5e39e1e5
HS
956 if (!check)
957 goto giveup;
a3621e74 958 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 959 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 960 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 961 goto try_at_offset;
66e933ab 962 }
33b8afdf 963 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 964 goto fail;
6eb5f6b9
JH
965 /* Check is floating subtring. */
966 retry_floating_check:
967 t = check_at - start_shift;
a3621e74 968 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
969 goto hop_and_restart;
970 }
b7953727 971 if (t != s) {
a3621e74 972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 973 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
974 (long)(t - i_strpos), (long)(s - i_strpos))
975 );
976 }
977 else {
a3621e74 978 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
979 "Does not contradict STCLASS...\n");
980 );
981 }
6eb5f6b9 982 }
5e39e1e5 983 giveup:
a3621e74 984 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
985 PL_colors[4], (check ? "Guessed" : "Giving up"),
986 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 987 return s;
2c2d71f5
JH
988
989 fail_finish: /* Substring not found */
33b8afdf
JH
990 if (prog->check_substr || prog->check_utf8) /* could be removed already */
991 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 992 fail:
a3621e74 993 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 994 PL_colors[4], PL_colors[5]));
bd61b366 995 return NULL;
cad2e5aa 996}
9661b544 997
a0a388a1
YO
998#define DECL_TRIE_TYPE(scan) \
999 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1000 trie_type = (scan->flags != EXACT) \
1001 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1002 : (do_utf8 ? trie_utf8 : trie_plain)
3b0527fe 1003
55eed653
NC
1004#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1005uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1006 switch (trie_type) { \
1007 case trie_utf8_fold: \
1008 if ( foldlen>0 ) { \
1009 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1010 foldlen -= len; \
1011 uscan += len; \
1012 len=0; \
1013 } else { \
1014 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1015 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1016 foldlen -= UNISKIP( uvc ); \
1017 uscan = foldbuf + UNISKIP( uvc ); \
1018 } \
1019 break; \
a0a388a1
YO
1020 case trie_latin_utf8_fold: \
1021 if ( foldlen>0 ) { \
1022 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1023 foldlen -= len; \
1024 uscan += len; \
1025 len=0; \
1026 } else { \
1027 len = 1; \
1028 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1029 foldlen -= UNISKIP( uvc ); \
1030 uscan = foldbuf + UNISKIP( uvc ); \
1031 } \
1032 break; \
4cadc6a9
YO
1033 case trie_utf8: \
1034 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1035 break; \
1036 case trie_plain: \
1037 uvc = (UV)*uc; \
1038 len = 1; \
1039 } \
1040 \
1041 if (uvc < 256) { \
1042 charid = trie->charmap[ uvc ]; \
1043 } \
1044 else { \
1045 charid = 0; \
55eed653
NC
1046 if (widecharmap) { \
1047 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1048 (char*)&uvc, sizeof(UV), 0); \
1049 if (svpp) \
1050 charid = (U16)SvIV(*svpp); \
1051 } \
1052 } \
1053} STMT_END
1054
a0a388a1
YO
1055#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1056{ \
1057 char *my_strend= (char *)strend; \
4cadc6a9
YO
1058 if ( (CoNd) \
1059 && (ln == len || \
a0a388a1 1060 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
4cadc6a9 1061 m, NULL, ln, (bool)UTF)) \
a0a388a1 1062 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1063 goto got_it; \
1064 else { \
1065 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1066 uvchr_to_utf8(tmpbuf, c); \
1067 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1068 if ( f != c \
1069 && (f == c1 || f == c2) \
a0a388a1
YO
1070 && (ln == len || \
1071 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1072 m, NULL, ln, (bool)UTF)) \
1073 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1074 goto got_it; \
1075 } \
a0a388a1
YO
1076} \
1077s += len
4cadc6a9
YO
1078
1079#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1080STMT_START { \
1081 while (s <= e) { \
1082 if ( (CoNd) \
1083 && (ln == 1 || !(OP(c) == EXACTF \
1084 ? ibcmp(s, m, ln) \
1085 : ibcmp_locale(s, m, ln))) \
24b23f37 1086 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1087 goto got_it; \
1088 s++; \
1089 } \
1090} STMT_END
1091
1092#define REXEC_FBC_UTF8_SCAN(CoDe) \
1093STMT_START { \
1094 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1095 CoDe \
1096 s += uskip; \
1097 } \
1098} STMT_END
1099
1100#define REXEC_FBC_SCAN(CoDe) \
1101STMT_START { \
1102 while (s < strend) { \
1103 CoDe \
1104 s++; \
1105 } \
1106} STMT_END
1107
1108#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1109REXEC_FBC_UTF8_SCAN( \
1110 if (CoNd) { \
24b23f37 1111 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1112 goto got_it; \
1113 else \
1114 tmp = doevery; \
1115 } \
1116 else \
1117 tmp = 1; \
1118)
1119
1120#define REXEC_FBC_CLASS_SCAN(CoNd) \
1121REXEC_FBC_SCAN( \
1122 if (CoNd) { \
24b23f37 1123 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1124 goto got_it; \
1125 else \
1126 tmp = doevery; \
1127 } \
1128 else \
1129 tmp = 1; \
1130)
1131
1132#define REXEC_FBC_TRYIT \
24b23f37 1133if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1134 goto got_it
1135
e1d1eefb
YO
1136#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1137 if (do_utf8) { \
1138 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1139 } \
1140 else { \
1141 REXEC_FBC_CLASS_SCAN(CoNd); \
1142 } \
1143 break
1144
4cadc6a9
YO
1145#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1146 if (do_utf8) { \
1147 UtFpReLoAd; \
1148 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1149 } \
1150 else { \
1151 REXEC_FBC_CLASS_SCAN(CoNd); \
1152 } \
1153 break
1154
1155#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1156 PL_reg_flags |= RF_tainted; \
1157 if (do_utf8) { \
1158 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1159 } \
1160 else { \
1161 REXEC_FBC_CLASS_SCAN(CoNd); \
1162 } \
1163 break
1164
786e8c11
YO
1165#define DUMP_EXEC_POS(li,s,doutf8) \
1166 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1167
1168/* We know what class REx starts with. Try to find this position... */
1169/* if reginfo is NULL, its a dryrun */
1170/* annoyingly all the vars in this routine have different names from their counterparts
1171 in regmatch. /grrr */
1172
3c3eec57 1173STATIC char *
07be1b83 1174S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1175 const char *strend, regmatch_info *reginfo)
a687059c 1176{
27da23d5 1177 dVAR;
bbe252da 1178 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1179 char *m;
d8093b23 1180 STRLEN ln;
5dab1207 1181 STRLEN lnc;
078c425b 1182 register STRLEN uskip;
d8093b23
G
1183 unsigned int c1;
1184 unsigned int c2;
6eb5f6b9
JH
1185 char *e;
1186 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1187 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf 1188 RXi_GET_DECL(prog,progi);
7918f24d
NC
1189
1190 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1191
6eb5f6b9
JH
1192 /* We know what class it must start with. */
1193 switch (OP(c)) {
6eb5f6b9 1194 case ANYOF:
388cc4de 1195 if (do_utf8) {
4cadc6a9 1196 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1197 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1198 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1199 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1200 }
1201 else {
1202 while (s < strend) {
1203 STRLEN skip = 1;
1204
32fc9b6a 1205 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1206 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1207 /* The assignment of 2 is intentional:
1208 * for the folded sharp s, the skip is 2. */
1209 (skip = SHARP_S_SKIP))) {
24b23f37 1210 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1211 goto got_it;
1212 else
1213 tmp = doevery;
1214 }
1215 else
1216 tmp = 1;
1217 s += skip;
1218 }
a0d0e21e 1219 }
6eb5f6b9 1220 break;
f33976b4 1221 case CANY:
4cadc6a9 1222 REXEC_FBC_SCAN(
24b23f37 1223 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1224 goto got_it;
1225 else
1226 tmp = doevery;
4cadc6a9 1227 );
f33976b4 1228 break;
6eb5f6b9 1229 case EXACTF:
5dab1207
NIS
1230 m = STRING(c);
1231 ln = STR_LEN(c); /* length to match in octets/bytes */
1232 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1233 if (UTF) {
a2a2844f 1234 STRLEN ulen1, ulen2;
5dab1207 1235 U8 *sm = (U8 *) m;
89ebb4a3
JH
1236 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1237 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1238 /* used by commented-out code below */
1239 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1240
1241 /* XXX: Since the node will be case folded at compile
1242 time this logic is a little odd, although im not
1243 sure that its actually wrong. --dmq */
1244
1245 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1246 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1247
1248 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1249 codepoint of the first character in the converted
1250 form, yet originally we did the extra step.
1251 No tests fail by commenting this code out however
1252 so Ive left it out. -- dmq.
1253
89ebb4a3 1254 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1255 0, uniflags);
89ebb4a3 1256 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1257 0, uniflags);
a0a388a1
YO
1258 */
1259
5dab1207
NIS
1260 lnc = 0;
1261 while (sm < ((U8 *) m + ln)) {
1262 lnc++;
1263 sm += UTF8SKIP(sm);
1264 }
1aa99e6b
IH
1265 }
1266 else {
1267 c1 = *(U8*)m;
1268 c2 = PL_fold[c1];
1269 }
6eb5f6b9
JH
1270 goto do_exactf;
1271 case EXACTFL:
5dab1207
NIS
1272 m = STRING(c);
1273 ln = STR_LEN(c);
1274 lnc = (I32) ln;
d8093b23 1275 c1 = *(U8*)m;
6eb5f6b9
JH
1276 c2 = PL_fold_locale[c1];
1277 do_exactf:
db12adc6 1278 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1279
3b0527fe 1280 if (!reginfo && e < s)
6eb5f6b9 1281 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1282
60a8b682
JH
1283 /* The idea in the EXACTF* cases is to first find the
1284 * first character of the EXACTF* node and then, if
1285 * necessary, case-insensitively compare the full
1286 * text of the node. The c1 and c2 are the first
1287 * characters (though in Unicode it gets a bit
1288 * more complicated because there are more cases
7f16dd3d
JH
1289 * than just upper and lower: one needs to use
1290 * the so-called folding case for case-insensitive
1291 * matching (called "loose matching" in Unicode).
1292 * ibcmp_utf8() will do just that. */
60a8b682 1293
a0a388a1 1294 if (do_utf8 || UTF) {
575cac57 1295 UV c, f;
89ebb4a3 1296 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1297 STRLEN len = 1;
1298 STRLEN foldlen;
4ad0818d 1299 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1300 if (c1 == c2) {
5dab1207
NIS
1301 /* Upper and lower of 1st char are equal -
1302 * probably not a "letter". */
1aa99e6b 1303 while (s <= e) {
a0a388a1
YO
1304 if (do_utf8) {
1305 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1306 uniflags);
a0a388a1
YO
1307 } else {
1308 c = *((U8*)s);
1309 }
4cadc6a9 1310 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1311 }
09091399
JH
1312 }
1313 else {
1aa99e6b 1314 while (s <= e) {
a0a388a1
YO
1315 if (do_utf8) {
1316 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1317 uniflags);
a0a388a1
YO
1318 } else {
1319 c = *((U8*)s);
1320 }
80aecb99 1321
60a8b682 1322 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1323 * Note that not all the possible combinations
1324 * are handled here: some of them are handled
1325 * by the standard folding rules, and some of
1326 * them (the character class or ANYOF cases)
1327 * are handled during compiletime in
1328 * regexec.c:S_regclass(). */
880bd946
JH
1329 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1330 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1331 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1332
4cadc6a9 1333 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1334 }
09091399 1335 }
1aa99e6b
IH
1336 }
1337 else {
a0a388a1 1338 /* Neither pattern nor string are UTF8 */
1aa99e6b 1339 if (c1 == c2)
4cadc6a9 1340 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1341 else
4cadc6a9 1342 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1343 }
1344 break;
bbce6d69 1345 case BOUNDL:
3280af22 1346 PL_reg_flags |= RF_tainted;
bbce6d69 1347 /* FALL THROUGH */
a0d0e21e 1348 case BOUND:
ffc61ed2 1349 if (do_utf8) {
12d33761 1350 if (s == PL_bostr)
ffc61ed2
JH
1351 tmp = '\n';
1352 else {
6136c704 1353 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1354 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1355 }
1356 tmp = ((OP(c) == BOUND ?
9041c2e3 1357 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1358 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1359 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1360 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1361 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1362 isALNUM_LC_utf8((U8*)s)))
1363 {
1364 tmp = !tmp;
4cadc6a9 1365 REXEC_FBC_TRYIT;
a687059c 1366 }
4cadc6a9 1367 );
a0d0e21e 1368 }
667bb95a 1369 else {
12d33761 1370 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1371 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1372 REXEC_FBC_SCAN(
ffc61ed2
JH
1373 if (tmp ==
1374 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1375 tmp = !tmp;
4cadc6a9 1376 REXEC_FBC_TRYIT;
a0ed51b3 1377 }
4cadc6a9 1378 );
a0ed51b3 1379 }
24b23f37 1380 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1381 goto got_it;
1382 break;
bbce6d69 1383 case NBOUNDL:
3280af22 1384 PL_reg_flags |= RF_tainted;
bbce6d69 1385 /* FALL THROUGH */
a0d0e21e 1386 case NBOUND:
ffc61ed2 1387 if (do_utf8) {
12d33761 1388 if (s == PL_bostr)
ffc61ed2
JH
1389 tmp = '\n';
1390 else {
6136c704 1391 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1392 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1393 }
1394 tmp = ((OP(c) == NBOUND ?
9041c2e3 1395 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1396 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1397 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1398 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1399 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1400 isALNUM_LC_utf8((U8*)s)))
1401 tmp = !tmp;
4cadc6a9
YO
1402 else REXEC_FBC_TRYIT;
1403 );
a0d0e21e 1404 }
667bb95a 1405 else {
12d33761 1406 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1407 tmp = ((OP(c) == NBOUND ?
1408 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1409 REXEC_FBC_SCAN(
ffc61ed2
JH
1410 if (tmp ==
1411 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1412 tmp = !tmp;
4cadc6a9
YO
1413 else REXEC_FBC_TRYIT;
1414 );
a0ed51b3 1415 }
24b23f37 1416 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1417 goto got_it;
1418 break;
a0d0e21e 1419 case ALNUM:
4cadc6a9
YO
1420 REXEC_FBC_CSCAN_PRELOAD(
1421 LOAD_UTF8_CHARCLASS_ALNUM(),
1422 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1423 isALNUM(*s)
1424 );
bbce6d69 1425 case ALNUML:
4cadc6a9
YO
1426 REXEC_FBC_CSCAN_TAINT(
1427 isALNUM_LC_utf8((U8*)s),
1428 isALNUM_LC(*s)
1429 );
a0d0e21e 1430 case NALNUM:
4cadc6a9
YO
1431 REXEC_FBC_CSCAN_PRELOAD(
1432 LOAD_UTF8_CHARCLASS_ALNUM(),
1433 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1434 !isALNUM(*s)
1435 );
bbce6d69 1436 case NALNUML:
4cadc6a9
YO
1437 REXEC_FBC_CSCAN_TAINT(
1438 !isALNUM_LC_utf8((U8*)s),
1439 !isALNUM_LC(*s)
1440 );
a0d0e21e 1441 case SPACE:
4cadc6a9
YO
1442 REXEC_FBC_CSCAN_PRELOAD(
1443 LOAD_UTF8_CHARCLASS_SPACE(),
1444 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1445 isSPACE(*s)
1446 );
bbce6d69 1447 case SPACEL:
4cadc6a9
YO
1448 REXEC_FBC_CSCAN_TAINT(
1449 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1450 isSPACE_LC(*s)
1451 );
a0d0e21e 1452 case NSPACE:
4cadc6a9
YO
1453 REXEC_FBC_CSCAN_PRELOAD(
1454 LOAD_UTF8_CHARCLASS_SPACE(),
1455 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1456 !isSPACE(*s)
1457 );
bbce6d69 1458 case NSPACEL:
4cadc6a9
YO
1459 REXEC_FBC_CSCAN_TAINT(
1460 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1461 !isSPACE_LC(*s)
1462 );
a0d0e21e 1463 case DIGIT:
4cadc6a9
YO
1464 REXEC_FBC_CSCAN_PRELOAD(
1465 LOAD_UTF8_CHARCLASS_DIGIT(),
1466 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1467 isDIGIT(*s)
1468 );
b8c5462f 1469 case DIGITL:
4cadc6a9
YO
1470 REXEC_FBC_CSCAN_TAINT(
1471 isDIGIT_LC_utf8((U8*)s),
1472 isDIGIT_LC(*s)
1473 );
a0d0e21e 1474 case NDIGIT:
4cadc6a9
YO
1475 REXEC_FBC_CSCAN_PRELOAD(
1476 LOAD_UTF8_CHARCLASS_DIGIT(),
1477 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1478 !isDIGIT(*s)
1479 );
b8c5462f 1480 case NDIGITL:
4cadc6a9
YO
1481 REXEC_FBC_CSCAN_TAINT(
1482 !isDIGIT_LC_utf8((U8*)s),
1483 !isDIGIT_LC(*s)
1484 );
e1d1eefb
YO
1485 case LNBREAK:
1486 REXEC_FBC_CSCAN(
1487 is_LNBREAK_utf8(s),
1488 is_LNBREAK_latin1(s)
1489 );
1490 case VERTWS:
1491 REXEC_FBC_CSCAN(
1492 is_VERTWS_utf8(s),
1493 is_VERTWS_latin1(s)
1494 );
1495 case NVERTWS:
1496 REXEC_FBC_CSCAN(
1497 !is_VERTWS_utf8(s),
1498 !is_VERTWS_latin1(s)
1499 );
1500 case HORIZWS:
1501 REXEC_FBC_CSCAN(
1502 is_HORIZWS_utf8(s),
1503 is_HORIZWS_latin1(s)
1504 );
1505 case NHORIZWS:
1506 REXEC_FBC_CSCAN(
1507 !is_HORIZWS_utf8(s),
1508 !is_HORIZWS_latin1(s)
1509 );
1de06328
YO
1510 case AHOCORASICKC:
1511 case AHOCORASICK:
07be1b83 1512 {
a0a388a1 1513 DECL_TRIE_TYPE(c);
07be1b83
YO
1514 /* what trie are we using right now */
1515 reg_ac_data *aho
f8fc2ecf 1516 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1517 reg_trie_data *trie
1518 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1519 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1520
1521 const char *last_start = strend - trie->minlen;
6148ee25 1522#ifdef DEBUGGING
07be1b83 1523 const char *real_start = s;
6148ee25 1524#endif
07be1b83 1525 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1526 SV *sv_points;
1527 U8 **points; /* map of where we were in the input string
786e8c11 1528 when reading a given char. For ASCII this
be8e71aa 1529 is unnecessary overhead as the relationship
38a44b82
NC
1530 is always 1:1, but for Unicode, especially
1531 case folded Unicode this is not true. */
f9e705e8 1532 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1533 U8 *bitmap=NULL;
1534
07be1b83
YO
1535
1536 GET_RE_DEBUG_FLAGS_DECL;
1537
be8e71aa
YO
1538 /* We can't just allocate points here. We need to wrap it in
1539 * an SV so it gets freed properly if there is a croak while
1540 * running the match */
1541 ENTER;
1542 SAVETMPS;
1543 sv_points=newSV(maxlen * sizeof(U8 *));
1544 SvCUR_set(sv_points,
1545 maxlen * sizeof(U8 *));
1546 SvPOK_on(sv_points);
1547 sv_2mortal(sv_points);
1548 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1549 if ( trie_type != trie_utf8_fold
1550 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1551 {
786e8c11
YO
1552 if (trie->bitmap)
1553 bitmap=(U8*)trie->bitmap;
1554 else
1555 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1556 }
786e8c11
YO
1557 /* this is the Aho-Corasick algorithm modified a touch
1558 to include special handling for long "unknown char"
1559 sequences. The basic idea being that we use AC as long
1560 as we are dealing with a possible matching char, when
1561 we encounter an unknown char (and we have not encountered
1562 an accepting state) we scan forward until we find a legal
1563 starting char.
1564 AC matching is basically that of trie matching, except
1565 that when we encounter a failing transition, we fall back
1566 to the current states "fail state", and try the current char
1567 again, a process we repeat until we reach the root state,
1568 state 1, or a legal transition. If we fail on the root state
1569 then we can either terminate if we have reached an accepting
1570 state previously, or restart the entire process from the beginning
1571 if we have not.
1572
1573 */
07be1b83
YO
1574 while (s <= last_start) {
1575 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1576 U8 *uc = (U8*)s;
1577 U16 charid = 0;
1578 U32 base = 1;
1579 U32 state = 1;
1580 UV uvc = 0;
1581 STRLEN len = 0;
1582 STRLEN foldlen = 0;
1583 U8 *uscan = (U8*)NULL;
1584 U8 *leftmost = NULL;
786e8c11
YO
1585#ifdef DEBUGGING
1586 U32 accepted_word= 0;
1587#endif
07be1b83
YO
1588 U32 pointpos = 0;
1589
1590 while ( state && uc <= (U8*)strend ) {
1591 int failed=0;
786e8c11
YO
1592 U32 word = aho->states[ state ].wordnum;
1593
1de06328
YO
1594 if( state==1 ) {
1595 if ( bitmap ) {
1596 DEBUG_TRIE_EXECUTE_r(
1597 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1598 dump_exec_pos( (char *)uc, c, strend, real_start,
1599 (char *)uc, do_utf8 );
1600 PerlIO_printf( Perl_debug_log,
1601 " Scanning for legal start char...\n");
1602 }
1603 );
1604 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1605 uc++;
786e8c11 1606 }
1de06328 1607 s= (char *)uc;
786e8c11 1608 }
786e8c11
YO
1609 if (uc >(U8*)last_start) break;
1610 }
1611
1612 if ( word ) {
1613 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1614 if (!leftmost || lpos < leftmost) {
1615 DEBUG_r(accepted_word=word);
07be1b83 1616 leftmost= lpos;
786e8c11 1617 }
07be1b83 1618 if (base==0) break;
786e8c11 1619
07be1b83
YO
1620 }
1621 points[pointpos++ % maxlen]= uc;
55eed653
NC
1622 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1623 uscan, len, uvc, charid, foldlen,
1624 foldbuf, uniflags);
786e8c11
YO
1625 DEBUG_TRIE_EXECUTE_r({
1626 dump_exec_pos( (char *)uc, c, strend, real_start,
1627 s, do_utf8 );
07be1b83 1628 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1629 " Charid:%3u CP:%4"UVxf" ",
1630 charid, uvc);
1631 });
07be1b83
YO
1632
1633 do {
6148ee25 1634#ifdef DEBUGGING
786e8c11 1635 word = aho->states[ state ].wordnum;
6148ee25 1636#endif
07be1b83
YO
1637 base = aho->states[ state ].trans.base;
1638
786e8c11
YO
1639 DEBUG_TRIE_EXECUTE_r({
1640 if (failed)
1641 dump_exec_pos( (char *)uc, c, strend, real_start,
1642 s, do_utf8 );
07be1b83 1643 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1644 "%sState: %4"UVxf", word=%"UVxf,
1645 failed ? " Fail transition to " : "",
1646 (UV)state, (UV)word);
1647 });
07be1b83
YO
1648 if ( base ) {
1649 U32 tmp;
1650 if (charid &&
1651 (base + charid > trie->uniquecharcount )
1652 && (base + charid - 1 - trie->uniquecharcount
1653 < trie->lasttrans)
1654 && trie->trans[base + charid - 1 -
1655 trie->uniquecharcount].check == state
1656 && (tmp=trie->trans[base + charid - 1 -
1657 trie->uniquecharcount ].next))
1658 {
786e8c11
YO
1659 DEBUG_TRIE_EXECUTE_r(
1660 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1661 state = tmp;
1662 break;
1663 }
1664 else {
786e8c11
YO
1665 DEBUG_TRIE_EXECUTE_r(
1666 PerlIO_printf( Perl_debug_log," - fail\n"));
1667 failed = 1;
1668 state = aho->fail[state];
07be1b83
YO
1669 }
1670 }
1671 else {
1672 /* we must be accepting here */
786e8c11
YO
1673 DEBUG_TRIE_EXECUTE_r(
1674 PerlIO_printf( Perl_debug_log," - accepting\n"));
1675 failed = 1;
07be1b83
YO
1676 break;
1677 }
1678 } while(state);
786e8c11 1679 uc += len;
07be1b83
YO
1680 if (failed) {
1681 if (leftmost)
1682 break;
786e8c11 1683 if (!state) state = 1;
07be1b83
YO
1684 }
1685 }
1686 if ( aho->states[ state ].wordnum ) {
1687 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1688 if (!leftmost || lpos < leftmost) {
1689 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1690 leftmost = lpos;
786e8c11 1691 }
07be1b83 1692 }
07be1b83
YO
1693 if (leftmost) {
1694 s = (char*)leftmost;
786e8c11
YO
1695 DEBUG_TRIE_EXECUTE_r({
1696 PerlIO_printf(
70685ca0
JH
1697 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1698 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1699 );
1700 });
24b23f37 1701 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1702 FREETMPS;
1703 LEAVE;
07be1b83 1704 goto got_it;
be8e71aa 1705 }
07be1b83 1706 s = HOPc(s,1);
786e8c11
YO
1707 DEBUG_TRIE_EXECUTE_r({
1708 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1709 });
07be1b83 1710 } else {
786e8c11
YO
1711 DEBUG_TRIE_EXECUTE_r(
1712 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1713 break;
1714 }
1715 }
be8e71aa
YO
1716 FREETMPS;
1717 LEAVE;
07be1b83
YO
1718 }
1719 break;
b3c9acc1 1720 default:
3c3eec57
GS
1721 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1722 break;
d6a28714 1723 }
6eb5f6b9
JH
1724 return 0;
1725 got_it:
1726 return s;
1727}
1728
e6aad7ab 1729static void
7918f24d
NC
1730S_swap_match_buff (pTHX_ regexp *prog)
1731{
f0ab9afb 1732 regexp_paren_pair *t;
fae667d5 1733
7918f24d
NC
1734 PERL_ARGS_ASSERT_SWAP_MATCH_BUFF;
1735
28d8d7f4 1736 if (!prog->swap) {
fae667d5
YO
1737 /* We have to be careful. If the previous successful match
1738 was from this regex we don't want a subsequent paritally
1739 successful match to clobber the old results.
1740 So when we detect this possibility we add a swap buffer
1741 to the re, and switch the buffer each match. If we fail
1742 we switch it back, otherwise we leave it swapped.
1743 */
f0ab9afb 1744 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
fae667d5 1745 }
f0ab9afb
NC
1746 t = prog->swap;
1747 prog->swap = prog->offs;
1748 prog->offs = t;
fae667d5
YO
1749}
1750
1751
6eb5f6b9
JH
1752/*
1753 - regexec_flags - match a regexp against a string
1754 */
1755I32
288b8c02 1756Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1757 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1758/* strend: pointer to null at end of string */
1759/* strbeg: real beginning of string */
1760/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1761/* data: May be used for some additional optimizations.
1762 Currently its only used, with a U32 cast, for transmitting
1763 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1764/* nosave: For optimizations. */
1765{
97aff369 1766 dVAR;
288b8c02 1767 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1768 /*register*/ char *s;
6eb5f6b9 1769 register regnode *c;
24b23f37 1770 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1771 I32 minlen; /* must match at least this many chars */
1772 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1773 I32 end_shift = 0; /* Same for the end. */ /* CC */
1774 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1775 char *scream_olds = NULL;
f9f4320a 1776 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1777 I32 multiline;
f8fc2ecf 1778 RXi_GET_DECL(prog,progi);
3b0527fe 1779 regmatch_info reginfo; /* create some info to pass to regtry etc */
fae667d5 1780 bool swap_on_fail = 0;
a3621e74
YO
1781 GET_RE_DEBUG_FLAGS_DECL;
1782
7918f24d 1783 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1784 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1785
1786 /* Be paranoid... */
1787 if (prog == NULL || startpos == NULL) {
1788 Perl_croak(aTHX_ "NULL regexp parameter");
1789 return 0;
1790 }
1791
bbe252da 1792 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1793 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1794
288b8c02 1795 RX_MATCH_UTF8_set(rx, do_utf8);
1de06328 1796 DEBUG_EXECUTE_r(
efd26800 1797 debug_start_match(rx, do_utf8, startpos, strend,
1de06328
YO
1798 "Matching");
1799 );
bac06658 1800
6eb5f6b9 1801 minlen = prog->minlen;
1de06328
YO
1802
1803 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1805 "String too short [regexec_flags]...\n"));
1806 goto phooey;
1aa99e6b 1807 }
6eb5f6b9 1808
1de06328 1809
6eb5f6b9 1810 /* Check validity of program. */
f8fc2ecf 1811 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1812 Perl_croak(aTHX_ "corrupted regexp program");
1813 }
1814
1815 PL_reg_flags = 0;
1816 PL_reg_eval_set = 0;
1817 PL_reg_maxiter = 0;
1818
3c8556c3 1819 if (RX_UTF8(rx))
6eb5f6b9
JH
1820 PL_reg_flags |= RF_utf8;
1821
1822 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1823 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1824 PL_bostr = strbeg;
3b0527fe 1825 reginfo.sv = sv;
6eb5f6b9
JH
1826
1827 /* Mark end of line for $ (and such) */
1828 PL_regeol = strend;
1829
1830 /* see how far we have to get to not match where we matched before */
3b0527fe 1831 reginfo.till = startpos+minend;
6eb5f6b9 1832
6eb5f6b9
JH
1833 /* If there is a "must appear" string, look for it. */
1834 s = startpos;
1835
bbe252da 1836 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1837 MAGIC *mg;
1838
1839 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
58e23c8d 1840 reginfo.ganch = startpos + prog->gofs;
6eb5f6b9
JH
1841 else if (sv && SvTYPE(sv) >= SVt_PVMG
1842 && SvMAGIC(sv)
14befaf4
DM
1843 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1844 && mg->mg_len >= 0) {
3b0527fe 1845 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
bbe252da 1846 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1847 if (s > reginfo.ganch)
6eb5f6b9 1848 goto phooey;
58e23c8d 1849 s = reginfo.ganch - prog->gofs;
6eb5f6b9
JH
1850 }
1851 }
58e23c8d 1852 else if (data) {
70685ca0 1853 reginfo.ganch = strbeg + PTR2UV(data);
58e23c8d 1854 } else /* pos() not defined */
3b0527fe 1855 reginfo.ganch = strbeg;
6eb5f6b9 1856 }
288b8c02 1857 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
fae667d5
YO
1858 swap_on_fail = 1;
1859 swap_match_buff(prog); /* do we need a save destructor here for
1860 eval dies? */
c74340f9 1861 }
a0714e2c 1862 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1863 re_scream_pos_data d;
1864
1865 d.scream_olds = &scream_olds;
1866 d.scream_pos = &scream_pos;
288b8c02 1867 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 1868 if (!s) {
a3621e74 1869 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1870 goto phooey; /* not present */
3fa9c3d7 1871 }
6eb5f6b9
JH
1872 }
1873
1de06328 1874
6eb5f6b9
JH
1875
1876 /* Simplest case: anchored match need be tried only once. */
1877 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1878 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1879 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1880 goto got_it;
bbe252da
YO
1881 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1882 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1883 {
1884 char *end;
1885
1886 if (minlen)
1887 dontbother = minlen - 1;
1aa99e6b 1888 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1889 /* for multiline we only have to try after newlines */
33b8afdf 1890 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1891 if (s == startpos)
1892 goto after_try;
1893 while (1) {
24b23f37 1894 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1895 goto got_it;
1896 after_try:
5339e136 1897 if (s > end)
6eb5f6b9 1898 goto phooey;
bbe252da 1899 if (prog->extflags & RXf_USE_INTUIT) {
288b8c02 1900 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
6eb5f6b9
JH
1901 if (!s)
1902 goto phooey;
1903 }
1904 else
1905 s++;
1906 }
1907 } else {
1908 if (s > startpos)
1909 s--;
1910 while (s < end) {
1911 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 1912 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1913 goto got_it;
1914 }
1915 }
1916 }
1917 }
1918 goto phooey;
bbe252da 1919 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
1920 {
1921 /* the warning about reginfo.ganch being used without intialization
bbe252da 1922 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 1923 and we only enter this block when the same bit is set. */
58e23c8d
YO
1924 char *tmp_s = reginfo.ganch - prog->gofs;
1925 if (regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
1926 goto got_it;
1927 goto phooey;
1928 }
1929
1930 /* Messy cases: unanchored match. */
bbe252da 1931 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
1932 /* we have /x+whatever/ */
1933 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1934 char ch;
bf93d4cc
GS
1935#ifdef DEBUGGING
1936 int did_match = 0;
1937#endif
33b8afdf
JH
1938 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1939 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1940 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1941
1aa99e6b 1942 if (do_utf8) {
4cadc6a9 1943 REXEC_FBC_SCAN(
6eb5f6b9 1944 if (*s == ch) {
a3621e74 1945 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1946 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1947 s += UTF8SKIP(s);
1948 while (s < strend && *s == ch)
1949 s += UTF8SKIP(s);
1950 }
4cadc6a9 1951 );
6eb5f6b9
JH
1952 }
1953 else {
4cadc6a9 1954 REXEC_FBC_SCAN(
6eb5f6b9 1955 if (*s == ch) {
a3621e74 1956 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1957 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1958 s++;
1959 while (s < strend && *s == ch)
1960 s++;
1961 }
4cadc6a9 1962 );
6eb5f6b9 1963 }
a3621e74 1964 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1965 PerlIO_printf(Perl_debug_log,
b7953727
JH
1966 "Did not find anchored character...\n")
1967 );
6eb5f6b9 1968 }
a0714e2c
SS
1969 else if (prog->anchored_substr != NULL
1970 || prog->anchored_utf8 != NULL
1971 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1972 && prog->float_max_offset < strend - s)) {
1973 SV *must;
1974 I32 back_max;
1975 I32 back_min;
1976 char *last;
6eb5f6b9 1977 char *last1; /* Last position checked before */
bf93d4cc
GS
1978#ifdef DEBUGGING
1979 int did_match = 0;
1980#endif
33b8afdf
JH
1981 if (prog->anchored_substr || prog->anchored_utf8) {
1982 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1983 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1984 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1985 back_max = back_min = prog->anchored_offset;
1986 } else {
1987 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1988 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1989 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1990 back_max = prog->float_max_offset;
1991 back_min = prog->float_min_offset;
1992 }
1de06328
YO
1993
1994
33b8afdf
JH
1995 if (must == &PL_sv_undef)
1996 /* could not downgrade utf8 check substring, so must fail */
1997 goto phooey;
1998
1de06328
YO
1999 if (back_min<0) {
2000 last = strend;
2001 } else {
2002 last = HOP3c(strend, /* Cannot start after this */
2003 -(I32)(CHR_SVLEN(must)
2004 - (SvTAIL(must) != 0) + back_min), strbeg);
2005 }
6eb5f6b9
JH
2006 if (s > PL_bostr)
2007 last1 = HOPc(s, -1);
2008 else
2009 last1 = s - 1; /* bogus */
2010
a0288114 2011 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2012 check_substr==must. */
2013 scream_pos = -1;
2014 dontbother = end_shift;
2015 strend = HOPc(strend, -dontbother);
2016 while ( (s <= last) &&
9041c2e3 2017 ((flags & REXEC_SCREAM)
1de06328 2018 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2019 end_shift, &scream_pos, 0))
1de06328 2020 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2021 (unsigned char*)strend, must,
7fba1cd6 2022 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2023 /* we may be pointing at the wrong string */
07bc277f 2024 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2025 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2026 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2027 if (HOPc(s, -back_max) > last1) {
2028 last1 = HOPc(s, -back_min);
2029 s = HOPc(s, -back_max);
2030 }
2031 else {
52657f30 2032 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2033
2034 last1 = HOPc(s, -back_min);
52657f30 2035 s = t;
6eb5f6b9 2036 }
1aa99e6b 2037 if (do_utf8) {
6eb5f6b9 2038 while (s <= last1) {
24b23f37 2039 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2040 goto got_it;
2041 s += UTF8SKIP(s);
2042 }
2043 }
2044 else {
2045 while (s <= last1) {
24b23f37 2046 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2047 goto got_it;
2048 s++;
2049 }
2050 }
2051 }
ab3bbdeb
YO
2052 DEBUG_EXECUTE_r(if (!did_match) {
2053 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2054 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2055 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2056 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2057 ? "anchored" : "floating"),
ab3bbdeb
YO
2058 quoted, RE_SV_TAIL(must));
2059 });
6eb5f6b9
JH
2060 goto phooey;
2061 }
f8fc2ecf 2062 else if ( (c = progi->regstclass) ) {
f14c76ed 2063 if (minlen) {
f8fc2ecf 2064 const OPCODE op = OP(progi->regstclass);
66e933ab 2065 /* don't bother with what can't match */
786e8c11 2066 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2067 strend = HOPc(strend, -(minlen - 1));
2068 }
a3621e74 2069 DEBUG_EXECUTE_r({
be8e71aa 2070 SV * const prop = sv_newmortal();
32fc9b6a 2071 regprop(prog, prop, c);
0df25f3d 2072 {
02daf0ab 2073 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2074 s,strend-s,60);
0df25f3d 2075 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2076 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2077 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2078 quoted, (int)(strend - s));
0df25f3d 2079 }
ffc61ed2 2080 });
3b0527fe 2081 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2082 goto got_it;
07be1b83 2083 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2084 }
2085 else {
2086 dontbother = 0;
a0714e2c 2087 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2088 /* Trim the end. */
d6a28714 2089 char *last;
33b8afdf
JH
2090 SV* float_real;
2091
2092 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2093 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2094 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2095
2096 if (flags & REXEC_SCREAM) {
33b8afdf 2097 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2098 end_shift, &scream_pos, 1); /* last one */
2099 if (!last)
ffc61ed2 2100 last = scream_olds; /* Only one occurrence. */
4addbd3b 2101 /* we may be pointing at the wrong string */
07bc277f 2102 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2103 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2104 }
d6a28714
JH
2105 else {
2106 STRLEN len;
cfd0369c 2107 const char * const little = SvPV_const(float_real, len);
d6a28714 2108
33b8afdf 2109 if (SvTAIL(float_real)) {
d6a28714
JH
2110 if (memEQ(strend - len + 1, little, len - 1))
2111 last = strend - len + 1;
7fba1cd6 2112 else if (!multiline)
9041c2e3 2113 last = memEQ(strend - len, little, len)
bd61b366 2114 ? strend - len : NULL;
b8c5462f 2115 else
d6a28714
JH
2116 goto find_last;
2117 } else {
2118 find_last:
9041c2e3 2119 if (len)
d6a28714 2120 last = rninstr(s, strend, little, little + len);
b8c5462f 2121 else
a0288114 2122 last = strend; /* matching "$" */
b8c5462f 2123 }
b8c5462f 2124 }
bf93d4cc 2125 if (last == NULL) {
6bda09f9
YO
2126 DEBUG_EXECUTE_r(
2127 PerlIO_printf(Perl_debug_log,
2128 "%sCan't trim the tail, match fails (should not happen)%s\n",
2129 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2130 goto phooey; /* Should not happen! */
2131 }
d6a28714
JH
2132 dontbother = strend - last + prog->float_min_offset;
2133 }
2134 if (minlen && (dontbother < minlen))
2135 dontbother = minlen - 1;
2136 strend -= dontbother; /* this one's always in bytes! */
2137 /* We don't know much -- general case. */
1aa99e6b 2138 if (do_utf8) {
d6a28714 2139 for (;;) {
24b23f37 2140 if (regtry(&reginfo, &s))
d6a28714
JH
2141 goto got_it;
2142 if (s >= strend)
2143 break;
b8c5462f 2144 s += UTF8SKIP(s);
d6a28714
JH
2145 };
2146 }
2147 else {
2148 do {
24b23f37 2149 if (regtry(&reginfo, &s))
d6a28714
JH
2150 goto got_it;
2151 } while (s++ < strend);
2152 }
2153 }
2154
2155 /* Failure. */
2156 goto phooey;
2157
2158got_it:
288b8c02 2159 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2160
19b95bf0 2161 if (PL_reg_eval_set)
4f639d21 2162 restore_pos(aTHX_ prog);
5daac39c
NC
2163 if (RXp_PAREN_NAMES(prog))
2164 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2165
2166 /* make sure $`, $&, $', and $digit will work later */
2167 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2168 RX_MATCH_COPY_FREE(rx);
d6a28714 2169 if (flags & REXEC_COPY_STR) {
be8e71aa 2170 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2171#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2172 if ((SvIsCOW(sv)
2173 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2174 if (DEBUG_C_TEST) {
2175 PerlIO_printf(Perl_debug_log,
2176 "Copy on write: regexp capture, type %d\n",
2177 (int) SvTYPE(sv));
2178 }
2179 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2180 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2181 assert (SvPOKp(prog->saved_copy));
2182 } else
2183#endif
2184 {
288b8c02 2185 RX_MATCH_COPIED_on(rx);
ed252734
NC
2186 s = savepvn(strbeg, i);
2187 prog->subbeg = s;
2188 }
d6a28714 2189 prog->sublen = i;
d6a28714
JH
2190 }
2191 else {
2192 prog->subbeg = strbeg;
2193 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2194 }
2195 }
9041c2e3 2196
d6a28714
JH
2197 return 1;
2198
2199phooey:
a3621e74 2200 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2201 PL_colors[4], PL_colors[5]));
d6a28714 2202 if (PL_reg_eval_set)
4f639d21 2203 restore_pos(aTHX_ prog);
fae667d5 2204 if (swap_on_fail)
c74340f9 2205 /* we failed :-( roll it back */
fae667d5
YO
2206 swap_match_buff(prog);
2207
d6a28714
JH
2208 return 0;
2209}
2210
6bda09f9 2211
d6a28714
JH
2212/*
2213 - regtry - try match at specific point
2214 */
2215STATIC I32 /* 0 failure, 1 success */
24b23f37 2216S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2217{
97aff369 2218 dVAR;
d6a28714 2219 CHECKPOINT lastcp;
288b8c02
NC
2220 REGEXP *const rx = reginfo->prog;
2221 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2222 RXi_GET_DECL(prog,progi);
a3621e74 2223 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2224
2225 PERL_ARGS_ASSERT_REGTRY;
2226
24b23f37 2227 reginfo->cutpoint=NULL;
d6a28714 2228
bbe252da 2229 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2230 MAGIC *mg;
2231
2232 PL_reg_eval_set = RS_init;
a3621e74 2233 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2234 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2235 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2236 ));
ea8d6ae1 2237 SAVESTACK_CXPOS();
d6a28714
JH
2238 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2239 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2240 SAVETMPS;
2241 /* Apparently this is not needed, judging by wantarray. */
e8347627 2242 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2243 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2244
3b0527fe 2245 if (reginfo->sv) {
d6a28714 2246 /* Make $_ available to executed code. */
3b0527fe 2247 if (reginfo->sv != DEFSV) {
59f00321 2248 SAVE_DEFSV;
3b0527fe 2249 DEFSV = reginfo->sv;
b8c5462f 2250 }
d6a28714 2251
3b0527fe
DM
2252 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2253 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2254 /* prepare for quick setting of pos */
d300d9fa 2255#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2256 if (SvIsCOW(reginfo->sv))
2257 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2258#endif
3dab1dad 2259 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2260 &PL_vtbl_mglob, NULL, 0);
d6a28714 2261 mg->mg_len = -1;
b8c5462f 2262 }
d6a28714
JH
2263 PL_reg_magic = mg;
2264 PL_reg_oldpos = mg->mg_len;
4f639d21 2265 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2266 }
09687e5a 2267 if (!PL_reg_curpm) {
a02a5408 2268 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2269#ifdef USE_ITHREADS
2270 {
14a49a24 2271 SV* const repointer = &PL_sv_undef;
92313705
NC
2272 /* this regexp is also owned by the new PL_reg_curpm, which
2273 will try to free it. */
d2ece331 2274 av_push(PL_regex_padav, repointer);
09687e5a
AB
2275 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2276 PL_regex_pad = AvARRAY(PL_regex_padav);
2277 }
2278#endif
2279 }
86c29d75
NC
2280#ifdef USE_ITHREADS
2281 /* It seems that non-ithreads works both with and without this code.
2282 So for efficiency reasons it seems best not to have the code
2283 compiled when it is not needed. */
92313705
NC
2284 /* This is safe against NULLs: */
2285 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2286 /* PM_reg_curpm owns a reference to this regexp. */
2287 ReREFCNT_inc(rx);
86c29d75 2288#endif
288b8c02 2289 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2290 PL_reg_oldcurpm = PL_curpm;
2291 PL_curpm = PL_reg_curpm;
07bc277f 2292 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2293 /* Here is a serious problem: we cannot rewrite subbeg,
2294 since it may be needed if this match fails. Thus
2295 $` inside (?{}) could fail... */
2296 PL_reg_oldsaved = prog->subbeg;
2297 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2298#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2299 PL_nrs = prog->saved_copy;
2300#endif
07bc277f 2301 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2302 }
2303 else
bd61b366 2304 PL_reg_oldsaved = NULL;
d6a28714
JH
2305 prog->subbeg = PL_bostr;
2306 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2307 }
24b23f37 2308 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2309 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2310 PL_reginput = *startpos;
d6a28714 2311 PL_reglastparen = &prog->lastparen;
a01268b5 2312 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2313 prog->lastparen = 0;
03994de8 2314 prog->lastcloseparen = 0;
d6a28714 2315 PL_regsize = 0;
f0ab9afb 2316 PL_regoffs = prog->offs;
d6a28714
JH
2317 if (PL_reg_start_tmpl <= prog->nparens) {
2318 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2319 if(PL_reg_start_tmp)
2320 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2321 else
a02a5408 2322 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2323 }
2324
2325 /* XXXX What this code is doing here?!!! There should be no need
2326 to do this again and again, PL_reglastparen should take care of
3dd2943c 2327 this! --ilya*/
dafc8851
JH
2328
2329 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2330 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2331 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2332 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2333 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2334 * Meanwhile, this code *is* needed for the
daf18116
JH
2335 * above-mentioned test suite tests to succeed. The common theme
2336 * on those tests seems to be returning null fields from matches.
225593e1 2337 * --jhi updated by dapm */
dafc8851 2338#if 1
d6a28714 2339 if (prog->nparens) {
f0ab9afb 2340 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2341 register I32 i;
eb160463 2342 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2343 ++pp;
2344 pp->start = -1;
2345 pp->end = -1;
d6a28714
JH
2346 }
2347 }
dafc8851 2348#endif
02db2b7b 2349 REGCP_SET(lastcp);
f8fc2ecf 2350 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2351 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2352 return 1;
2353 }
24b23f37
YO
2354 if (reginfo->cutpoint)
2355 *startpos= reginfo->cutpoint;
02db2b7b 2356 REGCP_UNWIND(lastcp);
d6a28714
JH
2357 return 0;
2358}
2359
02db2b7b 2360
8ba1375e
MJD
2361#define sayYES goto yes
2362#define sayNO goto no
262b90c4 2363#define sayNO_SILENT goto no_silent
8ba1375e 2364
f9f4320a
YO
2365/* we dont use STMT_START/END here because it leads to
2366 "unreachable code" warnings, which are bogus, but distracting. */
2367#define CACHEsayNO \
c476f425
DM
2368 if (ST.cache_mask) \
2369 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2370 sayNO
3298f257 2371
a3621e74 2372/* this is used to determine how far from the left messages like
265c4333
YO
2373 'failed...' are printed. It should be set such that messages
2374 are inline with the regop output that created them.
a3621e74 2375*/
265c4333 2376#define REPORT_CODE_OFF 32
a3621e74
YO
2377
2378
2379/* Make sure there is a test for this +1 options in re_tests */
2380#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2381
40a82448
DM
2382#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2383#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2384
86545054
DM
2385#define SLAB_FIRST(s) (&(s)->states[0])
2386#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2387
5d9a96ca
DM
2388/* grab a new slab and return the first slot in it */
2389
2390STATIC regmatch_state *
2391S_push_slab(pTHX)
2392{
a35a87e7 2393#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2394 dMY_CXT;
2395#endif
5d9a96ca
DM
2396 regmatch_slab *s = PL_regmatch_slab->next;
2397 if (!s) {
2398 Newx(s, 1, regmatch_slab);
2399 s->prev = PL_regmatch_slab;
2400 s->next = NULL;
2401 PL_regmatch_slab->next = s;
2402 }
2403 PL_regmatch_slab = s;
86545054 2404 return SLAB_FIRST(s);
5d9a96ca 2405}
5b47454d 2406
95b24440 2407
40a82448
DM
2408/* push a new state then goto it */
2409
2410#define PUSH_STATE_GOTO(state, node) \
2411 scan = node; \
2412 st->resume_state = state; \
2413 goto push_state;
2414
2415/* push a new state with success backtracking, then goto it */
2416
2417#define PUSH_YES_STATE_GOTO(state, node) \
2418 scan = node; \
2419 st->resume_state = state; \
2420 goto push_yes_state;
2421
aa283a38 2422
aa283a38 2423
d6a28714 2424/*
95b24440 2425
bf1f174e
DM
2426regmatch() - main matching routine
2427
2428This is basically one big switch statement in a loop. We execute an op,
2429set 'next' to point the next op, and continue. If we come to a point which
2430we may need to backtrack to on failure such as (A|B|C), we push a
2431backtrack state onto the backtrack stack. On failure, we pop the top
2432state, and re-enter the loop at the state indicated. If there are no more
2433states to pop, we return failure.
2434
2435Sometimes we also need to backtrack on success; for example /A+/, where
2436after successfully matching one A, we need to go back and try to
2437match another one; similarly for lookahead assertions: if the assertion
2438completes successfully, we backtrack to the state just before the assertion
2439and then carry on. In these cases, the pushed state is marked as
2440'backtrack on success too'. This marking is in fact done by a chain of
2441pointers, each pointing to the previous 'yes' state. On success, we pop to
2442the nearest yes state, discarding any intermediate failure-only states.
2443Sometimes a yes state is pushed just to force some cleanup code to be
2444called at the end of a successful match or submatch; e.g. (??{$re}) uses
2445it to free the inner regex.
2446
2447Note that failure backtracking rewinds the cursor position, while
2448success backtracking leaves it alone.
2449
2450A pattern is complete when the END op is executed, while a subpattern
2451such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2452ops trigger the "pop to last yes state if any, otherwise return true"
2453behaviour.
2454
2455A common convention in this function is to use A and B to refer to the two
2456subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2457the subpattern to be matched possibly multiple times, while B is the entire
2458rest of the pattern. Variable and state names reflect this convention.
2459
2460The states in the main switch are the union of ops and failure/success of
2461substates associated with with that op. For example, IFMATCH is the op
2462that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2463'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2464successfully matched A and IFMATCH_A_fail is a state saying that we have
2465just failed to match A. Resume states always come in pairs. The backtrack
2466state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2467at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2468on success or failure.
2469
2470The struct that holds a backtracking state is actually a big union, with
2471one variant for each major type of op. The variable st points to the
2472top-most backtrack struct. To make the code clearer, within each
2473block of code we #define ST to alias the relevant union.
2474
2475Here's a concrete example of a (vastly oversimplified) IFMATCH
2476implementation:
2477
2478 switch (state) {
2479 ....
2480
2481#define ST st->u.ifmatch
2482
2483 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2484 ST.foo = ...; // some state we wish to save
95b24440 2485 ...
bf1f174e
DM
2486 // push a yes backtrack state with a resume value of
2487 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2488 // first node of A:
2489 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2490 // NOTREACHED
2491
2492 case IFMATCH_A: // we have successfully executed A; now continue with B
2493 next = B;
2494 bar = ST.foo; // do something with the preserved value
2495 break;
2496
2497 case IFMATCH_A_fail: // A failed, so the assertion failed
2498 ...; // do some housekeeping, then ...
2499 sayNO; // propagate the failure
2500
2501#undef ST
95b24440 2502
bf1f174e
DM
2503 ...
2504 }
95b24440 2505
bf1f174e
DM
2506For any old-timers reading this who are familiar with the old recursive
2507approach, the code above is equivalent to:
95b24440 2508
bf1f174e
DM
2509 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2510 {
2511 int foo = ...
95b24440 2512 ...
bf1f174e
DM
2513 if (regmatch(A)) {
2514 next = B;
2515 bar = foo;
2516 break;
95b24440 2517 }
bf1f174e
DM
2518 ...; // do some housekeeping, then ...
2519 sayNO; // propagate the failure
95b24440 2520 }
bf1f174e
DM
2521
2522The topmost backtrack state, pointed to by st, is usually free. If you
2523want to claim it, populate any ST.foo fields in it with values you wish to
2524save, then do one of
2525
2526 PUSH_STATE_GOTO(resume_state, node);
2527 PUSH_YES_STATE_GOTO(resume_state, node);
2528
2529which sets that backtrack state's resume value to 'resume_state', pushes a
2530new free entry to the top of the backtrack stack, then goes to 'node'.
2531On backtracking, the free slot is popped, and the saved state becomes the
2532new free state. An ST.foo field in this new top state can be temporarily
2533accessed to retrieve values, but once the main loop is re-entered, it
2534becomes available for reuse.
2535
2536Note that the depth of the backtrack stack constantly increases during the
2537left-to-right execution of the pattern, rather than going up and down with
2538the pattern nesting. For example the stack is at its maximum at Z at the
2539end of the pattern, rather than at X in the following:
2540
2541 /(((X)+)+)+....(Y)+....Z/
2542
2543The only exceptions to this are lookahead/behind assertions and the cut,
2544(?>A), which pop all the backtrack states associated with A before
2545continuing.
2546
2547Bascktrack state structs are allocated in slabs of about 4K in size.
2548PL_regmatch_state and st always point to the currently active state,
2549and PL_regmatch_slab points to the slab currently containing
2550PL_regmatch_state. The first time regmatch() is called, the first slab is
2551allocated, and is never freed until interpreter destruction. When the slab
2552is full, a new one is allocated and chained to the end. At exit from
2553regmatch(), slabs allocated since entry are freed.
2554
2555*/
95b24440 2556
40a82448 2557
5bc10b2c 2558#define DEBUG_STATE_pp(pp) \
265c4333 2559 DEBUG_STATE_r({ \
5bc10b2c
DM
2560 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2561 PerlIO_printf(Perl_debug_log, \
5d458dd8 2562 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2563 depth*2, "", \
13d6edb4 2564 PL_reg_name[st->resume_state], \
5d458dd8
YO
2565 ((st==yes_state||st==mark_state) ? "[" : ""), \
2566 ((st==yes_state) ? "Y" : ""), \
2567 ((st==mark_state) ? "M" : ""), \
2568 ((st==yes_state||st==mark_state) ? "]" : "") \
2569 ); \
265c4333 2570 });
5bc10b2c 2571
40a82448 2572
3dab1dad 2573#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2574
3df15adc 2575#ifdef DEBUGGING
5bc10b2c 2576
ab3bbdeb 2577STATIC void
efd26800 2578S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
ab3bbdeb
YO
2579 const char *start, const char *end, const char *blurb)
2580{
efd26800 2581 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2582
2583 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2584
ab3bbdeb
YO
2585 if (!PL_colorset)
2586 reginitcolors();
2587 {
2588 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
efd26800 2589 RX_PRECOMP(prog), RX_PRELEN(prog), 60);
ab3bbdeb
YO
2590
2591 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2592 start, end - start, 60);
2593
2594 PerlIO_printf(Perl_debug_log,
2595 "%s%s REx%s %s against %s\n",
2596 PL_colors[4], blurb, PL_colors[5], s0, s1);
2597
2598 if (do_utf8||utf8_pat)
1de06328
YO
2599 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2600 utf8_pat ? "pattern" : "",
2601 utf8_pat && do_utf8 ? " and " : "",
2602 do_utf8 ? "string" : ""
ab3bbdeb
YO
2603 );
2604 }
2605}
3df15adc
YO
2606
2607STATIC void
786e8c11
YO
2608S_dump_exec_pos(pTHX_ const char *locinput,
2609 const regnode *scan,
2610 const char *loc_regeol,
2611 const char *loc_bostr,
2612 const char *loc_reg_starttry,
2613 const bool do_utf8)
07be1b83 2614{
786e8c11 2615 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2616 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2617 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2618 /* The part of the string before starttry has one color
2619 (pref0_len chars), between starttry and current
2620 position another one (pref_len - pref0_len chars),
2621 after the current position the third one.
2622 We assume that pref0_len <= pref_len, otherwise we
2623 decrease pref0_len. */
786e8c11
YO
2624 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2625 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2626 int pref0_len;
2627
7918f24d
NC
2628 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2629
07be1b83
YO
2630 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2631 pref_len++;
786e8c11
YO
2632 pref0_len = pref_len - (locinput - loc_reg_starttry);
2633 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2634 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2635 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2636 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2637 l--;
2638 if (pref0_len < 0)
2639 pref0_len = 0;
2640 if (pref0_len > pref_len)
2641 pref0_len = pref_len;
2642 {
3df15adc 2643 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2644
ab3bbdeb 2645 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2646 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2647
ab3bbdeb 2648 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2649 (locinput - pref_len + pref0_len),
1de06328 2650 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2651
ab3bbdeb 2652 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2653 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2654
1de06328 2655 const STRLEN tlen=len0+len1+len2;
3df15adc 2656 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2657 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2658 (IV)(locinput - loc_bostr),
07be1b83 2659 len0, s0,
07be1b83 2660 len1, s1,
07be1b83 2661 (docolor ? "" : "> <"),
07be1b83 2662 len2, s2,
f9f4320a 2663 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2664 "");
2665 }
2666}
3df15adc 2667
07be1b83
YO
2668#endif
2669
0a4db386
YO
2670/* reg_check_named_buff_matched()
2671 * Checks to see if a named buffer has matched. The data array of
2672 * buffer numbers corresponding to the buffer is expected to reside
2673 * in the regexp->data->data array in the slot stored in the ARG() of
2674 * node involved. Note that this routine doesn't actually care about the
2675 * name, that information is not preserved from compilation to execution.
2676 * Returns the index of the leftmost defined buffer with the given name
2677 * or 0 if non of the buffers matched.
2678 */
2679STATIC I32
7918f24d
NC
2680S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2681{
0a4db386 2682 I32 n;
f8fc2ecf 2683 RXi_GET_DECL(rex,rexi);
ad64d0ec 2684 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2685 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2686
2687 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2688
0a4db386
YO
2689 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2690 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2691 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2692 {
2693 return nums[n];
2694 }
2695 }
2696 return 0;
2697}
2698
2f554ef7
DM
2699
2700/* free all slabs above current one - called during LEAVE_SCOPE */
2701
2702STATIC void
2703S_clear_backtrack_stack(pTHX_ void *p)
2704{
2705 regmatch_slab *s = PL_regmatch_slab->next;
2706 PERL_UNUSED_ARG(p);
2707
2708 if (!s)
2709 return;
2710 PL_regmatch_slab->next = NULL;
2711 while (s) {
2712 regmatch_slab * const osl = s;
2713 s = s->next;
2714 Safefree(osl);
2715 }
2716}
2717
2718
28d8d7f4
YO
2719#define SETREX(Re1,Re2) \
2720 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2721 Re1 = (Re2)
2722
d6a28714 2723STATIC I32 /* 0 failure, 1 success */
24b23f37 2724S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2725{
a35a87e7 2726#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2727 dMY_CXT;
2728#endif
27da23d5 2729 dVAR;
95b24440 2730 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2731 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2732 REGEXP *rex_sv = reginfo->prog;
2733 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2734 RXi_GET_DECL(rex,rexi);
2f554ef7 2735 I32 oldsave;
5d9a96ca
DM
2736 /* the current state. This is a cached copy of PL_regmatch_state */
2737 register regmatch_state *st;
5d9a96ca
DM
2738 /* cache heavy used fields of st in registers */
2739 register regnode *scan;
2740 register regnode *next;
438e9bae 2741 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2742 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2743 register char *locinput = PL_reginput;
5d9a96ca 2744 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2745
b69b0499 2746 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2747 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2748 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2749 const U32 max_nochange_depth =
2750 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2751 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
2752 regmatch_state *yes_state = NULL; /* state to pop to on success of
2753 subpattern */
e2e6a0f1
YO
2754 /* mark_state piggy backs on the yes_state logic so that when we unwind
2755 the stack on success we can update the mark_state as we go */
2756 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 2757 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2758 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2759 U32 state_num;
5d458dd8
YO
2760 bool no_final = 0; /* prevent failure from backtracking? */
2761 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2762 char *startpoint = PL_reginput;
5d458dd8
YO
2763 SV *popmark = NULL; /* are we looking for a mark? */
2764 SV *sv_commit = NULL; /* last mark name seen in failure */
2765 SV *sv_yes_mark = NULL; /* last mark name we have seen
2766 during a successfull match */
2767 U32 lastopen = 0; /* last open we saw */
2768 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 2769 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
2770 /* these three flags are set by various ops to signal information to
2771 * the very next op. They have a useful lifetime of exactly one loop
2772 * iteration, and are not preserved or restored by state pushes/pops
2773 */
2774 bool sw = 0; /* the condition value in (?(cond)a|b) */
2775 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2776 int logical = 0; /* the following EVAL is:
2777 0: (?{...})
2778 1: (?(?{...})X|Y)
2779 2: (??{...})
2780 or the following IFMATCH/UNLESSM is:
2781 false: plain (?=foo)
2782 true: used as a condition: (?(?=foo))
2783 */
95b24440 2784#ifdef DEBUGGING
e68ec53f 2785 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2786#endif
2787
7918f24d
NC
2788 PERL_ARGS_ASSERT_REGMATCH;
2789
3b57cd43 2790 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2791 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2792 }));
5d9a96ca
DM
2793 /* on first ever call to regmatch, allocate first slab */
2794 if (!PL_regmatch_slab) {
2795 Newx(PL_regmatch_slab, 1, regmatch_slab);
2796 PL_regmatch_slab->prev = NULL;
2797 PL_regmatch_slab->next = NULL;
86545054 2798 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2799 }
2800
2f554ef7
DM
2801 oldsave = PL_savestack_ix;
2802 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2803 SAVEVPTR(PL_regmatch_slab);
2804 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
2805
2806 /* grab next free state slot */
2807 st = ++PL_regmatch_state;
86545054 2808 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2809 st = PL_regmatch_state = S_push_slab(aTHX);
2810
d6a28714
JH
2811 /* Note that nextchr is a byte even in UTF */
2812 nextchr = UCHARAT(locinput);
2813 scan = prog;
2814 while (scan != NULL) {
8ba1375e 2815
a3621e74 2816 DEBUG_EXECUTE_r( {
6136c704 2817 SV * const prop = sv_newmortal();
1de06328 2818 regnode *rnext=regnext(scan);
786e8c11 2819 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2820 regprop(rex, prop, scan);
07be1b83
YO
2821
2822 PerlIO_printf(Perl_debug_log,
2823 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2824 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2825 SvPVX_const(prop),
1de06328 2826 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2827 0 : (IV)(rnext - rexi->program));
2a782b5b 2828 });
d6a28714
JH
2829
2830 next = scan + NEXT_OFF(scan);
2831 if (next == scan)
2832 next = NULL;
40a82448 2833 state_num = OP(scan);
d6a28714 2834
40a82448
DM
2835 reenter_switch:
2836 switch (state_num) {
d6a28714 2837 case BOL:
7fba1cd6 2838 if (locinput == PL_bostr)
d6a28714 2839 {
3b0527fe 2840 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2841 break;
2842 }
d6a28714
JH
2843 sayNO;
2844 case MBOL:
12d33761
HS
2845 if (locinput == PL_bostr ||
2846 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2847 {
b8c5462f
JH
2848 break;
2849 }
d6a28714
JH
2850 sayNO;
2851 case SBOL:
c2a73568 2852 if (locinput == PL_bostr)
b8c5462f 2853 break;
d6a28714
JH
2854 sayNO;
2855 case GPOS:
3b0527fe 2856 if (locinput == reginfo->ganch)
d6a28714
JH
2857 break;
2858 sayNO;
ee9b8eae
YO
2859
2860 case KEEPS:
2861 /* update the startpoint */
f0ab9afb 2862 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 2863 PL_reginput = locinput;
f0ab9afb 2864 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
2865 PUSH_STATE_GOTO(KEEPS_next, next);
2866 /*NOT-REACHED*/
2867 case KEEPS_next_fail:
2868 /* rollback the start point change */
f0ab9afb 2869 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
2870 sayNO_SILENT;
2871 /*NOT-REACHED*/
d6a28714 2872 case EOL:
d6a28714
JH
2873 goto seol;
2874 case MEOL:
d6a28714 2875 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2876 sayNO;
b8c5462f 2877 break;
d6a28714
JH
2878 case SEOL:
2879 seol:
2880 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2881 sayNO;
d6a28714 2882 if (PL_regeol - locinput > 1)
b8c5462f 2883 sayNO;
b8c5462f 2884 break;
d6a28714
JH
2885 case EOS:
2886 if (PL_regeol != locinput)
b8c5462f 2887 sayNO;
d6a28714 2888 break;
ffc61ed2 2889 case SANY:
d6a28714 2890 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2891 sayNO;
f33976b4
DB
2892 if (do_utf8) {
2893 locinput += PL_utf8skip[nextchr];
2894 if (locinput > PL_regeol)
2895 sayNO;
2896 nextchr = UCHARAT(locinput);
2897 }
2898 else
2899 nextchr = UCHARAT(++locinput);
2900 break;
2901 case CANY:
2902 if (!nextchr && locinput >= PL_regeol)
2903 sayNO;
b8c5462f 2904 nextchr = UCHARAT(++locinput);
a0d0e21e 2905 break;
ffc61ed2 2906 case REG_ANY:
1aa99e6b
IH
2907 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2908 sayNO;
2909 if (do_utf8) {
b8c5462f 2910 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2911 if (locinput > PL_regeol)
2912 sayNO;
a0ed51b3 2913 nextchr = UCHARAT(locinput);
a0ed51b3 2914 }
1aa99e6b
IH
2915 else
2916 nextchr = UCHARAT(++locinput);
a0ed51b3 2917 break;
166ba7cd
DM
2918
2919#undef ST
2920#define ST st->u.trie
786e8c11
YO
2921 case TRIEC:
2922 /* In this case the charclass data is available inline so
2923 we can fail fast without a lot of extra overhead.
2924 */
2925 if (scan->flags == EXACT || !do_utf8) {
2926 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2927 DEBUG_EXECUTE_r(
2928 PerlIO_printf(Perl_debug_log,
2929 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2930 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2931 );
2932 sayNO_SILENT;
2933 /* NOTREACHED */
2934 }
2935 }
2936 /* FALL THROUGH */
5b47454d 2937 case TRIE:
3dab1dad 2938 {
07be1b83 2939 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 2940 DECL_TRIE_TYPE(scan);
3dab1dad
YO
2941
2942 /* what trie are we using right now */
be8e71aa 2943 reg_trie_data * const trie
f8fc2ecf 2944 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 2945 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 2946 U32 state = trie->startstate;
166ba7cd 2947
3dab1dad
YO
2948 if (trie->bitmap && trie_type != trie_utf8_fold &&
2949 !TRIE_BITMAP_TEST(trie,*locinput)
2950 ) {
2951 if (trie->states[ state ].wordnum) {
2952 DEBUG_EXECUTE_r(
2953 PerlIO_printf(Perl_debug_log,
2954 "%*s %smatched empty string...%s\n",
5bc10b2c 2955 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2956 );
2957 break;
2958 } else {
2959 DEBUG_EXECUTE_r(
2960 PerlIO_printf(Perl_debug_log,
786e8c11 2961 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2962 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2963 );
2964 sayNO_SILENT;
2965 }
2966 }
166ba7cd 2967
786e8c11
YO
2968 {
2969 U8 *uc = ( U8* )locinput;
2970
2971 STRLEN len = 0;
2972 STRLEN foldlen = 0;
2973 U8 *uscan = (U8*)NULL;
2974 STRLEN bufflen=0;
2975 SV *sv_accept_buff = NULL;
2976 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2977
2978 ST.accepted = 0; /* how many accepting states we have seen */
2979 ST.B = next;
2980 ST.jump = trie->jump;
786e8c11 2981 ST.me = scan;
07be1b83
YO
2982 /*
2983 traverse the TRIE keeping track of all accepting states
2984 we transition through until we get to a failing node.
2985 */
2986
a3621e74 2987 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2988 U32 base = trie->states[ state ].trans.base;
f9f4320a 2989 UV uvc = 0;
786e8c11
YO
2990 U16 charid;
2991 /* We use charid to hold the wordnum as we don't use it
2992 for charid until after we have done the wordnum logic.
2993 We define an alias just so that the wordnum logic reads
2994 more naturally. */
2995
2996#define got_wordnum charid
2997 got_wordnum = trie->states[ state ].wordnum;
2998
2999 if ( got_wordnum ) {
3000 if ( ! ST.accepted ) {
5b47454d 3001 ENTER;
6b173516 3002 /* SAVETMPS; */ /* XXX is this necessary? dmq */
5b47454d
DM
3003 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3004 sv_accept_buff=newSV(bufflen *
3005 sizeof(reg_trie_accepted) - 1);
786e8c11 3006 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
3007 SvPOK_on(sv_accept_buff);
3008 sv_2mortal(sv_accept_buff);
166ba7cd
DM
3009 SAVETMPS;
3010 ST.accept_buff =
5b47454d
DM
3011 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3012 }
786e8c11 3013 do {
166ba7cd 3014 if (ST.accepted >= bufflen) {
5b47454d 3015 bufflen *= 2;
166ba7cd 3016 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
3017 SvGROW(sv_accept_buff,
3018 bufflen * sizeof(reg_trie_accepted));
3019 }
3020 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3021 + sizeof(reg_trie_accepted));
a3621e74 3022
786e8c11
YO
3023
3024 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3025 ST.accept_buff[ST.accepted].endpos = uc;
3026 ++ST.accepted;
3027 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3028 }
3029#undef got_wordnum
a3621e74 3030
07be1b83 3031 DEBUG_TRIE_EXECUTE_r({
786e8c11 3032 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 3033 PerlIO_printf( Perl_debug_log,
786e8c11 3034 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 3035 2+depth * 2, "", PL_colors[4],
786e8c11 3036 (UV)state, (UV)ST.accepted );
07be1b83 3037 });
a3621e74
YO
3038
3039 if ( base ) {
55eed653
NC
3040 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3041 uscan, len, uvc, charid, foldlen,
3042 foldbuf, uniflags);
a3621e74 3043
5b47454d
DM
3044 if (charid &&
3045 (base + charid > trie->uniquecharcount )
3046 && (base + charid - 1 - trie->uniquecharcount
3047 < trie->lasttrans)
3048 && trie->trans[base + charid - 1 -
3049 trie->uniquecharcount].check == state)
3050 {
3051 state = trie->trans[base + charid - 1 -
3052 trie->uniquecharcount ].next;
3053 }
3054 else {
3055 state = 0;
3056 }
3057 uc += len;
3058
3059 }
3060 else {
a3621e74
YO
3061 state = 0;
3062 }
3063 DEBUG_TRIE_EXECUTE_r(
e4584336 3064 PerlIO_printf( Perl_debug_log,
786e8c11 3065 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3066 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3067 );
3068 }
166ba7cd 3069 if (!ST.accepted )
a3621e74 3070 sayNO;
a3621e74 3071
166ba7cd
DM
3072 DEBUG_EXECUTE_r(
3073 PerlIO_printf( Perl_debug_log,
3074 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3075 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3076 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3077 );
786e8c11 3078 }}
fae667d5
YO
3079 goto trie_first_try; /* jump into the fail handler */
3080 /* NOTREACHED */
166ba7cd 3081 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
3082 if ( ST.jump) {
3083 REGCP_UNWIND(ST.cp);
3084 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3085 PL_regoffs[n].end = -1;
fae667d5
YO
3086 *PL_reglastparen = n;
3087 }
3088 trie_first_try:
5d458dd8
YO
3089 if (do_cutgroup) {
3090 do_cutgroup = 0;
3091 no_final = 0;
3092 }
fae667d5
YO
3093
3094 if ( ST.jump) {
3095 ST.lastparen = *PL_reglastparen;
3096 REGCP_SET(ST.cp);
3097 }
166ba7cd
DM
3098 if ( ST.accepted == 1 ) {
3099 /* only one choice left - just continue */
3100 DEBUG_EXECUTE_r({
2b8b4781 3101 AV *const trie_words
502c6561 3102 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3103 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3104 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3105 SV *sv= tmp ? sv_newmortal() : NULL;
3106
166ba7cd
DM
3107 PerlIO_printf( Perl_debug_log,
3108 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3109 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3110 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3111 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3112 PL_colors[0], PL_colors[1],
3113 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3114 )
3115 : "not compiled under -Dr",
166ba7cd
DM
3116 PL_colors[5] );
3117 });
3118 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3119 /* in this case we free tmps/leave before we call regmatch
3120 as we wont be using accept_buff again. */
5d458dd8 3121
166ba7cd
DM
3122 locinput = PL_reginput;
3123 nextchr = UCHARAT(locinput);
5d458dd8
YO
3124 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3125 scan = ST.B;
3126 else
3127 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3128 if (!has_cutgroup) {
3129 FREETMPS;
3130 LEAVE;
3131 } else {
3132 ST.accepted--;
3133 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3134 }
786e8c11 3135
166ba7cd
DM
3136 continue; /* execute rest of RE */
3137 }
fae667d5
YO
3138
3139 if ( !ST.accepted-- ) {
5d458dd8
YO
3140 DEBUG_EXECUTE_r({
3141 PerlIO_printf( Perl_debug_log,
3142 "%*s %sTRIE failed...%s\n",
3143 REPORT_CODE_OFF+depth*2, "",
3144 PL_colors[4],
3145 PL_colors[5] );
3146 });
166ba7cd
DM
3147 FREETMPS;
3148 LEAVE;
5d458dd8 3149 sayNO_SILENT;
fae667d5
YO
3150 /*NOTREACHED*/
3151 }
166ba7cd 3152
a3621e74 3153 /*
166ba7cd
DM
3154 There are at least two accepting states left. Presumably
3155 the number of accepting states is going to be low,
3156 typically two. So we simply scan through to find the one
3157 with lowest wordnum. Once we find it, we swap the last
3158 state into its place and decrement the size. We then try to
3159 match the rest of the pattern at the point where the word
3160 ends. If we succeed, control just continues along the
3161 regex; if we fail we return here to try the next accepting
3162 state
3163 */
a3621e74 3164
166ba7cd
DM
3165 {
3166 U32 best = 0;
3167 U32 cur;
3168 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3169 DEBUG_TRIE_EXECUTE_r(
f2278c82 3170 PerlIO_printf( Perl_debug_log,
166ba7cd 3171 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3172 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3173 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3174 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3175 );
3176
3177 if (ST.accept_buff[cur].wordnum <
3178 ST.accept_buff[best].wordnum)
3179 best = cur;
a3621e74 3180 }
166ba7cd
DM
3181
3182 DEBUG_EXECUTE_r({
2b8b4781 3183 AV *const trie_words
502c6561 3184 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3185 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3186 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3187 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3188 ST.B :
7f69552c 3189 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3190 SV *sv= tmp ? sv_newmortal() : NULL;
3191
265c4333
YO
3192 PerlIO_printf( Perl_debug_log,
3193 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3194 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3195 ST.accept_buff[best].wordnum,
de734bd5
A
3196 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3197 PL_colors[0], PL_colors[1],
3198 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3199 ) : "not compiled under -Dr",
265c4333 3200 REG_NODE_NUM(nextop),
166ba7cd
DM
3201 PL_colors[5] );
3202 });
3203
3204 if ( best<ST.accepted ) {
3205 reg_trie_accepted tmp = ST.accept_buff[ best ];
3206 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3207 ST.accept_buff[ ST.accepted ] = tmp;
3208 best = ST.accepted;
a3621e74 3209 }
166ba7cd 3210 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3211 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3212 scan = ST.B;
786e8c11 3213 } else {
5d458dd8 3214 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
5d458dd8 3215 }
6b173516 3216 PUSH_YES_STATE_GOTO(TRIE_next, scan);
786e8c11 3217 /* NOTREACHED */
166ba7cd 3218 }
166ba7cd 3219 /* NOTREACHED */
5d458dd8
YO
3220 case TRIE_next:
3221 FREETMPS;
3222 LEAVE;
3223 sayYES;
166ba7cd
DM
3224#undef ST
3225
95b24440
DM
3226 case EXACT: {
3227 char *s = STRING(scan);
24d3c4a9 3228 ln = STR_LEN(scan);
eb160463 3229 if (do_utf8 != UTF) {
bc517b45 3230 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3231 char *l = locinput;
24d3c4a9 3232 const char * const e = s + ln;
a72c7584 3233
5ff6fc6d
JH
3234 if (do_utf8) {
3235 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3236 while (s < e) {
a3b680e6 3237 STRLEN ulen;
1aa99e6b 3238 if (l >= PL_regeol)
5ff6fc6d
JH
3239 sayNO;
3240 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3241 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3242 uniflags))
5ff6fc6d 3243 sayNO;
bc517b45 3244 l += ulen;
5ff6fc6d 3245 s ++;
1aa99e6b 3246 }
5ff6fc6d
JH
3247 }
3248 else {
3249 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3250 while (s < e) {
a3b680e6 3251 STRLEN ulen;
1aa99e6b
IH
3252 if (l >= PL_regeol)
3253 sayNO;
5ff6fc6d 3254 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3255 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3256 uniflags))
1aa99e6b 3257 sayNO;
bc517b45 3258 s += ulen;
a72c7584 3259 l ++;
1aa99e6b 3260 }
5ff6fc6d 3261 }
1aa99e6b
IH
3262 locinput = l;
3263 nextchr = UCHARAT(locinput);
3264 break;
3265 }
bc517b45 3266 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3267 /* Inline the first character, for speed. */
3268 if (UCHARAT(s) != nextchr)
3269 sayNO;
24d3c4a9 3270 if (PL_regeol - locinput < ln)
d6a28714 3271 sayNO;
24d3c4a9 3272 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3273 sayNO;
24d3c4a9 3274 locinput += ln;
d6a28714
JH
3275 nextchr = UCHARAT(locinput);
3276 break;
95b24440 3277 }
d6a28714 3278 case EXACTFL:
b8c5462f
JH
3279 PL_reg_flags |= RF_tainted;
3280 /* FALL THROUGH */
95b24440 3281 case EXACTF: {
be8e71aa 3282 char * const s = STRING(scan);
24d3c4a9 3283 ln = STR_LEN(scan);
d6a28714 3284
d07ddd77
JH
3285 if (do_utf8 || UTF) {
3286 /* Either target or the pattern are utf8. */
be8e71aa 3287 const char * const l = locinput;
d07ddd77 3288 char *e = PL_regeol;
bc517b45 3289
24d3c4a9 3290 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3291 l, &e, 0, do_utf8)) {
5486206c
JH
3292 /* One more case for the sharp s:
3293 * pack("U0U*", 0xDF) =~ /ss/i,
3294 * the 0xC3 0x9F are the UTF-8
3295 * byte sequence for the U+00DF. */
e1d1eefb 3296
5486206c 3297 if (!(do_utf8 &&
e1d1eefb 3298 toLOWER(s[0]) == 's' &&
24d3c4a9 3299 ln >= 2 &&
5486206c
JH
3300 toLOWER(s[1]) == 's' &&
3301 (U8)l[0] == 0xC3 &&
3302 e - l >= 2 &&
3303 (U8)l[1] == 0x9F))
3304 sayNO;
3305 }
d07ddd77
JH
3306 locinput = e;
3307 nextchr = UCHARAT(locinput);
3308 break;
a0ed51b3 3309 }
d6a28714 3310
bc517b45
JH
3311 /* Neither the target and the pattern are utf8. */
3312
d6a28714
JH
3313 /* Inline the first character, for speed. */
3314 if (UCHARAT(s) != nextchr &&
3315 UCHARAT(s) != ((OP(scan) == EXACTF)
3316 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3317 sayNO;
24d3c4a9 3318 if (PL_regeol - locinput < ln)
b8c5462f 3319 sayNO;
24d3c4a9
DM
3320 if (ln > 1 && (OP(scan) == EXACTF
3321 ? ibcmp(s, locinput, ln)
3322 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3323 sayNO;
24d3c4a9 3324 locinput += ln;
d6a28714 3325 nextchr = UCHARAT(locinput);
a0d0e21e 3326 break;
95b24440 3327 }
d6a28714 3328 case ANYOF:
ffc61ed2 3329 if (do_utf8) {
9e55ce06
JH
3330 STRLEN inclasslen = PL_regeol - locinput;
3331
32fc9b6a 3332 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3333 goto anyof_fail;
ffc61ed2
JH
3334 if (locinput >= PL_regeol)
3335 sayNO;
0f0076b4 3336 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3337 nextchr = UCHARAT(locinput);
e0f9d4a8 3338 break;
ffc61ed2
JH
3339 }
3340 else {
3341 if (nextchr < 0)
3342 nextchr = UCHARAT(locinput);
32fc9b6a 3343 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3344 goto anyof_fail;
ffc61ed2
JH
3345 if (!nextchr && locinput >= PL_regeol)
3346 sayNO;
3347 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3348 break;
3349 }
262b90c4 3350 anyof_fail:
e0f9d4a8
JH
3351 /* If we might have the case of the German sharp s
3352 * in a casefolding Unicode character class. */
3353
ebc501f0
JH
3354 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3355 locinput += SHARP_S_SKIP;
e0f9d4a8 3356 nextchr = UCHARAT(locinput);
ffc61ed2 3357 }
e0f9d4a8
JH
3358 else
3359 sayNO;
b8c5462f 3360 break;
d6a28714 3361 case ALNUML:
b8c5462f
JH
3362 PL_reg_flags |= RF_tainted;
3363 /* FALL THROUGH */
d6a28714 3364 case ALNUM:
b8c5462f 3365 if (!nextchr)
4633a7c4 3366 sayNO;
ffc61ed2 3367 if (do_utf8) {
1a4fad37 3368 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3369 if (!(OP(scan) == ALNUM
bb7a0f54 3370 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3371 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3372 {
3373 sayNO;
a0ed51b3 3374 }
b8c5462f 3375 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3376 nextchr = UCHARAT(locinput);
3377 break;
3378 }
ffc61ed2 3379 if (!(OP(scan) == ALNUM
d6a28714 3380 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3381 sayNO;
b8c5462f 3382 nextchr = UCHARAT(++locinput);
a0d0e21e 3383 break;
d6a28714 3384 case NALNUML:
b8c5462f
JH
3385 PL_reg_flags |= RF_tainted;
3386 /* FALL THROUGH */
d6a28714
JH
3387 case NALNUM:
3388 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3389 sayNO;
ffc61ed2 3390 if (do_utf8) {
1a4fad37 3391 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3392 if (OP(scan) == NALNUM
bb7a0f54 3393 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3394 : isALNUM_LC_utf8((U8*)locinput))
3395 {
b8c5462f 3396 sayNO;
d6a28714 3397 }
b8c5462f
JH
3398 locinput += PL_utf8skip[nextchr];
3399 nextchr = UCHARAT(locinput);
3400 break;
3401 }
ffc61ed2 3402 if (OP(scan) == NALNUM
d6a28714 3403 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3404 sayNO;
76e3520e 3405 nextchr = UCHARAT(++locinput);
a0d0e21e 3406 break;
d6a28714
JH
3407 case BOUNDL:
3408 case NBOUNDL:
3280af22 3409 PL_reg_flags |= RF_tainted;
bbce6d69 3410 /* FALL THROUGH */
d6a28714
JH
3411 case BOUND:
3412 case NBOUND:
3413 /* was last char in word? */
ffc61ed2 3414 if (do_utf8) {
12d33761 3415 if (locinput == PL_bostr)
24d3c4a9 3416 ln = '\n';
ffc61ed2 3417 else {
a3b680e6 3418 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3419
24d3c4a9 3420 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3421 }
3422 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3423 ln = isALNUM_uni(ln);
1a4fad37 3424 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3425 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3426 }
3427 else {
24d3c4a9 3428 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3429 n = isALNUM_LC_utf8((U8*)locinput);
3430 }
a0ed51b3 3431 }
d6a28714 3432 else {
24d3c4a9 3433 ln = (locinput != PL_bostr) ?
12d33761 3434 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3435 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3436 ln = isALNUM(ln);
ffc61ed2
JH
3437 n = isALNUM(nextchr);
3438 }
3439 else {
24d3c4a9 3440 ln = isALNUM_LC(ln);
ffc61ed2
JH
3441 n = isALNUM_LC(nextchr);
3442 }
d6a28714 3443 }
24d3c4a9 3444 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3445 OP(scan) == BOUNDL))
3446 sayNO;
a0ed51b3 3447 break;
d6a28714 3448 case SPACEL:
3280af22 3449 PL_reg_flags |= RF_tainted;
bbce6d69 3450 /* FALL THROUGH */
d6a28714 3451 case SPACE:
9442cb0e 3452 if (!nextchr)
4633a7c4 3453 sayNO;
1aa99e6b 3454 if (do_utf8) {
fd400ab9 3455 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3456 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3457 if (!(OP(scan) == SPACE
bb7a0f54 3458 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3459 : isSPACE_LC_utf8((U8*)locinput)))
3460 {
3461 sayNO;
3462 }
3463 locinput += PL_utf8skip[nextchr];
3464 nextchr = UCHARAT(locinput);
3465 break;
d6a28714 3466 }
ffc61ed2
JH
3467 if (!(OP(scan) == SPACE
3468 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3469 sayNO;
3470 nextchr = UCHARAT(++locinput);
3471 }
3472 else {
3473 if (!(OP(scan) == SPACE
3474 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3475 sayNO;
3476 nextchr = UCHARAT(++locinput);
a0ed51b3 3477 }
a0ed51b3 3478 break;
d6a28714 3479 case NSPACEL:
3280af22 3480 PL_reg_flags |= RF_tainted;
bbce6d69 3481 /* FALL THROUGH */
d6a28714 3482 case NSPACE:
9442cb0e 3483 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3484 sayNO;
1aa99e6b 3485 if (do_utf8) {
1a4fad37 3486 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3487 if (OP(scan) == NSPACE
bb7a0f54 3488 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3489 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3490 {
3491 sayNO;
3492 }
3493 locinput += PL_utf8skip[nextchr];
3494 nextchr = UCHARAT(locinput);
3495 break;
a0ed51b3 3496 }
ffc61ed2 3497 if (OP(scan) == NSPACE
d6a28714 3498 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3499 sayNO;
76e3520e 3500 nextchr = UCHARAT(++locinput);
a0d0e21e 3501 break;
d6a28714 3502 case DIGITL:
a0ed51b3
LW
3503 PL_reg_flags |= RF_tainted;
3504 /* FALL THROUGH */
d6a28714 3505 case DIGIT:
9442cb0e 3506 if (!nextchr)
a0ed51b3 3507 sayNO;
1aa99e6b 3508 if (do_utf8) {
1a4fad37 3509 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3510 if (!(OP(scan) == DIGIT
bb7a0f54 3511 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3512 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3513 {
a0ed51b3 3514 sayNO;
dfe13c55 3515 }
6f06b55f 3516 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3517 nextchr = UCHARAT(locinput);
3518 break;
3519 }
ffc61ed2 3520 if (!(OP(scan) == DIGIT
9442cb0e 3521 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3522 sayNO;
3523 nextchr = UCHARAT(++locinput);
3524 break;
d6a28714 3525 case NDIGITL:
b8c5462f
JH
3526 PL_reg_flags |= RF_tainted;
3527 /* FALL THROUGH */
d6a28714 3528 case NDIGIT:
9442cb0e 3529 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3530 sayNO;
1aa99e6b 3531 if (do_utf8) {
1a4fad37 3532 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3533 if (OP(scan) == NDIGIT
bb7a0f54 3534 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3535 : isDIGIT_LC_utf8((U8*)locinput))
3536 {
a0ed51b3 3537 sayNO;
9442cb0e 3538 }
6f06b55f 3539 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3540 nextchr = UCHARAT(locinput);
3541 break;
3542 }
ffc61ed2 3543 if (OP(scan) == NDIGIT
9442cb0e 3544 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3545 sayNO;
3546 nextchr = UCHARAT(++locinput);
3547 break;
3548 case CLUMP:
b7c83a7e 3549 if (locinput >= PL_regeol)
a0ed51b3 3550 sayNO;
b7c83a7e 3551 if (do_utf8) {
1a4fad37 3552 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3553 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3554 sayNO;
3555 locinput += PL_utf8skip[nextchr];
3556 while (locinput < PL_regeol &&
3557 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3558 locinput += UTF8SKIP(locinput);
3559 if (locinput > PL_regeol)
3560 sayNO;
eb08e2da
JH
3561 }
3562 else
3563 locinput++;
a0ed51b3
LW
3564 nextchr = UCHARAT(locinput);
3565 break;
81714fb9
YO
3566
3567 case NREFFL:
3568 {
3569 char *s;
ff1157ca 3570 char type;
81714fb9
YO
3571 PL_reg_flags |= RF_tainted;
3572 /* FALL THROUGH */
3573 case NREF:
3574 case NREFF:
ff1157ca 3575 type = OP(scan);
0a4db386
YO
3576 n = reg_check_named_buff_matched(rex,scan);
3577
3578 if ( n ) {
3579 type = REF + ( type - NREF );
3580 goto do_ref;
3581 } else {
81714fb9 3582 sayNO;
0a4db386
YO
3583 }
3584 /* unreached */
c8756f30 3585 case REFFL:
3280af22 3586 PL_reg_flags |= RF_tainted;
c8756f30 3587 /* FALL THROUGH */
c277df42 3588 case REF:
81714fb9 3589 case REFF:
c277df42 3590 n = ARG(scan); /* which paren pair */
81714fb9
YO
3591 type = OP(scan);
3592 do_ref:
f0ab9afb 3593 ln = PL_regoffs[n].start;
2c2d71f5 3594 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 3595 if (*PL_reglastparen < n || ln == -1)
af3f8c16 3596 sayNO; /* Do not match unless seen CLOSEn. */
f0ab9afb 3597 if (ln == PL_regoffs[n].end)
a0d0e21e 3598 break;
a0ed51b3 3599
24d3c4a9 3600 s = PL_bostr + ln;
81714fb9 3601 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3602 char *l = locinput;
f0ab9afb 3603 const char *e = PL_bostr + PL_regoffs[n].end;
a0ed51b3
LW
3604 /*
3605 * Note that we can't do the "other character" lookup trick as
3606 * in the 8-bit case (no pun intended) because in Unicode we
3607 * have to map both upper and title case to lower case.
3608 */
81714fb9 3609 if (type == REFF) {
a0ed51b3 3610 while (s < e) {
a3b680e6
AL
3611 STRLEN ulen1, ulen2;
3612 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3613 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3614
a0ed51b3
LW
3615 if (l >= PL_regeol)
3616 sayNO;
a2a2844f
JH
3617 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3618 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3619 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3620 sayNO;
a2a2844f
JH
3621 s += ulen1;
3622 l += ulen2;
a0ed51b3
LW
3623 }
3624 }
3625 locinput = l;
3626 nextchr = UCHARAT(locinput);
3627 break;
3628 }
3629
a0d0e21e 3630 /* Inline the first character, for speed. */
76e3520e 3631 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3632 (type == REF ||
3633 (UCHARAT(s) != (type == REFF
3634 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3635 sayNO;
f0ab9afb 3636 ln = PL_regoffs[n].end - ln;
24d3c4a9 3637 if (locinput + ln > PL_regeol)
4633a7c4 3638 sayNO;
81714fb9 3639 if (ln > 1 && (type == REF
24d3c4a9 3640 ? memNE(s, locinput, ln)
81714fb9 3641 : (type == REFF
24d3c4a9
DM
3642 ? ibcmp(s, locinput, ln)
3643 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3644 sayNO;
24d3c4a9 3645 locinput += ln;
76e3520e 3646 nextchr = UCHARAT(locinput);
a0d0e21e 3647 break;
81714fb9 3648 }
a0d0e21e 3649 case NOTHING:
c277df42 3650 case TAIL:
a0d0e21e
LW
3651 break;
3652 case BACK:
3653 break;
40a82448
DM
3654
3655#undef ST
3656#define ST st->u.eval
c277df42 3657 {
c277df42 3658 SV *ret;
d2f13c59 3659 REGEXP *re_sv;
6bda09f9 3660 regexp *re;
f8fc2ecf 3661 regexp_internal *rei;
1a147d38
YO
3662 regnode *startpoint;
3663
3664 case GOSTART:
e7707071
YO
3665 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3666 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 3667 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 3668 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 3669 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
3670 Perl_croak(aTHX_
3671 "Pattern subroutine nesting without pos change"
3672 " exceeded limit in regex");
6bda09f9
YO
3673 } else {
3674 nochange_depth = 0;
1a147d38 3675 }
288b8c02 3676 re_sv = rex_sv;
6bda09f9 3677 re = rex;
f8fc2ecf 3678 rei = rexi;
288b8c02 3679 (void)ReREFCNT_inc(rex_sv);
1a147d38 3680 if (OP(scan)==GOSUB) {
6bda09f9
YO
3681 startpoint = scan + ARG2L(scan);
3682 ST.close_paren = ARG(scan);
3683 } else {
f8fc2ecf 3684 startpoint = rei->program+1;
6bda09f9
YO
3685 ST.close_paren = 0;
3686 }
3687 goto eval_recurse_doit;
3688 /* NOTREACHED */
3689 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3690 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 3691 if ( ++nochange_depth > max_nochange_depth )
1a147d38 3692 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
3693 } else {
3694 nochange_depth = 0;
3695 }
8e5e9ebe 3696 {
4aabdb9b
DM
3697 /* execute the code in the {...} */
3698 dSP;
6136c704 3699 SV ** const before = SP;
4aabdb9b
DM
3700 OP_4tree * const oop = PL_op;
3701 COP * const ocurcop = PL_curcop;
3702 PAD *old_comppad;
4aabdb9b
DM
3703
3704 n = ARG(scan);
f8fc2ecf 3705 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
3706 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3707 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f8fc2ecf 3708 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
f0ab9afb 3709 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 3710
2bf803e2
YO
3711 if (sv_yes_mark) {
3712 SV *sv_mrk = get_sv("REGMARK", 1);
3713 sv_setsv(sv_mrk, sv_yes_mark);
3714 }
3715
8e5e9ebe
RGS
3716 CALLRUNOPS(aTHX); /* Scalar context. */
3717 SPAGAIN;
3718 if (SP == before)
075aa684 3719 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3720 else {
3721 ret = POPs;
3722 PUTBACK;
3723 }
4aabdb9b
DM
3724
3725 PL_op = oop;
3726 PAD_RESTORE_LOCAL(old_comppad);
3727 PL_curcop = ocurcop;
24d3c4a9 3728 if (!logical) {
4aabdb9b
DM
3729 /* /(?{...})/ */
3730 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3731 break;
3732 }
8e5e9ebe 3733 }
24d3c4a9
DM
3734 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3735 logical = 0;
4aabdb9b 3736 {
4f639d21
DM
3737 /* extract RE object from returned value; compiling if
3738 * necessary */
6136c704 3739 MAGIC *mg = NULL;
288b8c02 3740 REGEXP *rx = NULL;
5c35adbb
NC
3741
3742 if (SvROK(ret)) {
288b8c02 3743 SV *const sv = SvRV(ret);
5c35adbb
NC
3744
3745 if (SvTYPE(sv) == SVt_REGEXP) {
d2f13c59 3746 rx = (REGEXP*) sv;
5c35adbb
NC
3747 } else if (SvSMAGICAL(sv)) {
3748 mg = mg_find(sv, PERL_MAGIC_qr);
3749 assert(mg);
3750 }
3751 } else if (SvTYPE(ret) == SVt_REGEXP) {
d2f13c59 3752 rx = (REGEXP*) ret;
5c35adbb 3753 } else if (SvSMAGICAL(ret)) {
124ee91a
NC
3754 if (SvGMAGICAL(ret)) {
3755 /* I don't believe that there is ever qr magic
3756 here. */
3757 assert(!mg_find(ret, PERL_MAGIC_qr));
faf82a0b 3758 sv_unmagic(ret, PERL_MAGIC_qr);
124ee91a
NC
3759 }
3760 else {
faf82a0b 3761 mg = mg_find(ret, PERL_MAGIC_qr);
124ee91a
NC
3762 /* testing suggests mg only ends up non-NULL for
3763 scalars who were upgraded and compiled in the
3764 else block below. In turn, this is only
3765 triggered in the "postponed utf8 string" tests
3766 in t/op/pat.t */
3767 }
0f5d15d6 3768 }
faf82a0b 3769
0f5d15d6 3770 if (mg) {
d2f13c59 3771 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
e2560c33 3772 assert(rx);
0f5d15d6 3773 }
288b8c02
NC
3774 if (rx) {
3775 rx = reg_temp_copy(rx);
3776 }
0f5d15d6 3777 else {
c737faaf 3778 U32 pm_flags = 0;
a3b680e6 3779 const I32 osize = PL_regsize;
0f5d15d6 3780
b9ad30b4
NC
3781 if (DO_UTF8(ret)) {
3782 assert (SvUTF8(ret));
3783 } else if (SvUTF8(ret)) {
3784 /* Not doing UTF-8, despite what the SV says. Is
3785 this only if we're trapped in use 'bytes'? */
3786 /* Make a copy of the octet sequence, but without
3787 the flag on, as the compiler now honours the
3788 SvUTF8 flag on ret. */
3789 STRLEN len;
3790 const char *const p = SvPV(ret, len);
3791 ret = newSVpvn_flags(p, len, SVs_TEMP);
3792 }
288b8c02 3793 rx = CALLREGCOMP(ret, pm_flags);
9041c2e3 3794 if (!(SvFLAGS(ret)
faf82a0b 3795 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 3796 | SVs_GMG))) {
a2794585
NC
3797 /* This isn't a first class regexp. Instead, it's
3798 caching a regexp onto an existing, Perl visible
3799 scalar. */
ad64d0ec 3800 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3ce3ed55 3801 }
0f5d15d6 3802 PL_regsize = osize;
0f5d15d6 3803 }
288b8c02
NC
3804 re_sv = rx;
3805 re = (struct regexp *)SvANY(rx);
4aabdb9b 3806 }
07bc277f 3807 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
3808 re->subbeg = rex->subbeg;
3809 re->sublen = rex->sublen;
f8fc2ecf 3810 rei = RXi_GET(re);
6bda09f9 3811 DEBUG_EXECUTE_r(
efd26800 3812 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
6bda09f9
YO
3813 "Matching embedded");
3814 );
f8fc2ecf 3815 startpoint = rei->program + 1;
1a147d38 3816 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
3817 /* borrowed from regtry */
3818 if (PL_reg_start_tmpl <= re->nparens) {
3819 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3820 if(PL_reg_start_tmp)
3821 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3822 else
3823 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 3824 }
aa283a38 3825
1a147d38 3826 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 3827 /* run the pattern returned from (??{...}) */
40a82448
DM
3828 ST.cp = regcppush(0); /* Save *all* the positions. */
3829 REGCP_SET(ST.lastcp);
6bda09f9 3830
f0ab9afb 3831 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
6bda09f9 3832
0357f1fd
ML
3833 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
3834 PL_reglastparen = &re->lastparen;
3835 PL_reglastcloseparen = &re->lastcloseparen;
3836 re->lastparen = 0;
3837 re->lastcloseparen = 0;
3838
4aabdb9b 3839 PL_reginput = locinput;
ae0beba1 3840 PL_regsize = 0;
4aabdb9b
DM
3841
3842 /* XXXX This is too dramatic a measure... */
3843 PL_reg_maxiter = 0;
3844
faec1544 3845 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 3846 if (RX_UTF8(re_sv))
faec1544
DM
3847 PL_reg_flags |= RF_utf8;
3848 else
3849 PL_reg_flags &= ~RF_utf8;
3850 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3851
288b8c02 3852 ST.prev_rex = rex_sv;
faec1544 3853 ST.prev_curlyx = cur_curlyx;
288b8c02
NC
3854 SETREX(rex_sv,re_sv);
3855 rex = re;
f8fc2ecf 3856 rexi = rei;
faec1544 3857 cur_curlyx = NULL;
40a82448 3858 ST.B = next;
faec1544
DM
3859 ST.prev_eval = cur_eval;
3860 cur_eval = st;
faec1544 3861 /* now continue from first node in postoned RE */
6bda09f9 3862 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3863 /* NOTREACHED */
a0ed51b3 3864 }
24d3c4a9
DM
3865 /* logical is 1, /(?(?{...})X|Y)/ */
3866 sw = (bool)SvTRUE(ret);
3867 logical = 0;
c277df42
IZ
3868 break;
3869 }
40a82448 3870
faec1544
DM
3871 case EVAL_AB: /* cleanup after a successful (??{A})B */
3872 /* note: this is called twice; first after popping B, then A */
3873 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
3874 ReREFCNT_dec(rex_sv);
3875 SETREX(rex_sv,ST.prev_rex);
3876 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3877 rexi = RXi_GET(rex);
faec1544
DM
3878 regcpblow(ST.cp);
3879 cur_eval = ST.prev_eval;
3880 cur_curlyx = ST.prev_curlyx;
0357f1fd
ML
3881
3882 PL_reglastparen = &rex->lastparen;
3883 PL_reglastcloseparen = &rex->lastcloseparen;
3884
40a82448
DM
3885 /* XXXX This is too dramatic a measure... */
3886 PL_reg_maxiter = 0;
e7707071 3887 if ( nochange_depth )
4b196cd4 3888 nochange_depth--;
262b90c4 3889 sayYES;
40a82448 3890
40a82448 3891
faec1544
DM
3892 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3893 /* note: this is called twice; first after popping B, then A */
3894 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
3895 ReREFCNT_dec(rex_sv);
3896 SETREX(rex_sv,ST.prev_rex);
3897 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3898 rexi = RXi_GET(rex);
0357f1fd
ML
3899 PL_reglastparen = &rex->lastparen;
3900 PL_reglastcloseparen = &rex->lastcloseparen;
3901
40a82448
DM
3902 PL_reginput = locinput;
3903 REGCP_UNWIND(ST.lastcp);
3904 regcppop(rex);
faec1544
DM
3905 cur_eval = ST.prev_eval;
3906 cur_curlyx = ST.prev_curlyx;
3907 /* XXXX This is too dramatic a measure... */
3908 PL_reg_maxiter = 0;
e7707071 3909 if ( nochange_depth )
4b196cd4 3910 nochange_depth--;
40a82448 3911 sayNO_SILENT;
40a82448
DM
3912#undef ST
3913
a0d0e21e 3914 case OPEN:
c277df42 3915 n = ARG(scan); /* which paren pair */
3280af22
NIS
3916 PL_reg_start_tmp[n] = locinput;
3917 if (n > PL_regsize)
3918 PL_regsize = n;
e2e6a0f1 3919 lastopen = n;
a0d0e21e
LW
3920 break;
3921 case CLOSE:
c277df42 3922 n = ARG(scan); /* which paren pair */
f0ab9afb
NC
3923 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3924 PL_regoffs[n].end = locinput - PL_bostr;
7f69552c
YO
3925 /*if (n > PL_regsize)
3926 PL_regsize = n;*/
3b6647e0 3927 if (n > *PL_reglastparen)
3280af22 3928 *PL_reglastparen = n;
a01268b5 3929 *PL_reglastcloseparen = n;
3b6647e0 3930 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
3931 goto fake_end;
3932 }
a0d0e21e 3933 break;
e2e6a0f1
YO
3934 case ACCEPT:
3935 if (ARG(scan)){
3936 regnode *cursor;
3937 for (cursor=scan;
3938 cursor && OP(cursor)!=END;
3939 cursor=regnext(cursor))
3940 {
3941 if ( OP(cursor)==CLOSE ){
3942 n = ARG(cursor);
3943 if ( n <= lastopen ) {
f0ab9afb
NC
3944 PL_regoffs[n].start
3945 = PL_reg_start_tmp[n] - PL_bostr;
3946 PL_regoffs[n].end = locinput - PL_bostr;
e2e6a0f1
YO
3947 /*if (n > PL_regsize)
3948 PL_regsize = n;*/
3b6647e0 3949 if (n > *PL_reglastparen)
e2e6a0f1
YO
3950 *PL_reglastparen = n;
3951 *PL_reglastcloseparen = n;
3b6647e0
RB
3952 if ( n == ARG(scan) || (cur_eval &&
3953 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
3954 break;
3955 }
3956 }
3957 }
3958 }
3959 goto fake_end;
3960 /*NOTREACHED*/
c277df42
IZ
3961 case GROUPP:
3962 n = ARG(scan); /* which paren pair */
f0ab9afb 3963 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
c277df42 3964 break;
0a4db386
YO
3965 case NGROUPP:
3966 /* reg_check_named_buff_matched returns 0 for no match */
3967 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3968 break;
1a147d38 3969 case INSUBP:
0a4db386 3970 n = ARG(scan);
3b6647e0 3971 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
3972 break;
3973 case DEFINEP:
3974 sw = 0;
3975 break;
c277df42 3976 case IFTHEN:
2c2d71f5 3977 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3978 if (sw)
c277df42
IZ
3979 next = NEXTOPER(NEXTOPER(scan));
3980 else {
3981 next = scan + ARG(scan);
3982 if (OP(next) == IFTHEN) /* Fake one. */
3983 next = NEXTOPER(NEXTOPER(next));
3984 }
3985 break;
3986 case LOGICAL:
24d3c4a9 3987 logical = scan->flags;
c277df42 3988 break;
c476f425 3989
2ab05381 3990/*******************************************************************
2ab05381 3991
c476f425
DM
3992The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3993pattern, where A and B are subpatterns. (For simple A, CURLYM or
3994STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3995
c476f425 3996A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 3997
c476f425
DM
3998On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3999state, which contains the current count, initialised to -1. It also sets
4000cur_curlyx to point to this state, with any previous value saved in the
4001state block.
2ab05381 4002
c476f425
DM
4003CURLYX then jumps straight to the WHILEM op, rather than executing A,
4004since the pattern may possibly match zero times (i.e. it's a while {} loop
4005rather than a do {} while loop).
2ab05381 4006
c476f425
DM
4007Each entry to WHILEM represents a successful match of A. The count in the
4008CURLYX block is incremented, another WHILEM state is pushed, and execution
4009passes to A or B depending on greediness and the current count.
2ab05381 4010
c476f425
DM
4011For example, if matching against the string a1a2a3b (where the aN are
4012substrings that match /A/), then the match progresses as follows: (the
4013pushed states are interspersed with the bits of strings matched so far):
2ab05381 4014
c476f425
DM
4015 <CURLYX cnt=-1>
4016 <CURLYX cnt=0><WHILEM>
4017 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4018 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4019 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4020 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4021
c476f425
DM
4022(Contrast this with something like CURLYM, which maintains only a single
4023backtrack state:
2ab05381 4024
c476f425
DM
4025 <CURLYM cnt=0> a1
4026 a1 <CURLYM cnt=1> a2
4027 a1 a2 <CURLYM cnt=2> a3
4028 a1 a2 a3 <CURLYM cnt=3> b
4029)
2ab05381 4030
c476f425
DM
4031Each WHILEM state block marks a point to backtrack to upon partial failure
4032of A or B, and also contains some minor state data related to that
4033iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4034overall state, such as the count, and pointers to the A and B ops.
2ab05381 4035
c476f425
DM
4036This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4037must always point to the *current* CURLYX block, the rules are:
2ab05381 4038
c476f425
DM
4039When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4040and set cur_curlyx to point the new block.
2ab05381 4041
c476f425
DM
4042When popping the CURLYX block after a successful or unsuccessful match,
4043restore the previous cur_curlyx.
2ab05381 4044
c476f425
DM
4045When WHILEM is about to execute B, save the current cur_curlyx, and set it
4046to the outer one saved in the CURLYX block.
2ab05381 4047
c476f425
DM
4048When popping the WHILEM block after a successful or unsuccessful B match,
4049restore the previous cur_curlyx.
2ab05381 4050
c476f425
DM
4051Here's an example for the pattern (AI* BI)*BO
4052I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4053
c476f425
DM
4054cur_
4055curlyx backtrack stack
4056------ ---------------
4057NULL
4058CO <CO prev=NULL> <WO>
4059CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4060CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4061NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4062
c476f425
DM
4063At this point the pattern succeeds, and we work back down the stack to
4064clean up, restoring as we go:
95b24440 4065
c476f425
DM
4066CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4067CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4068CO <CO prev=NULL> <WO>
4069NULL
a0374537 4070
c476f425
DM
4071*******************************************************************/
4072
4073#define ST st->u.curlyx
4074
4075 case CURLYX: /* start of /A*B/ (for complex A) */
4076 {
4077 /* No need to save/restore up to this paren */
4078 I32 parenfloor = scan->flags;
4079
4080 assert(next); /* keep Coverity happy */
4081 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4082 next += ARG(next);
4083
4084 /* XXXX Probably it is better to teach regpush to support
4085 parenfloor > PL_regsize... */
4086 if (parenfloor > (I32)*PL_reglastparen)
4087 parenfloor = *PL_reglastparen; /* Pessimization... */
4088
4089 ST.prev_curlyx= cur_curlyx;
4090 cur_curlyx = st;
4091 ST.cp = PL_savestack_ix;
4092
4093 /* these fields contain the state of the current curly.
4094 * they are accessed by subsequent WHILEMs */
4095 ST.parenfloor = parenfloor;
4096 ST.min = ARG1(scan);
4097 ST.max = ARG2(scan);
4098 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4099 ST.B = next;
24d3c4a9
DM
4100 ST.minmod = minmod;
4101 minmod = 0;
c476f425
DM
4102 ST.count = -1; /* this will be updated by WHILEM */
4103 ST.lastloc = NULL; /* this will be updated by WHILEM */
4104
4105 PL_reginput = locinput;
4106 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 4107 /* NOTREACHED */
c476f425 4108 }
a0d0e21e 4109
c476f425 4110 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4111 cur_curlyx = ST.prev_curlyx;
4112 sayYES;
4113 /* NOTREACHED */
a0d0e21e 4114
c476f425
DM
4115 case CURLYX_end_fail: /* just failed to match all of A*B */
4116 regcpblow(ST.cp);
4117 cur_curlyx = ST.prev_curlyx;
4118 sayNO;
4119 /* NOTREACHED */
4633a7c4 4120
a0d0e21e 4121
c476f425
DM
4122#undef ST
4123#define ST st->u.whilem
4124
4125 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4126 {
4127 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
4128 I32 n;
4129 assert(cur_curlyx); /* keep Coverity happy */
4130 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4131 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4132 ST.cache_offset = 0;
4133 ST.cache_mask = 0;
4134
4135 PL_reginput = locinput;
4136
4137 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4138 "%*s whilem: matched %ld out of %ld..%ld\n",
4139 REPORT_CODE_OFF+depth*2, "", (long)n,
4140 (long)cur_curlyx->u.curlyx.min,
4141 (long)cur_curlyx->u.curlyx.max)
4142 );
a0d0e21e 4143
c476f425 4144 /* First just match a string of min A's. */
a0d0e21e 4145
c476f425
DM
4146 if (n < cur_curlyx->u.curlyx.min) {
4147 cur_curlyx->u.curlyx.lastloc = locinput;
4148 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4149 /* NOTREACHED */
4150 }
4151
4152 /* If degenerate A matches "", assume A done. */
4153
4154 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4155 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4156 "%*s whilem: empty match detected, trying continuation...\n",
4157 REPORT_CODE_OFF+depth*2, "")
4158 );
4159 goto do_whilem_B_max;
4160 }
4161
4162 /* super-linear cache processing */
4163
4164 if (scan->flags) {
a0d0e21e 4165
2c2d71f5 4166 if (!PL_reg_maxiter) {
c476f425
DM
4167 /* start the countdown: Postpone detection until we
4168 * know the match is not *that* much linear. */
2c2d71f5 4169 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4170 /* possible overflow for long strings and many CURLYX's */
4171 if (PL_reg_maxiter < 0)
4172 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4173 PL_reg_leftiter = PL_reg_maxiter;
4174 }
c476f425 4175
2c2d71f5 4176 if (PL_reg_leftiter-- == 0) {
c476f425 4177 /* initialise cache */
3298f257 4178 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4179 if (PL_reg_poscache) {
eb160463 4180 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4181 Renew(PL_reg_poscache, size, char);
4182 PL_reg_poscache_size = size;
4183 }
4184 Zero(PL_reg_poscache, size, char);
4185 }
4186 else {
4187 PL_reg_poscache_size = size;
a02a5408 4188 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4189 }
c476f425
DM
4190 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4191 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4192 PL_colors[4], PL_colors[5])
4193 );
2c2d71f5 4194 }
c476f425 4195
2c2d71f5 4196 if (PL_reg_leftiter < 0) {
c476f425
DM
4197 /* have we already failed at this position? */
4198 I32 offset, mask;
4199 offset = (scan->flags & 0xf) - 1
4200 + (locinput - PL_bostr) * (scan->flags>>4);
4201 mask = 1 << (offset % 8);
4202 offset /= 8;
4203 if (PL_reg_poscache[offset] & mask) {
4204 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4205 "%*s whilem: (cache) already tried at this position...\n",
4206 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4207 );
3298f257 4208 sayNO; /* cache records failure */
2c2d71f5 4209 }
c476f425
DM
4210 ST.cache_offset = offset;
4211 ST.cache_mask = mask;
2c2d71f5 4212 }
c476f425 4213 }
2c2d71f5 4214
c476f425 4215 /* Prefer B over A for minimal matching. */
a687059c 4216
c476f425
DM
4217 if (cur_curlyx->u.curlyx.minmod) {
4218 ST.save_curlyx = cur_curlyx;
4219 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4220 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4221 REGCP_SET(ST.lastcp);
4222 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4223 /* NOTREACHED */
4224 }
a0d0e21e 4225
c476f425
DM
4226 /* Prefer A over B for maximal matching. */
4227
4228 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4229 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4230 cur_curlyx->u.curlyx.lastloc = locinput;
4231 REGCP_SET(ST.lastcp);
4232 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4233 /* NOTREACHED */
4234 }
4235 goto do_whilem_B_max;
4236 }
4237 /* NOTREACHED */
4238
4239 case WHILEM_B_min: /* just matched B in a minimal match */
4240 case WHILEM_B_max: /* just matched B in a maximal match */
4241 cur_curlyx = ST.save_curlyx;
4242 sayYES;
4243 /* NOTREACHED */
4244
4245 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4246 cur_curlyx = ST.save_curlyx;
4247 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4248 cur_curlyx->u.curlyx.count--;
4249 CACHEsayNO;
4250 /* NOTREACHED */
4251
4252 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4253 REGCP_UNWIND(ST.lastcp);
4254 regcppop(rex);
4255 /* FALL THROUGH */
4256 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4257 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4258 cur_curlyx->u.curlyx.count--;
4259 CACHEsayNO;
4260 /* NOTREACHED */
4261
4262 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4263 REGCP_UNWIND(ST.lastcp);
4264 regcppop(rex); /* Restore some previous $<digit>s? */
4265 PL_reginput = locinput;
4266 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4267 "%*s whilem: failed, trying continuation...\n",
4268 REPORT_CODE_OFF+depth*2, "")
4269 );
4270 do_whilem_B_max:
4271 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4272 && ckWARN(WARN_REGEXP)
4273 && !(PL_reg_flags & RF_warned))
4274 {
4275 PL_reg_flags |= RF_warned;
4276 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4277 "Complex regular subexpression recursion",
4278 REG_INFTY - 1);
4279 }
4280
4281 /* now try B */
4282 ST.save_curlyx = cur_curlyx;
4283 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4284 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4285 /* NOTREACHED */
4286
4287 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4288 cur_curlyx = ST.save_curlyx;
4289 REGCP_UNWIND(ST.lastcp);
4290 regcppop(rex);
4291
4292 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4293 /* Maximum greed exceeded */
4294 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4295 && ckWARN(WARN_REGEXP)
4296 && !(PL_reg_flags & RF_warned))
4297 {
3280af22 4298 PL_reg_flags |= RF_warned;
c476f425
DM
4299 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4300 "%s limit (%d) exceeded",
4301 "Complex regular subexpression recursion",
4302 REG_INFTY - 1);
a0d0e21e 4303 }
c476f425 4304 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4305 CACHEsayNO;
a0d0e21e 4306 }
c476f425
DM
4307
4308 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4309 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4310 );
4311 /* Try grabbing another A and see if it helps. */
4312 PL_reginput = locinput;
4313 cur_curlyx->u.curlyx.lastloc = locinput;
4314 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4315 REGCP_SET(ST.lastcp);
4316 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 4317 /* NOTREACHED */
40a82448
DM
4318
4319#undef ST
4320#define ST st->u.branch
4321
4322 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4323 next = scan + ARG(scan);
4324 if (next == scan)
4325 next = NULL;
40a82448
DM
4326 scan = NEXTOPER(scan);
4327 /* FALL THROUGH */
c277df42 4328
40a82448
DM
4329 case BRANCH: /* /(...|A|...)/ */
4330 scan = NEXTOPER(scan); /* scan now points to inner node */
40a82448
DM
4331 ST.lastparen = *PL_reglastparen;
4332 ST.next_branch = next;
4333 REGCP_SET(ST.cp);
4334 PL_reginput = locinput;
02db2b7b 4335
40a82448 4336 /* Now go into the branch */
5d458dd8
YO
4337 if (has_cutgroup) {
4338 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4339 } else {
4340 PUSH_STATE_GOTO(BRANCH_next, scan);
4341 }
40a82448 4342 /* NOTREACHED */
5d458dd8
YO
4343 case CUTGROUP:
4344 PL_reginput = locinput;
4345 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 4346 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
4347 PUSH_STATE_GOTO(CUTGROUP_next,next);
4348 /* NOTREACHED */
4349 case CUTGROUP_next_fail:
4350 do_cutgroup = 1;
4351 no_final = 1;
4352 if (st->u.mark.mark_name)
4353 sv_commit = st->u.mark.mark_name;
4354 sayNO;
4355 /* NOTREACHED */
4356 case BRANCH_next:
4357 sayYES;
4358 /* NOTREACHED */
40a82448 4359 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4360 if (do_cutgroup) {
4361 do_cutgroup = 0;
4362 no_final = 0;
4363 }
40a82448
DM
4364 REGCP_UNWIND(ST.cp);
4365 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 4366 PL_regoffs[n].end = -1;
40a82448 4367 *PL_reglastparen = n;
0a4db386 4368 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4369 scan = ST.next_branch;
4370 /* no more branches? */
5d458dd8
YO
4371 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4372 DEBUG_EXECUTE_r({
4373 PerlIO_printf( Perl_debug_log,
4374 "%*s %sBRANCH failed...%s\n",
4375 REPORT_CODE_OFF+depth*2, "",
4376 PL_colors[4],
4377 PL_colors[5] );
4378 });
4379 sayNO_SILENT;
4380 }
40a82448
DM
4381 continue; /* execute next BRANCH[J] op */
4382 /* NOTREACHED */
4383
a0d0e21e 4384 case MINMOD:
24d3c4a9 4385 minmod = 1;
a0d0e21e 4386 break;
40a82448
DM
4387
4388#undef ST
4389#define ST st->u.curlym
4390
4391 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4392
4393 /* This is an optimisation of CURLYX that enables us to push
4394 * only a single backtracking state, no matter now many matches
4395 * there are in {m,n}. It relies on the pattern being constant
4396 * length, with no parens to influence future backrefs
4397 */
4398
4399 ST.me = scan;
dc45a647 4400 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4401
4402 /* if paren positive, emulate an OPEN/CLOSE around A */
4403 if (ST.me->flags) {
3b6647e0 4404 U32 paren = ST.me->flags;
40a82448
DM
4405 if (paren > PL_regsize)
4406 PL_regsize = paren;
3b6647e0 4407 if (paren > *PL_reglastparen)
40a82448 4408 *PL_reglastparen = paren;
c277df42 4409 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4410 }
40a82448
DM
4411 ST.A = scan;
4412 ST.B = next;
4413 ST.alen = 0;
4414 ST.count = 0;
24d3c4a9
DM
4415 ST.minmod = minmod;
4416 minmod = 0;
40a82448
DM
4417 ST.c1 = CHRTEST_UNINIT;
4418 REGCP_SET(ST.cp);
6407bf3b 4419
40a82448
DM
4420 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4421 goto curlym_do_B;
4422
4423 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4424 PL_reginput = locinput;
40a82448
DM
4425 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4426 /* NOTREACHED */
5f80c4cf 4427
40a82448
DM
4428 case CURLYM_A: /* we've just matched an A */
4429 locinput = st->locinput;
4430 nextchr = UCHARAT(locinput);
4431
4432 ST.count++;
4433 /* after first match, determine A's length: u.curlym.alen */
4434 if (ST.count == 1) {
4435 if (PL_reg_match_utf8) {
4436 char *s = locinput;
4437 while (s < PL_reginput) {
4438 ST.alen++;
4439 s += UTF8SKIP(s);
4440 }
4441 }
4442 else {
4443 ST.alen = PL_reginput - locinput;
4444 }
4445 if (ST.alen == 0)
4446 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4447 }
0cadcf80
DM
4448 DEBUG_EXECUTE_r(
4449 PerlIO_printf(Perl_debug_log,
40a82448 4450 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4451 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4452 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4453 );
4454
40a82448 4455 locinput = PL_reginput;
0a4db386
YO
4456
4457 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4458 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4459 goto fake_end;
4460
4461 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
40a82448
DM
4462 goto curlym_do_A; /* try to match another A */
4463 goto curlym_do_B; /* try to match B */
5f80c4cf 4464
40a82448
DM
4465 case CURLYM_A_fail: /* just failed to match an A */
4466 REGCP_UNWIND(ST.cp);
0a4db386
YO
4467
4468 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4469 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4470 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 4471 sayNO;
0cadcf80 4472
40a82448
DM
4473 curlym_do_B: /* execute the B in /A{m,n}B/ */
4474 PL_reginput = locinput;
4475 if (ST.c1 == CHRTEST_UNINIT) {
4476 /* calculate c1 and c2 for possible match of 1st char
4477 * following curly */
4478 ST.c1 = ST.c2 = CHRTEST_VOID;
4479 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4480 regnode *text_node = ST.B;
4481 if (! HAS_TEXT(text_node))
4482 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
4483 /* this used to be
4484
4485 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4486
4487 But the former is redundant in light of the latter.
4488
4489 if this changes back then the macro for
4490 IS_TEXT and friends need to change.
4491 */
4492 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 4493 {
ee9b8eae 4494
40a82448
DM
4495 ST.c1 = (U8)*STRING(text_node);
4496 ST.c2 =
ee9b8eae 4497 (IS_TEXTF(text_node))
40a82448 4498 ? PL_fold[ST.c1]
ee9b8eae 4499 : (IS_TEXTFL(text_node))
40a82448
DM
4500 ? PL_fold_locale[ST.c1]
4501 : ST.c1;
c277df42 4502 }
c277df42 4503 }
40a82448
DM
4504 }
4505
4506 DEBUG_EXECUTE_r(
4507 PerlIO_printf(Perl_debug_log,
4508 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4509 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4510 "", (IV)ST.count)
4511 );
4512 if (ST.c1 != CHRTEST_VOID
4513 && UCHARAT(PL_reginput) != ST.c1
4514 && UCHARAT(PL_reginput) != ST.c2)
4515 {
4516 /* simulate B failing */
3e901dc0
YO
4517 DEBUG_OPTIMISE_r(
4518 PerlIO_printf(Perl_debug_log,
4519 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4520 (int)(REPORT_CODE_OFF+(depth*2)),"",
4521 (IV)ST.c1,(IV)ST.c2
4522 ));
40a82448
DM
4523 state_num = CURLYM_B_fail;
4524 goto reenter_switch;
4525 }
4526
4527 if (ST.me->flags) {
4528 /* mark current A as captured */
4529 I32 paren = ST.me->flags;
4530 if (ST.count) {
f0ab9afb 4531 PL_regoffs[paren].start
40a82448 4532 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
f0ab9afb 4533 PL_regoffs[paren].end = PL_reginput - PL_bostr;
0a4db386 4534 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4535 }
40a82448 4536 else
f0ab9afb 4537 PL_regoffs[paren].end = -1;
0a4db386 4538 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4539 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4540 {
4541 if (ST.count)
4542 goto fake_end;
4543 else
4544 sayNO;
4545 }
c277df42 4546 }
0a4db386 4547
40a82448 4548 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4549 /* NOTREACHED */
40a82448
DM
4550
4551 case CURLYM_B_fail: /* just failed to match a B */
4552 REGCP_UNWIND(ST.cp);
4553 if (ST.minmod) {
4554 if (ST.count == ARG2(ST.me) /* max */)
4555 sayNO;
4556 goto curlym_do_A; /* try to match a further A */
4557 }
4558 /* backtrack one A */
4559 if (ST.count == ARG1(ST.me) /* min */)
4560 sayNO;
4561 ST.count--;
4562 locinput = HOPc(locinput, -ST.alen);
4563 goto curlym_do_B; /* try to match B */
4564
c255a977
DM
4565#undef ST
4566#define ST st->u.curly
40a82448 4567
c255a977
DM
4568#define CURLY_SETPAREN(paren, success) \
4569 if (paren) { \
4570 if (success) { \
f0ab9afb
NC
4571 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4572 PL_regoffs[paren].end = locinput - PL_bostr; \
0a4db386 4573 *PL_reglastcloseparen = paren; \
c255a977
DM
4574 } \
4575 else \
f0ab9afb 4576 PL_regoffs[paren].end = -1; \
c255a977
DM
4577 }
4578
4579 case STAR: /* /A*B/ where A is width 1 */
4580 ST.paren = 0;
4581 ST.min = 0;
4582 ST.max = REG_INFTY;
a0d0e21e
LW
4583 scan = NEXTOPER(scan);
4584 goto repeat;
c255a977
DM
4585 case PLUS: /* /A+B/ where A is width 1 */
4586 ST.paren = 0;
4587 ST.min = 1;
4588 ST.max = REG_INFTY;
c277df42 4589 scan = NEXTOPER(scan);
c255a977
DM
4590 goto repeat;
4591 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4592 ST.paren = scan->flags; /* Which paren to set */
4593 if (ST.paren > PL_regsize)
4594 PL_regsize = ST.paren;
3b6647e0 4595 if (ST.paren > *PL_reglastparen)
c255a977
DM
4596 *PL_reglastparen = ST.paren;
4597 ST.min = ARG1(scan); /* min to match */
4598 ST.max = ARG2(scan); /* max to match */
0a4db386 4599 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4600 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4601 ST.min=1;
4602 ST.max=1;
4603 }
c255a977
DM
4604 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4605 goto repeat;
4606 case CURLY: /* /A{m,n}B/ where A is width 1 */
4607 ST.paren = 0;
4608 ST.min = ARG1(scan); /* min to match */
4609 ST.max = ARG2(scan); /* max to match */
4610 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4611 repeat:
a0d0e21e
LW
4612 /*
4613 * Lookahead to avoid useless match attempts
4614 * when we know what character comes next.
c255a977 4615 *
5f80c4cf
JP
4616 * Used to only do .*x and .*?x, but now it allows
4617 * for )'s, ('s and (?{ ... })'s to be in the way
4618 * of the quantifier and the EXACT-like node. -- japhy
4619 */
4620
c255a977
DM
4621 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4622 sayNO;
cca55fe3 4623 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4624 U8 *s;
4625 regnode *text_node = next;
4626
3dab1dad
YO
4627 if (! HAS_TEXT(text_node))
4628 FIND_NEXT_IMPT(text_node);
5f80c4cf 4629
9e137952 4630 if (! HAS_TEXT(text_node))
c255a977 4631 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4632 else {
ee9b8eae 4633 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 4634 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4635 goto assume_ok_easy;
cca55fe3 4636 }
be8e71aa
YO
4637 else
4638 s = (U8*)STRING(text_node);
ee9b8eae
YO
4639
4640 /* Currently we only get here when
4641
4642 PL_rekind[OP(text_node)] == EXACT
4643
4644 if this changes back then the macro for IS_TEXT and
4645 friends need to change. */
5f80c4cf 4646 if (!UTF) {
c255a977 4647 ST.c2 = ST.c1 = *s;
ee9b8eae 4648 if (IS_TEXTF(text_node))
c255a977 4649 ST.c2 = PL_fold[ST.c1];
ee9b8eae 4650 else if (IS_TEXTFL(text_node))
c255a977 4651 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4652 }
5f80c4cf 4653 else { /* UTF */
ee9b8eae 4654 if (IS_TEXTF(text_node)) {
a2a2844f 4655 STRLEN ulen1, ulen2;
89ebb4a3
JH
4656 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4657 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4658
4659 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4660 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4661#ifdef EBCDIC
4662 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4663 ckWARN(WARN_UTF8) ?
4664 0 : UTF8_ALLOW_ANY);
4665 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4666 ckWARN(WARN_UTF8) ?
4667 0 : UTF8_ALLOW_ANY);
4668#else
c255a977 4669 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4670 uniflags);
c255a977 4671 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4672 uniflags);
4673#endif
5f80c4cf
JP
4674 }
4675 else {
c255a977 4676 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4677 uniflags);
5f80c4cf 4678 }
1aa99e6b
IH
4679 }
4680 }
bbce6d69 4681 }
a0d0e21e 4682 else
c255a977 4683 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4684 assume_ok_easy:
c255a977
DM
4685
4686 ST.A = scan;
4687 ST.B = next;
3280af22 4688 PL_reginput = locinput;
24d3c4a9
DM
4689 if (minmod) {
4690 minmod = 0;
e2e6a0f1 4691 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 4692 sayNO;
c255a977 4693 ST.count = ST.min;
a0ed51b3 4694 locinput = PL_reginput;
c255a977
DM
4695 REGCP_SET(ST.cp);
4696 if (ST.c1 == CHRTEST_VOID)
4697 goto curly_try_B_min;
4698
4699 ST.oldloc = locinput;
4700
4701 /* set ST.maxpos to the furthest point along the
4702 * string that could possibly match */
4703 if (ST.max == REG_INFTY) {
4704 ST.maxpos = PL_regeol - 1;
4705 if (do_utf8)
4706 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4707 ST.maxpos--;
4708 }
4709 else if (do_utf8) {
4710 int m = ST.max - ST.min;
4711 for (ST.maxpos = locinput;
4712 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4713 ST.maxpos += UTF8SKIP(ST.maxpos);
4714 }
4715 else {
4716 ST.maxpos = locinput + ST.max - ST.min;
4717 if (ST.maxpos >= PL_regeol)
4718 ST.maxpos = PL_regeol - 1;
4719 }
4720 goto curly_try_B_min_known;
4721
4722 }
4723 else {
e2e6a0f1 4724 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
4725 locinput = PL_reginput;
4726 if (ST.count < ST.min)
4727 sayNO;
4728 if ((ST.count > ST.min)
4729 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4730 {
4731 /* A{m,n} must come at the end of the string, there's
4732 * no point in backing off ... */
4733 ST.min = ST.count;
4734 /* ...except that $ and \Z can match before *and* after
4735 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4736 We may back off by one in this case. */
4737 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4738 ST.min--;
4739 }
4740 REGCP_SET(ST.cp);
4741 goto curly_try_B_max;
4742 }
4743 /* NOTREACHED */
4744
4745
4746 case CURLY_B_min_known_fail:
4747 /* failed to find B in a non-greedy match where c1,c2 valid */
4748 if (ST.paren && ST.count)
f0ab9afb 4749 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4750
4751 PL_reginput = locinput; /* Could be reset... */
4752 REGCP_UNWIND(ST.cp);
4753 /* Couldn't or didn't -- move forward. */
4754 ST.oldloc = locinput;
4755 if (do_utf8)
4756 locinput += UTF8SKIP(locinput);
4757 else
4758 locinput++;
4759 ST.count++;
4760 curly_try_B_min_known:
4761 /* find the next place where 'B' could work, then call B */
4762 {
4763 int n;
4764 if (do_utf8) {
4765 n = (ST.oldloc == locinput) ? 0 : 1;
4766 if (ST.c1 == ST.c2) {
4767 STRLEN len;
4768 /* set n to utf8_distance(oldloc, locinput) */
4769 while (locinput <= ST.maxpos &&
4770 utf8n_to_uvchr((U8*)locinput,
4771 UTF8_MAXBYTES, &len,
4772 uniflags) != (UV)ST.c1) {
4773 locinput += len;
4774 n++;
4775 }
1aa99e6b
IH
4776 }
4777 else {
c255a977
DM
4778 /* set n to utf8_distance(oldloc, locinput) */
4779 while (locinput <= ST.maxpos) {
4780 STRLEN len;
4781 const UV c = utf8n_to_uvchr((U8*)locinput,
4782 UTF8_MAXBYTES, &len,
4783 uniflags);
4784 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4785 break;
4786 locinput += len;
4787 n++;
1aa99e6b 4788 }
0fe9bf95
IZ
4789 }
4790 }
c255a977
DM
4791 else {
4792 if (ST.c1 == ST.c2) {
4793 while (locinput <= ST.maxpos &&
4794 UCHARAT(locinput) != ST.c1)
4795 locinput++;
bbce6d69 4796 }
c255a977
DM
4797 else {
4798 while (locinput <= ST.maxpos
4799 && UCHARAT(locinput) != ST.c1
4800 && UCHARAT(locinput) != ST.c2)
4801 locinput++;
a0ed51b3 4802 }
c255a977
DM
4803 n = locinput - ST.oldloc;
4804 }
4805 if (locinput > ST.maxpos)
4806 sayNO;
4807 /* PL_reginput == oldloc now */
4808 if (n) {
4809 ST.count += n;
e2e6a0f1 4810 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 4811 sayNO;
a0d0e21e 4812 }
c255a977
DM
4813 PL_reginput = locinput;
4814 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4815 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4816 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4817 goto fake_end;
4818 }
c255a977 4819 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4820 }
c255a977
DM
4821 /* NOTREACHED */
4822
4823
4824 case CURLY_B_min_fail:
4825 /* failed to find B in a non-greedy match where c1,c2 invalid */
4826 if (ST.paren && ST.count)
f0ab9afb 4827 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4828
4829 REGCP_UNWIND(ST.cp);
4830 /* failed -- move forward one */
4831 PL_reginput = locinput;
e2e6a0f1 4832 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 4833 ST.count++;
a0ed51b3 4834 locinput = PL_reginput;
c255a977
DM
4835 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4836 ST.count > 0)) /* count overflow ? */
15272685 4837 {
c255a977
DM
4838 curly_try_B_min:
4839 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4840 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4841 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4842 goto fake_end;
4843 }
c255a977 4844 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4845 }
4846 }
4633a7c4 4847 sayNO;
c255a977
DM
4848 /* NOTREACHED */
4849
4850
4851 curly_try_B_max:
4852 /* a successful greedy match: now try to match B */
40d049e4 4853 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4854 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
4855 goto fake_end;
4856 }
c255a977
DM
4857 {
4858 UV c = 0;
4859 if (ST.c1 != CHRTEST_VOID)
4860 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4861 UTF8_MAXBYTES, 0, uniflags)
466787eb 4862 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4863 /* If it could work, try it. */
4864 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4865 CURLY_SETPAREN(ST.paren, ST.count);
4866 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4867 /* NOTREACHED */
4868 }
4869 }
4870 /* FALL THROUGH */
4871 case CURLY_B_max_fail:
4872 /* failed to find B in a greedy match */
4873 if (ST.paren && ST.count)
f0ab9afb 4874 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4875
4876 REGCP_UNWIND(ST.cp);
4877 /* back up. */
4878 if (--ST.count < ST.min)
4879 sayNO;
4880 PL_reginput = locinput = HOPc(locinput, -1);
4881 goto curly_try_B_max;
4882
4883#undef ST
4884
a0d0e21e 4885 case END:
6bda09f9 4886 fake_end:
faec1544
DM
4887 if (cur_eval) {
4888 /* we've just finished A in /(??{A})B/; now continue with B */
4889 I32 tmpix;
faec1544
DM
4890 st->u.eval.toggle_reg_flags
4891 = cur_eval->u.eval.toggle_reg_flags;
4892 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4893
288b8c02
NC
4894 st->u.eval.prev_rex = rex_sv; /* inner */
4895 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
4896 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4897 rexi = RXi_GET(rex);
faec1544 4898 cur_curlyx = cur_eval->u.eval.prev_curlyx;
288b8c02 4899 ReREFCNT_inc(rex_sv);
faec1544
DM
4900 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4901 REGCP_SET(st->u.eval.lastcp);
4902 PL_reginput = locinput;
4903
4904 /* Restore parens of the outer rex without popping the
4905 * savestack */
4906 tmpix = PL_savestack_ix;
4907 PL_savestack_ix = cur_eval->u.eval.lastcp;
4908 regcppop(rex);
4909 PL_savestack_ix = tmpix;
4910
4911 st->u.eval.prev_eval = cur_eval;
4912 cur_eval = cur_eval->u.eval.prev_eval;
4913 DEBUG_EXECUTE_r(
2a49f0f5
JH
4914 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4915 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
4916 if ( nochange_depth )
4917 nochange_depth--;
4918
4919 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
4920 st->u.eval.prev_eval->u.eval.B); /* match B */
4921 }
4922
3b0527fe 4923 if (locinput < reginfo->till) {
a3621e74 4924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4925 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4926 PL_colors[4],
4927 (long)(locinput - PL_reg_starttry),
3b0527fe 4928 (long)(reginfo->till - PL_reg_starttry),
7821416a 4929 PL_colors[5]));
58e23c8d 4930
262b90c4 4931 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4932 }
4933 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4934 sayYES; /* Success! */
dad79028
DM
4935
4936 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4937 DEBUG_EXECUTE_r(
4938 PerlIO_printf(Perl_debug_log,
4939 "%*s %ssubpattern success...%s\n",
5bc10b2c 4940 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4941 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4942 sayYES; /* Success! */
dad79028 4943
40a82448
DM
4944#undef ST
4945#define ST st->u.ifmatch
4946
4947 case SUSPEND: /* (?>A) */
4948 ST.wanted = 1;
9fe1d20c 4949 PL_reginput = locinput;
9041c2e3 4950 goto do_ifmatch;
dad79028 4951
40a82448
DM
4952 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4953 ST.wanted = 0;
dad79028
DM
4954 goto ifmatch_trivial_fail_test;
4955
40a82448
DM
4956 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4957 ST.wanted = 1;
dad79028 4958 ifmatch_trivial_fail_test:
a0ed51b3 4959 if (scan->flags) {
52657f30 4960 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4961 if (!s) {
4962 /* trivial fail */
24d3c4a9
DM
4963 if (logical) {
4964 logical = 0;
4965 sw = 1 - (bool)ST.wanted;
dad79028 4966 }
40a82448 4967 else if (ST.wanted)
dad79028
DM
4968 sayNO;
4969 next = scan + ARG(scan);
4970 if (next == scan)
4971 next = NULL;
4972 break;
4973 }
efb30f32 4974 PL_reginput = s;
a0ed51b3
LW
4975 }
4976 else
4977 PL_reginput = locinput;
4978
c277df42 4979 do_ifmatch:
40a82448 4980 ST.me = scan;
24d3c4a9 4981 ST.logical = logical;
40a82448
DM
4982 /* execute body of (?...A) */
4983 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4984 /* NOTREACHED */
4985
4986 case IFMATCH_A_fail: /* body of (?...A) failed */
4987 ST.wanted = !ST.wanted;
4988 /* FALL THROUGH */
4989
4990 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
4991 if (ST.logical) {
4992 sw = (bool)ST.wanted;
40a82448
DM
4993 }
4994 else if (!ST.wanted)
4995 sayNO;
4996
4997 if (OP(ST.me) == SUSPEND)
4998 locinput = PL_reginput;
4999 else {
5000 locinput = PL_reginput = st->locinput;
5001 nextchr = UCHARAT(locinput);
5002 }
5003 scan = ST.me + ARG(ST.me);
5004 if (scan == ST.me)
5005 scan = NULL;
5006 continue; /* execute B */
5007
5008#undef ST
dad79028 5009
c277df42 5010 case LONGJMP:
c277df42
IZ
5011 next = scan + ARG(scan);
5012 if (next == scan)
5013 next = NULL;
a0d0e21e 5014 break;
54612592 5015 case COMMIT:
e2e6a0f1
YO
5016 reginfo->cutpoint = PL_regeol;
5017 /* FALLTHROUGH */
5d458dd8 5018 case PRUNE:
24b23f37 5019 PL_reginput = locinput;
e2e6a0f1 5020 if (!scan->flags)
ad64d0ec 5021 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592
YO
5022 PUSH_STATE_GOTO(COMMIT_next,next);
5023 /* NOTREACHED */
5024 case COMMIT_next_fail:
5025 no_final = 1;
5026 /* FALLTHROUGH */
7f69552c
YO
5027 case OPFAIL:
5028 sayNO;
e2e6a0f1
YO
5029 /* NOTREACHED */
5030
5031#define ST st->u.mark
5032 case MARKPOINT:
5033 ST.prev_mark = mark_state;
5d458dd8 5034 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5035 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5036 mark_state = st;
5037 ST.mark_loc = PL_reginput = locinput;
5038 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5039 /* NOTREACHED */
5040 case MARKPOINT_next:
5041 mark_state = ST.prev_mark;
5042 sayYES;
5043 /* NOTREACHED */
5044 case MARKPOINT_next_fail:
5d458dd8 5045 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5046 {
5047 if (ST.mark_loc > startpoint)
5048 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5049 popmark = NULL; /* we found our mark */
5050 sv_commit = ST.mark_name;
5051
5052 DEBUG_EXECUTE_r({
5d458dd8 5053 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5054 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5055 REPORT_CODE_OFF+depth*2, "",
be2597df 5056 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5057 });
5058 }
5059 mark_state = ST.prev_mark;
5d458dd8
YO
5060 sv_yes_mark = mark_state ?
5061 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
5062 sayNO;
5063 /* NOTREACHED */
5d458dd8
YO
5064 case SKIP:
5065 PL_reginput = locinput;
5066 if (scan->flags) {
2bf803e2 5067 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5068 ST.mark_name = NULL;
e2e6a0f1 5069 ST.mark_loc = locinput;
5d458dd8
YO
5070 PUSH_STATE_GOTO(SKIP_next,next);
5071 } else {
2bf803e2 5072 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5073 otherwise do nothing. Meaning we need to scan
5074 */
5075 regmatch_state *cur = mark_state;
ad64d0ec 5076 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5077
5078 while (cur) {
5079 if ( sv_eq( cur->u.mark.mark_name,
5080 find ) )
5081 {
5082 ST.mark_name = find;
5083 PUSH_STATE_GOTO( SKIP_next, next );
5084 }
5085 cur = cur->u.mark.prev_mark;
5086 }
e2e6a0f1 5087 }
2bf803e2 5088 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5089 break;
5090 case SKIP_next_fail:
5091 if (ST.mark_name) {
5092 /* (*CUT:NAME) - Set up to search for the name as we
5093 collapse the stack*/
5094 popmark = ST.mark_name;
5095 } else {
5096 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5097 if (ST.mark_loc > startpoint)
5098 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5099 /* but we set sv_commit to latest mark_name if there
5100 is one so they can test to see how things lead to this
5101 cut */
5102 if (mark_state)
5103 sv_commit=mark_state->u.mark.mark_name;
5104 }
e2e6a0f1
YO
5105 no_final = 1;
5106 sayNO;
5107 /* NOTREACHED */
5108#undef ST
32e6a07c
YO
5109 case FOLDCHAR:
5110 n = ARG(scan);
81d4fa0f 5111 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
e64b1bd1
YO
5112 locinput += ln;
5113 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5114 sayNO;
5115 } else {
5116 U8 folded[UTF8_MAXBYTES_CASE+1];
5117 STRLEN foldlen;
5118 const char * const l = locinput;
5119 char *e = PL_regeol;
5120 to_uni_fold(n, folded, &foldlen);
5121
59fe32ea 5122 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
e64b1bd1 5123 l, &e, 0, do_utf8)) {
32e6a07c 5124 sayNO;
e64b1bd1
YO
5125 }
5126 locinput = e;
32e6a07c
YO
5127 }
5128 nextchr = UCHARAT(locinput);
5129 break;
e1d1eefb
YO
5130 case LNBREAK:
5131 if ((n=is_LNBREAK(locinput,do_utf8))) {
5132 locinput += n;
5133 nextchr = UCHARAT(locinput);
5134 } else
5135 sayNO;
5136 break;
5137
5138#define CASE_CLASS(nAmE) \
5139 case nAmE: \
5140 if ((n=is_##nAmE(locinput,do_utf8))) { \
5141 locinput += n; \
5142 nextchr = UCHARAT(locinput); \
5143 } else \
5144 sayNO; \
5145 break; \
5146 case N##nAmE: \
5147 if ((n=is_##nAmE(locinput,do_utf8))) { \
5148 sayNO; \
5149 } else { \
5150 locinput += UTF8SKIP(locinput); \
5151 nextchr = UCHARAT(locinput); \
5152 } \
5153 break
5154
5155 CASE_CLASS(VERTWS);
5156 CASE_CLASS(HORIZWS);
5157#undef CASE_CLASS
5158
a0d0e21e 5159 default:
b900a521 5160 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5161 PTR2UV(scan), OP(scan));
cea2e8a9 5162 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5163
5164 } /* end switch */
95b24440 5165
5d458dd8
YO
5166 /* switch break jumps here */
5167 scan = next; /* prepare to execute the next op and ... */
5168 continue; /* ... jump back to the top, reusing st */
95b24440
DM
5169 /* NOTREACHED */
5170
40a82448
DM
5171 push_yes_state:
5172 /* push a state that backtracks on success */
5173 st->u.yes.prev_yes_state = yes_state;
5174 yes_state = st;
5175 /* FALL THROUGH */
5176 push_state:
5177 /* push a new regex state, then continue at scan */
5178 {
5179 regmatch_state *newst;
5180
24b23f37
YO
5181 DEBUG_STACK_r({
5182 regmatch_state *cur = st;
5183 regmatch_state *curyes = yes_state;
5184 int curd = depth;
5185 regmatch_slab *slab = PL_regmatch_slab;
5186 for (;curd > -1;cur--,curd--) {
5187 if (cur < SLAB_FIRST(slab)) {
5188 slab = slab->prev;
5189 cur = SLAB_LAST(slab);
5190 }
5191 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5192 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5193 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5194 (curyes == cur) ? "yes" : ""
5195 );
5196 if (curyes == cur)
5197 curyes = cur->u.yes.prev_yes_state;
5198 }
5199 } else
5200 DEBUG_STATE_pp("push")
5201 );
40a82448 5202 depth++;
40a82448
DM
5203 st->locinput = locinput;
5204 newst = st+1;
5205 if (newst > SLAB_LAST(PL_regmatch_slab))
5206 newst = S_push_slab(aTHX);
5207 PL_regmatch_state = newst;
786e8c11 5208
40a82448
DM
5209 locinput = PL_reginput;
5210 nextchr = UCHARAT(locinput);
5211 st = newst;
5212 continue;
5213 /* NOTREACHED */
5214 }
a0d0e21e 5215 }
a687059c 5216
a0d0e21e
LW
5217 /*
5218 * We get here only if there's trouble -- normally "case END" is
5219 * the terminating point.
5220 */
cea2e8a9 5221 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5222 /*NOTREACHED*/
4633a7c4
LW
5223 sayNO;
5224
262b90c4 5225yes:
77cb431f
DM
5226 if (yes_state) {
5227 /* we have successfully completed a subexpression, but we must now
5228 * pop to the state marked by yes_state and continue from there */
77cb431f 5229 assert(st != yes_state);
5bc10b2c
DM
5230#ifdef DEBUGGING
5231 while (st != yes_state) {
5232 st--;
5233 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5234 PL_regmatch_slab = PL_regmatch_slab->prev;
5235 st = SLAB_LAST(PL_regmatch_slab);
5236 }
e2e6a0f1 5237 DEBUG_STATE_r({
54612592
YO
5238 if (no_final) {
5239 DEBUG_STATE_pp("pop (no final)");
5240 } else {
5241 DEBUG_STATE_pp("pop (yes)");
5242 }
e2e6a0f1 5243 });
5bc10b2c
DM
5244 depth--;
5245 }
5246#else
77cb431f
DM
5247 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5248 || yes_state > SLAB_LAST(PL_regmatch_slab))
5249 {
5250 /* not in this slab, pop slab */
5251 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5252 PL_regmatch_slab = PL_regmatch_slab->prev;
5253 st = SLAB_LAST(PL_regmatch_slab);
5254 }
5255 depth -= (st - yes_state);
5bc10b2c 5256#endif
77cb431f
DM
5257 st = yes_state;
5258 yes_state = st->u.yes.prev_yes_state;
5259 PL_regmatch_state = st;
24b23f37 5260
5d458dd8
YO
5261 if (no_final) {
5262 locinput= st->locinput;
5263 nextchr = UCHARAT(locinput);
5264 }
54612592 5265 state_num = st->resume_state + no_final;
24d3c4a9 5266 goto reenter_switch;
77cb431f
DM
5267 }
5268
a3621e74 5269 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5270 PL_colors[4], PL_colors[5]));
02db2b7b 5271
19b95bf0
DM
5272 if (PL_reg_eval_set) {
5273 /* each successfully executed (?{...}) block does the equivalent of
5274 * local $^R = do {...}
5275 * When popping the save stack, all these locals would be undone;
5276 * bypass this by setting the outermost saved $^R to the latest
5277 * value */
5278 if (oreplsv != GvSV(PL_replgv))
5279 sv_setsv(oreplsv, GvSV(PL_replgv));
5280 }
95b24440 5281 result = 1;
aa283a38 5282 goto final_exit;
4633a7c4
LW
5283
5284no:
a3621e74 5285 DEBUG_EXECUTE_r(
7821416a 5286 PerlIO_printf(Perl_debug_log,
786e8c11 5287 "%*s %sfailed...%s\n",
5bc10b2c 5288 REPORT_CODE_OFF+depth*2, "",
786e8c11 5289 PL_colors[4], PL_colors[5])
7821416a 5290 );
aa283a38 5291
262b90c4 5292no_silent:
54612592
YO
5293 if (no_final) {
5294 if (yes_state) {
5295 goto yes;
5296 } else {
5297 goto final_exit;
5298 }
5299 }
aa283a38
DM
5300 if (depth) {
5301 /* there's a previous state to backtrack to */
40a82448
DM
5302 st--;
5303 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5304 PL_regmatch_slab = PL_regmatch_slab->prev;
5305 st = SLAB_LAST(PL_regmatch_slab);
5306 }
5307 PL_regmatch_state = st;
40a82448
DM
5308 locinput= st->locinput;
5309 nextchr = UCHARAT(locinput);
5310
5bc10b2c
DM
5311 DEBUG_STATE_pp("pop");
5312 depth--;
262b90c4
DM
5313 if (yes_state == st)
5314 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5315
24d3c4a9
DM
5316 state_num = st->resume_state + 1; /* failure = success + 1 */
5317 goto reenter_switch;
95b24440 5318 }
24d3c4a9 5319 result = 0;
aa283a38 5320
262b90c4 5321 final_exit:
bbe252da 5322 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5323 SV *sv_err = get_sv("REGERROR", 1);
5324 SV *sv_mrk = get_sv("REGMARK", 1);
5325 if (result) {
e2e6a0f1 5326 sv_commit = &PL_sv_no;
5d458dd8
YO
5327 if (!sv_yes_mark)
5328 sv_yes_mark = &PL_sv_yes;
5329 } else {
5330 if (!sv_commit)
5331 sv_commit = &PL_sv_yes;
5332 sv_yes_mark = &PL_sv_no;
5333 }
5334 sv_setsv(sv_err, sv_commit);
5335 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5336 }
19b95bf0 5337
2f554ef7
DM
5338 /* clean up; in particular, free all slabs above current one */
5339 LEAVE_SCOPE(oldsave);
5d9a96ca 5340
95b24440 5341 return result;
a687059c
LW
5342}
5343
5344/*
5345 - regrepeat - repeatedly match something simple, report how many
5346 */
5347/*
5348 * [This routine now assumes that it will only match on things of length 1.
5349 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5350 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5351 */
76e3520e 5352STATIC I32
e2e6a0f1 5353S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5354{
27da23d5 5355 dVAR;
a0d0e21e 5356 register char *scan;
a0d0e21e 5357 register I32 c;
3280af22 5358 register char *loceol = PL_regeol;
a0ed51b3 5359 register I32 hardcount = 0;
53c4c00c 5360 register bool do_utf8 = PL_reg_match_utf8;
4f55667c
SP
5361#ifndef DEBUGGING
5362 PERL_UNUSED_ARG(depth);
5363#endif
a0d0e21e 5364
7918f24d
NC
5365 PERL_ARGS_ASSERT_REGREPEAT;
5366
3280af22 5367 scan = PL_reginput;
faf11cac
HS
5368 if (max == REG_INFTY)
5369 max = I32_MAX;
5370 else if (max < loceol - scan)
7f596f4c 5371 loceol = scan + max;
a0d0e21e 5372 switch (OP(p)) {
22c35a8c 5373 case REG_ANY:
1aa99e6b 5374 if (do_utf8) {
ffc61ed2 5375 loceol = PL_regeol;
1aa99e6b 5376 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5377 scan += UTF8SKIP(scan);
5378 hardcount++;
5379 }
5380 } else {
5381 while (scan < loceol && *scan != '\n')
5382 scan++;
a0ed51b3
LW
5383 }
5384 break;
ffc61ed2 5385 case SANY:
def8e4ea
JH
5386 if (do_utf8) {
5387 loceol = PL_regeol;
a0804c9e 5388 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5389 scan += UTF8SKIP(scan);
5390 hardcount++;
5391 }
5392 }
5393 else
5394 scan = loceol;
a0ed51b3 5395 break;
f33976b4
DB
5396 case CANY:
5397 scan = loceol;
5398 break;
090f7165
JH
5399 case EXACT: /* length of string is 1 */
5400 c = (U8)*STRING(p);
5401 while (scan < loceol && UCHARAT(scan) == c)
5402 scan++;
bbce6d69 5403 break;
5404 case EXACTF: /* length of string is 1 */
cd439c50 5405 c = (U8)*STRING(p);
bbce6d69 5406 while (scan < loceol &&
22c35a8c 5407 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 5408 scan++;
5409 break;
5410 case EXACTFL: /* length of string is 1 */
3280af22 5411 PL_reg_flags |= RF_tainted;
cd439c50 5412 c = (U8)*STRING(p);
bbce6d69 5413 while (scan < loceol &&
22c35a8c 5414 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
5415 scan++;
5416 break;
5417 case ANYOF:
ffc61ed2
JH
5418 if (do_utf8) {
5419 loceol = PL_regeol;
cfc92286 5420 while (hardcount < max && scan < loceol &&
32fc9b6a 5421 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
5422 scan += UTF8SKIP(scan);
5423 hardcount++;
5424 }
5425 } else {
32fc9b6a 5426 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
5427 scan++;
5428 }
a0d0e21e
LW
5429 break;
5430 case ALNUM:
1aa99e6b 5431 if (do_utf8) {
ffc61ed2 5432 loceol = PL_regeol;
1a4fad37 5433 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5434 while (hardcount < max && scan < loceol &&
3568d838 5435 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5436 scan += UTF8SKIP(scan);
5437 hardcount++;
5438 }
5439 } else {
5440 while (scan < loceol && isALNUM(*scan))
5441 scan++;
a0ed51b3
LW
5442 }
5443 break;
bbce6d69 5444 case ALNUML:
3280af22 5445 PL_reg_flags |= RF_tainted;
1aa99e6b 5446 if (do_utf8) {
ffc61ed2 5447 loceol = PL_regeol;
1aa99e6b
IH
5448 while (hardcount < max && scan < loceol &&
5449 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5450 scan += UTF8SKIP(scan);
5451 hardcount++;
5452 }
5453 } else {
5454 while (scan < loceol && isALNUM_LC(*scan))
5455 scan++;
a0ed51b3
LW
5456 }
5457 break;
a0d0e21e 5458 case NALNUM:
1aa99e6b 5459 if (do_utf8) {
ffc61ed2 5460 loceol = PL_regeol;
1a4fad37 5461 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5462 while (hardcount < max && scan < loceol &&
3568d838 5463 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5464 scan += UTF8SKIP(scan);
5465 hardcount++;
5466 }
5467 } else {
5468 while (scan < loceol && !isALNUM(*scan))
5469 scan++;
a0ed51b3
LW
5470 }
5471 break;
bbce6d69 5472 case NALNUML:
3280af22 5473 PL_reg_flags |= RF_tainted;
1aa99e6b 5474 if (do_utf8) {
ffc61ed2 5475 loceol = PL_regeol;
1aa99e6b
IH
5476 while (hardcount < max && scan < loceol &&
5477 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5478 scan += UTF8SKIP(scan);
5479 hardcount++;
5480 }
5481 } else {
5482 while (scan < loceol && !isALNUM_LC(*scan))
5483 scan++;
a0ed51b3
LW
5484 }
5485 break;
a0d0e21e 5486 case SPACE:
1aa99e6b 5487 if (do_utf8) {
ffc61ed2 5488 loceol = PL_regeol;
1a4fad37 5489 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5490 while (hardcount < max && scan < loceol &&
3568d838
JH
5491 (*scan == ' ' ||
5492 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5493 scan += UTF8SKIP(scan);
5494 hardcount++;
5495 }
5496 } else {
5497 while (scan < loceol && isSPACE(*scan))
5498 scan++;
a0ed51b3
LW
5499 }
5500 break;
bbce6d69 5501 case SPACEL:
3280af22 5502 PL_reg_flags |= RF_tainted;
1aa99e6b 5503 if (do_utf8) {
ffc61ed2 5504 loceol = PL_regeol;
1aa99e6b 5505 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5506 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5507 scan += UTF8SKIP(scan);
5508 hardcount++;
5509 }
5510 } else {
5511 while (scan < loceol && isSPACE_LC(*scan))
5512 scan++;
a0ed51b3
LW
5513 }
5514 break;
a0d0e21e 5515 case NSPACE:
1aa99e6b 5516 if (do_utf8) {
ffc61ed2 5517 loceol = PL_regeol;
1a4fad37 5518 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5519 while (hardcount < max && scan < loceol &&
3568d838
JH
5520 !(*scan == ' ' ||
5521 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5522 scan += UTF8SKIP(scan);
5523 hardcount++;
5524 }
5525 } else {
5526 while (scan < loceol && !isSPACE(*scan))
5527 scan++;
a0ed51b3 5528 }
0008a298 5529 break;
bbce6d69 5530 case NSPACEL:
3280af22 5531 PL_reg_flags |= RF_tainted;
1aa99e6b 5532 if (do_utf8) {
ffc61ed2 5533 loceol = PL_regeol;
1aa99e6b 5534 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5535 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5536 scan += UTF8SKIP(scan);
5537 hardcount++;
5538 }
5539 } else {
5540 while (scan < loceol && !isSPACE_LC(*scan))
5541 scan++;
a0ed51b3
LW
5542 }
5543 break;
a0d0e21e 5544 case DIGIT:
1aa99e6b 5545 if (do_utf8) {
ffc61ed2 5546 loceol = PL_regeol;
1a4fad37 5547 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5548 while (hardcount < max && scan < loceol &&
3568d838 5549 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5550 scan += UTF8SKIP(scan);
5551 hardcount++;
5552 }
5553 } else {
5554 while (scan < loceol && isDIGIT(*scan))
5555 scan++;
a0ed51b3
LW
5556 }
5557 break;
a0d0e21e 5558 case NDIGIT:
1aa99e6b 5559 if (do_utf8) {
ffc61ed2 5560 loceol = PL_regeol;
1a4fad37 5561 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5562 while (hardcount < max && scan < loceol &&
3568d838 5563 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5564 scan += UTF8SKIP(scan);
5565 hardcount++;
5566 }
5567 } else {
5568 while (scan < loceol && !isDIGIT(*scan))
5569 scan++;
a0ed51b3 5570 }
e1d1eefb
YO
5571 case LNBREAK:
5572 if (do_utf8) {
5573 loceol = PL_regeol;
5574 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5575 scan += c;
5576 hardcount++;
5577 }
5578 } else {
5579 /*
5580 LNBREAK can match two latin chars, which is ok,
5581 because we have a null terminated string, but we
5582 have to use hardcount in this situation
5583 */
5584 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5585 scan+=c;
5586 hardcount++;
5587 }
5588 }
5589 break;
5590 case HORIZWS:
5591 if (do_utf8) {
5592 loceol = PL_regeol;
5593 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5594 scan += c;
5595 hardcount++;
5596 }
5597 } else {
5598 while (scan < loceol && is_HORIZWS_latin1(scan))
5599 scan++;
5600 }
a0ed51b3 5601 break;
e1d1eefb
YO
5602 case NHORIZWS:
5603 if (do_utf8) {
5604 loceol = PL_regeol;
5605 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5606 scan += UTF8SKIP(scan);
5607 hardcount++;
5608 }
5609 } else {
5610 while (scan < loceol && !is_HORIZWS_latin1(scan))
5611 scan++;
5612
5613 }
5614 break;
5615 case VERTWS:
5616 if (do_utf8) {
5617 loceol = PL_regeol;
5618 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5619 scan += c;
5620 hardcount++;
5621 }
5622 } else {
5623 while (scan < loceol && is_VERTWS_latin1(scan))
5624 scan++;
5625
5626 }
5627 break;
5628 case NVERTWS:
5629 if (do_utf8) {
5630 loceol = PL_regeol;
5631 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5632 scan += UTF8SKIP(scan);
5633 hardcount++;
5634 }
5635 } else {
5636 while (scan < loceol && !is_VERTWS_latin1(scan))
5637 scan++;
5638
5639 }
5640 break;
5641
a0d0e21e
LW
5642 default: /* Called on something of 0 width. */
5643 break; /* So match right here or not at all. */
5644 }
a687059c 5645
a0ed51b3
LW
5646 if (hardcount)
5647 c = hardcount;
5648 else
5649 c = scan - PL_reginput;
3280af22 5650 PL_reginput = scan;
a687059c 5651
a3621e74 5652 DEBUG_r({
e68ec53f 5653 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5654 DEBUG_EXECUTE_r({
e68ec53f
YO
5655 SV * const prop = sv_newmortal();
5656 regprop(prog, prop, p);
5657 PerlIO_printf(Perl_debug_log,
be8e71aa 5658 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 5659 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5660 });
be8e71aa 5661 });
9041c2e3 5662
a0d0e21e 5663 return(c);
a687059c
LW
5664}
5665
c277df42 5666
be8e71aa 5667#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5668/*
ffc61ed2
JH
5669- regclass_swash - prepare the utf8 swash
5670*/
5671
5672SV *
32fc9b6a 5673Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5674{
97aff369 5675 dVAR;
9e55ce06
JH
5676 SV *sw = NULL;
5677 SV *si = NULL;
5678 SV *alt = NULL;
f8fc2ecf
YO
5679 RXi_GET_DECL(prog,progi);
5680 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 5681
7918f24d
NC
5682 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5683
4f639d21 5684 if (data && data->count) {
a3b680e6 5685 const U32 n = ARG(node);
ffc61ed2 5686
4f639d21 5687 if (data->what[n] == 's') {
ad64d0ec
NC
5688 SV * const rv = MUTABLE_SV(data->data[n]);
5689 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 5690 SV **const ary = AvARRAY(av);
9e55ce06 5691 SV **a, **b;
9041c2e3 5692
711a919c 5693 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5694 * documentation of these array elements. */
5695
b11f357e 5696 si = *ary;
fe5bfecd
JH
5697 a = SvROK(ary[1]) ? &ary[1] : NULL;
5698 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
b11f357e 5699
ffc61ed2
JH
5700 if (a)
5701 sw = *a;
5702 else if (si && doinit) {
5703 sw = swash_init("utf8", "", si, 1, 0);
5704 (void)av_store(av, 1, sw);
5705 }
9e55ce06
JH
5706 if (b)
5707 alt = *b;
ffc61ed2
JH
5708 }
5709 }
5710
9e55ce06
JH
5711 if (listsvp)
5712 *listsvp = si;
5713 if (altsvp)
5714 *altsvp = alt;
ffc61ed2
JH
5715
5716 return sw;
5717}
76234dfb 5718#endif
ffc61ed2
JH
5719
5720/*
ba7b4546 5721 - reginclass - determine if a character falls into a character class
832705d4
JH
5722
5723 The n is the ANYOF regnode, the p is the target string, lenp
5724 is pointer to the maximum length of how far to go in the p
5725 (if the lenp is zero, UTF8SKIP(p) is used),
5726 do_utf8 tells whether the target string is in UTF-8.
5727
bbce6d69 5728 */
5729
76e3520e 5730STATIC bool
32fc9b6a 5731S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5732{
27da23d5 5733 dVAR;
a3b680e6 5734 const char flags = ANYOF_FLAGS(n);
bbce6d69 5735 bool match = FALSE;
cc07378b 5736 UV c = *p;
ae9ddab8 5737 STRLEN len = 0;
9e55ce06 5738 STRLEN plen;
1aa99e6b 5739
7918f24d
NC
5740 PERL_ARGS_ASSERT_REGINCLASS;
5741
19f67299
TS
5742 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5743 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5744 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5745 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5746 if (len == (STRLEN)-1)
5747 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5748 }
bbce6d69 5749
0f0076b4 5750 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5751 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5752 if (lenp)
5753 *lenp = 0;
ffc61ed2 5754 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5755 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5756 match = TRUE;
bbce6d69 5757 }
3568d838 5758 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5759 match = TRUE;
ffc61ed2 5760 if (!match) {
9e55ce06 5761 AV *av;
32fc9b6a 5762 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5763
5764 if (sw) {
3568d838 5765 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5766 match = TRUE;
5767 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5768 if (!match && lenp && av) {
5769 I32 i;
9e55ce06 5770 for (i = 0; i <= av_len(av); i++) {
890ce7af 5771 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5772 STRLEN len;
890ce7af 5773 const char * const s = SvPV_const(sv, len);
9e55ce06 5774
061b10df 5775 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5776 *lenp = len;
5777 match = TRUE;
5778 break;
5779 }
5780 }
5781 }
5782 if (!match) {
89ebb4a3 5783 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5784 STRLEN tmplen;
5785
9e55ce06
JH
5786 to_utf8_fold(p, tmpbuf, &tmplen);
5787 if (swash_fetch(sw, tmpbuf, do_utf8))
5788 match = TRUE;
5789 }
ffc61ed2
JH
5790 }
5791 }
bbce6d69 5792 }
9e55ce06 5793 if (match && lenp && *lenp == 0)
0f0076b4 5794 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5795 }
1aa99e6b 5796 if (!match && c < 256) {
ffc61ed2
JH
5797 if (ANYOF_BITMAP_TEST(n, c))
5798 match = TRUE;
5799 else if (flags & ANYOF_FOLD) {
eb160463 5800 U8 f;
a0ed51b3 5801
ffc61ed2
JH
5802 if (flags & ANYOF_LOCALE) {
5803 PL_reg_flags |= RF_tainted;
5804 f = PL_fold_locale[c];
5805 }
5806 else
5807 f = PL_fold[c];
5808 if (f != c && ANYOF_BITMAP_TEST(n, f))
5809 match = TRUE;
5810 }
5811
5812 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5813 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5814 if (
5815 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5816 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5817 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5818 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5819 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5820 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5821 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5822 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5823 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5824 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5825 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5826 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5827 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5828 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5829 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5830 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5831 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5832 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5833 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5834 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5835 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5836 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5837 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5838 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5839 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5840 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5841 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5842 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5843 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5844 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5845 ) /* How's that for a conditional? */
5846 {
5847 match = TRUE;
5848 }
a0ed51b3 5849 }
a0ed51b3
LW
5850 }
5851
a0ed51b3
LW
5852 return (flags & ANYOF_INVERT) ? !match : match;
5853}
161b471a 5854
dfe13c55 5855STATIC U8 *
0ce71af7 5856S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5857{
97aff369 5858 dVAR;
7918f24d
NC
5859
5860 PERL_ARGS_ASSERT_REGHOP3;
5861
a0ed51b3 5862 if (off >= 0) {
1aa99e6b 5863 while (off-- && s < lim) {
ffc61ed2 5864 /* XXX could check well-formedness here */
a0ed51b3 5865 s += UTF8SKIP(s);
ffc61ed2 5866 }
a0ed51b3
LW
5867 }
5868 else {
1de06328
YO
5869 while (off++ && s > lim) {
5870 s--;
5871 if (UTF8_IS_CONTINUED(*s)) {
5872 while (s > lim && UTF8_IS_CONTINUATION(*s))
5873 s--;
a0ed51b3 5874 }
1de06328 5875 /* XXX could check well-formedness here */
a0ed51b3
LW
5876 }
5877 }
5878 return s;
5879}
161b471a 5880
f9f4320a
YO
5881#ifdef XXX_dmq
5882/* there are a bunch of places where we use two reghop3's that should
5883 be replaced with this routine. but since thats not done yet
5884 we ifdef it out - dmq
5885*/
dfe13c55 5886STATIC U8 *
1de06328
YO
5887S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5888{
5889 dVAR;
7918f24d
NC
5890
5891 PERL_ARGS_ASSERT_REGHOP4;
5892
1de06328
YO
5893 if (off >= 0) {
5894 while (off-- && s < rlim) {
5895 /* XXX could check well-formedness here */
5896 s += UTF8SKIP(s);
5897 }
5898 }
5899 else {
5900 while (off++ && s > llim) {
5901 s--;
5902 if (UTF8_IS_CONTINUED(*s)) {
5903 while (s > llim && UTF8_IS_CONTINUATION(*s))
5904 s--;
5905 }
5906 /* XXX could check well-formedness here */
5907 }
5908 }
5909 return s;
5910}
f9f4320a 5911#endif
1de06328
YO
5912
5913STATIC U8 *
0ce71af7 5914S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5915{
97aff369 5916 dVAR;
7918f24d
NC
5917
5918 PERL_ARGS_ASSERT_REGHOPMAYBE3;
5919
a0ed51b3 5920 if (off >= 0) {
1aa99e6b 5921 while (off-- && s < lim) {
ffc61ed2 5922 /* XXX could check well-formedness here */
a0ed51b3 5923 s += UTF8SKIP(s);
ffc61ed2 5924 }
a0ed51b3 5925 if (off >= 0)
3dab1dad 5926 return NULL;
a0ed51b3
LW
5927 }
5928 else {
1de06328
YO
5929 while (off++ && s > lim) {
5930 s--;
5931 if (UTF8_IS_CONTINUED(*s)) {
5932 while (s > lim && UTF8_IS_CONTINUATION(*s))
5933 s--;
a0ed51b3 5934 }
1de06328 5935 /* XXX could check well-formedness here */
a0ed51b3
LW
5936 }
5937 if (off <= 0)
3dab1dad 5938 return NULL;
a0ed51b3
LW
5939 }
5940 return s;
5941}
51371543 5942
51371543 5943static void
acfe0abc 5944restore_pos(pTHX_ void *arg)
51371543 5945{
97aff369 5946 dVAR;
097eb12c 5947 regexp * const rex = (regexp *)arg;
51371543
GS
5948 if (PL_reg_eval_set) {
5949 if (PL_reg_oldsaved) {
4f639d21
DM
5950 rex->subbeg = PL_reg_oldsaved;
5951 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5952#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5953 rex->saved_copy = PL_nrs;
ed252734 5954#endif
07bc277f 5955 RXp_MATCH_COPIED_on(rex);
51371543
GS
5956 }
5957 PL_reg_magic->mg_len = PL_reg_oldpos;
5958 PL_reg_eval_set = 0;
5959 PL_curpm = PL_reg_oldcurpm;
5960 }
5961}
33b8afdf
JH
5962
5963STATIC void
5964S_to_utf8_substr(pTHX_ register regexp *prog)
5965{
a1cac82e 5966 int i = 1;
7918f24d
NC
5967
5968 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
5969
a1cac82e
NC
5970 do {
5971 if (prog->substrs->data[i].substr
5972 && !prog->substrs->data[i].utf8_substr) {
5973 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5974 prog->substrs->data[i].utf8_substr = sv;
5975 sv_utf8_upgrade(sv);
610460f9
NC
5976 if (SvVALID(prog->substrs->data[i].substr)) {
5977 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5978 if (flags & FBMcf_TAIL) {
5979 /* Trim the trailing \n that fbm_compile added last
5980 time. */
5981 SvCUR_set(sv, SvCUR(sv) - 1);
5982 /* Whilst this makes the SV technically "invalid" (as its
5983 buffer is no longer followed by "\0") when fbm_compile()
5984 adds the "\n" back, a "\0" is restored. */
5985 }
5986 fbm_compile(sv, flags);
5987 }
a1cac82e
NC
5988 if (prog->substrs->data[i].substr == prog->check_substr)
5989 prog->check_utf8 = sv;
5990 }
5991 } while (i--);
33b8afdf
JH
5992}
5993
5994STATIC void
5995S_to_byte_substr(pTHX_ register regexp *prog)
5996{
97aff369 5997 dVAR;
a1cac82e 5998 int i = 1;
7918f24d
NC
5999
6000 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6001
a1cac82e
NC
6002 do {
6003 if (prog->substrs->data[i].utf8_substr
6004 && !prog->substrs->data[i].substr) {
6005 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6006 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9
NC
6007 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6008 const U8 flags
6009 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6010 if (flags & FBMcf_TAIL) {
6011 /* Trim the trailing \n that fbm_compile added last
6012 time. */
6013 SvCUR_set(sv, SvCUR(sv) - 1);
6014 }
6015 fbm_compile(sv, flags);
6016 }
a1cac82e
NC
6017 } else {
6018 SvREFCNT_dec(sv);
6019 sv = &PL_sv_undef;
6020 }
6021 prog->substrs->data[i].substr = sv;
6022 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6023 prog->check_substr = sv;
33b8afdf 6024 }
a1cac82e 6025 } while (i--);
33b8afdf 6026}
66610fdd
RGS
6027
6028/*
6029 * Local variables:
6030 * c-indentation-style: bsd
6031 * c-basic-offset: 4
6032 * indent-tabs-mode: t
6033 * End:
6034 *
37442d52
RGS
6035 * ex: set ts=8 sts=4 sw=4 noet:
6036 */