This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
threads 1.38
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
ce862d02 80#define RF_evaled 4 /* Did an EVAL with setting? */
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
53c4c00c 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
198#define REGCP_SET(cp) \
199 DEBUG_STATE_r( \
ab3bbdeb 200 PerlIO_printf(Perl_debug_log, \
e4f74956 201 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
202 (IV)PL_savestack_ix)); \
203 cp = PL_savestack_ix
c3464db5 204
ab3bbdeb 205#define REGCP_UNWIND(cp) \
e4f74956 206 DEBUG_STATE_r( \
ab3bbdeb 207 if (cp != PL_savestack_ix) \
e4f74956
YO
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
210 (IV)(cp), (IV)PL_savestack_ix)); \
211 regcpblow(cp)
c277df42 212
76e3520e 213STATIC char *
097eb12c 214S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 215{
97aff369 216 dVAR;
b1ce53c5 217 I32 i;
a0d0e21e 218 char *input;
b1ce53c5 219
a3621e74
YO
220 GET_RE_DEBUG_FLAGS_DECL;
221
b1ce53c5 222 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 223 i = SSPOPINT;
b1ce53c5
JH
224 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
225 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 226 input = (char *) SSPOPPTR;
a01268b5 227 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
228 *PL_reglastparen = SSPOPINT;
229 PL_regsize = SSPOPINT;
b1ce53c5
JH
230
231 /* Now restore the parentheses context. */
41123dfd
JH
232 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
233 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 234 I32 tmps;
097eb12c 235 U32 paren = (U32)SSPOPINT;
3280af22 236 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
237 PL_regstartp[paren] = SSPOPINT;
238 tmps = SSPOPINT;
3280af22
NIS
239 if (paren <= *PL_reglastparen)
240 PL_regendp[paren] = tmps;
a3621e74 241 DEBUG_EXECUTE_r(
c3464db5 242 PerlIO_printf(Perl_debug_log,
b900a521 243 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 244 (UV)paren, (IV)PL_regstartp[paren],
b900a521 245 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 246 (IV)PL_regendp[paren],
3280af22 247 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 248 );
a0d0e21e 249 }
a3621e74 250 DEBUG_EXECUTE_r(
bb7a0f54 251 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 252 PerlIO_printf(Perl_debug_log,
faccc32b 253 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 254 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
255 }
256 );
daf18116 257#if 1
dafc8851
JH
258 /* It would seem that the similar code in regtry()
259 * already takes care of this, and in fact it is in
260 * a better location to since this code can #if 0-ed out
261 * but the code in regtry() is needed or otherwise tests
262 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
263 * (as of patchlevel 7877) will fail. Then again,
264 * this code seems to be necessary or otherwise
265 * building DynaLoader will fail:
266 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
267 * --jhi */
bb7a0f54 268 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
269 if (i > PL_regsize)
270 PL_regstartp[i] = -1;
271 PL_regendp[i] = -1;
a0d0e21e 272 }
dafc8851 273#endif
a0d0e21e
LW
274 return input;
275}
276
02db2b7b 277#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 278
a687059c 279/*
e50aee73 280 * pregexec and friends
a687059c
LW
281 */
282
76234dfb 283#ifndef PERL_IN_XSUB_RE
a687059c 284/*
c277df42 285 - pregexec - match a regexp against a string
a687059c 286 */
c277df42 287I32
864dbfa3 288Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 289 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
290/* strend: pointer to null at end of string */
291/* strbeg: real beginning of string */
292/* minend: end of match must be >=minend after stringarg. */
293/* nosave: For optimizations. */
294{
295 return
9041c2e3 296 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
297 nosave ? 0 : REXEC_COPY_STR);
298}
76234dfb 299#endif
22e551b9 300
9041c2e3 301/*
cad2e5aa
JH
302 * Need to implement the following flags for reg_anch:
303 *
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * USE_INTUIT_ML
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
309 * INTUIT_ONCE_ML
310 *
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
313 */
314
315/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316
3f7c398e 317/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
319
320/* XXXX We assume that strpos is strbeg unless sv. */
321
6eb5f6b9
JH
322/* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
328
2c2d71f5
JH
329/* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
30944b6d
IZ
332 regtry() should be needed.
333
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
336 b) Fixed substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 339 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
340 string which does not contradict any of them.
341 */
2c2d71f5 342
6eb5f6b9
JH
343/* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
346
cad2e5aa
JH
347char *
348Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
350{
97aff369 351 dVAR;
b7953727 352 register I32 start_shift = 0;
cad2e5aa 353 /* Should be nonnegative! */
b7953727 354 register I32 end_shift = 0;
2c2d71f5
JH
355 register char *s;
356 register SV *check;
a1933d95 357 char *strbeg;
cad2e5aa 358 char *t;
a3b680e6 359 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 360 I32 ml_anch;
bd61b366
SS
361 register char *other_last = NULL; /* other substr checked before this */
362 char *check_at = NULL; /* check substr found at this pos */
1df70142 363 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 364#ifdef DEBUGGING
890ce7af 365 const char * const i_strpos = strpos;
30944b6d 366#endif
a3621e74
YO
367
368 GET_RE_DEBUG_FLAGS_DECL;
369
a30b2f1f 370 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 371
b8d68ded 372 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
373 PL_reg_flags |= RF_utf8;
374 }
ab3bbdeb
YO
375 DEBUG_EXECUTE_r(
376 debug_start_match(prog, do_utf8, strpos, strend,
377 "Guessing start of match for");
2a782b5b 378 );
cad2e5aa 379
c344f387
JH
380 /* CHR_DIST() would be more correct here but it makes things slow. */
381 if (prog->minlen > strend - strpos) {
a3621e74 382 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 383 "String too short... [re_intuit_start]\n"));
cad2e5aa 384 goto fail;
2c2d71f5 385 }
a1933d95 386 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 387 PL_regeol = strend;
33b8afdf
JH
388 if (do_utf8) {
389 if (!prog->check_utf8 && prog->check_substr)
390 to_utf8_substr(prog);
391 check = prog->check_utf8;
392 } else {
393 if (!prog->check_substr && prog->check_utf8)
394 to_byte_substr(prog);
395 check = prog->check_substr;
396 }
397 if (check == &PL_sv_undef) {
a3621e74 398 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
399 "Non-utf string cannot match utf check string\n"));
400 goto fail;
401 }
2c2d71f5 402 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
403 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
404 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 405 && !multiline ) ); /* Check after \n? */
cad2e5aa 406
7e25d62c
JH
407 if (!ml_anch) {
408 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
409 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 410 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
411 && sv && !SvROK(sv)
412 && (strpos != strbeg)) {
a3621e74 413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
414 goto fail;
415 }
416 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 417 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 418 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
419 I32 slen;
420
1aa99e6b 421 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
422 if (SvTAIL(check)) {
423 slen = SvCUR(check); /* >= 1 */
cad2e5aa 424
9041c2e3 425 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 426 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 428 goto fail_finish;
cad2e5aa
JH
429 }
430 /* Now should match s[0..slen-2] */
431 slen--;
3f7c398e 432 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 433 || (slen > 1
3f7c398e 434 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 435 report_neq:
a3621e74 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
437 goto fail_finish;
438 }
cad2e5aa 439 }
3f7c398e 440 else if (*SvPVX_const(check) != *s
653099ff 441 || ((slen = SvCUR(check)) > 1
3f7c398e 442 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 443 goto report_neq;
c315bfe8 444 check_at = s;
2c2d71f5 445 goto success_at_start;
7e25d62c 446 }
cad2e5aa 447 }
2c2d71f5 448 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 449 s = strpos;
2c2d71f5 450 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 451 end_shift = prog->minlen - start_shift -
653099ff 452 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 453 if (!ml_anch) {
a3b680e6 454 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 455 - (SvTAIL(check) != 0);
a3b680e6 456 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
457
458 if (end_shift < eshift)
459 end_shift = eshift;
460 }
cad2e5aa 461 }
2c2d71f5 462 else { /* Can match at random position */
cad2e5aa
JH
463 ml_anch = 0;
464 s = strpos;
2c2d71f5
JH
465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
466 /* Should be nonnegative! */
467 end_shift = prog->minlen - start_shift -
653099ff 468 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
469 }
470
2c2d71f5 471#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 472 if (end_shift < 0)
6bbae5e6 473 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
474#endif
475
2c2d71f5
JH
476 restart:
477 /* Find a possible match in the region s..strend by looking for
478 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 479 if (flags & REXEC_SCREAM) {
cad2e5aa 480 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 481 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 482
2c2d71f5
JH
483 if (PL_screamfirst[BmRARE(check)] >= 0
484 || ( BmRARE(check) == '\n'
485 && (BmPREVIOUS(check) == SvCUR(check) - 1)
486 && SvTAIL(check) ))
9041c2e3 487 s = screaminstr(sv, check,
2c2d71f5 488 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 489 else
2c2d71f5 490 goto fail_finish;
4addbd3b
HS
491 /* we may be pointing at the wrong string */
492 if (s && RX_MATCH_COPIED(prog))
3f7c398e 493 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
494 if (data)
495 *data->scream_olds = s;
496 }
f33976b4 497 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
498 s = fbm_instr((U8*)(s + start_shift),
499 (U8*)(strend - end_shift),
7fba1cd6 500 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 501 else
1aa99e6b
IH
502 s = fbm_instr(HOP3(s, start_shift, strend),
503 HOP3(strend, -end_shift, strbeg),
7fba1cd6 504 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
505
506 /* Update the count-of-usability, remove useless subpatterns,
507 unshift s. */
2c2d71f5 508
ab3bbdeb
YO
509 DEBUG_EXECUTE_r({
510 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
511 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
512 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 513 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
514 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
515 ? "anchored" : "floating"),
516 quoted,
517 RE_SV_TAIL(check),
518 (s ? " at offset " : "...\n") );
519 });
2c2d71f5
JH
520
521 if (!s)
522 goto fail_finish;
523
6eb5f6b9
JH
524 check_at = s;
525
2c2d71f5 526 /* Finish the diagnostic message */
a3621e74 527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
528
529 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
530 Start with the other substr.
531 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 532 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
533 *always* match. Probably should be marked during compile...
534 Probably it is right to do no SCREAM here...
535 */
536
33b8afdf 537 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 538 /* Take into account the "other" substring. */
2c2d71f5
JH
539 /* XXXX May be hopelessly wrong for UTF... */
540 if (!other_last)
6eb5f6b9 541 other_last = strpos;
33b8afdf 542 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
543 do_other_anchored:
544 {
890ce7af
AL
545 char * const last = HOP3c(s, -start_shift, strbeg);
546 char *last1, *last2;
be8e71aa 547 char * const saved_s = s;
33b8afdf 548 SV* must;
2c2d71f5 549
2c2d71f5
JH
550 t = s - prog->check_offset_max;
551 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 552 && (!do_utf8
0ce71af7 553 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 554 && t > strpos)))
6f207bd3 555 NOOP;
2c2d71f5
JH
556 else
557 t = strpos;
1aa99e6b 558 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
559 if (t < other_last) /* These positions already checked */
560 t = other_last;
1aa99e6b 561 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
562 if (last < last1)
563 last1 = last;
564 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
565 /* On end-of-str: see comment below. */
33b8afdf
JH
566 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
567 if (must == &PL_sv_undef) {
568 s = (char*)NULL;
a3621e74 569 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
570 }
571 else
572 s = fbm_instr(
573 (unsigned char*)t,
574 HOP3(HOP3(last1, prog->anchored_offset, strend)
575 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
576 must,
7fba1cd6 577 multiline ? FBMrf_MULTILINE : 0
33b8afdf 578 );
ab3bbdeb
YO
579 DEBUG_EXECUTE_r({
580 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
581 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
582 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 583 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
584 quoted, RE_SV_TAIL(must));
585 });
586
587
2c2d71f5
JH
588 if (!s) {
589 if (last1 >= last2) {
a3621e74 590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
591 ", giving up...\n"));
592 goto fail_finish;
593 }
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 595 ", trying floating at offset %ld...\n",
be8e71aa 596 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
597 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
598 s = HOP3c(last, 1, strend);
2c2d71f5
JH
599 goto restart;
600 }
601 else {
a3621e74 602 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 603 (long)(s - i_strpos)));
1aa99e6b
IH
604 t = HOP3c(s, -prog->anchored_offset, strbeg);
605 other_last = HOP3c(s, 1, strend);
be8e71aa 606 s = saved_s;
2c2d71f5
JH
607 if (t == strpos)
608 goto try_at_start;
2c2d71f5
JH
609 goto try_at_offset;
610 }
30944b6d 611 }
2c2d71f5
JH
612 }
613 else { /* Take into account the floating substring. */
33b8afdf 614 char *last, *last1;
be8e71aa 615 char * const saved_s = s;
33b8afdf
JH
616 SV* must;
617
618 t = HOP3c(s, -start_shift, strbeg);
619 last1 = last =
620 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
621 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
622 last = HOP3c(t, prog->float_max_offset, strend);
623 s = HOP3c(t, prog->float_min_offset, strend);
624 if (s < other_last)
625 s = other_last;
2c2d71f5 626 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
627 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
628 /* fbm_instr() takes into account exact value of end-of-str
629 if the check is SvTAIL(ed). Since false positives are OK,
630 and end-of-str is not later than strend we are OK. */
631 if (must == &PL_sv_undef) {
632 s = (char*)NULL;
a3621e74 633 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
634 }
635 else
2c2d71f5 636 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
637 (unsigned char*)last + SvCUR(must)
638 - (SvTAIL(must)!=0),
7fba1cd6 639 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
640 DEBUG_EXECUTE_r({
641 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
642 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
643 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 644 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
645 quoted, RE_SV_TAIL(must));
646 });
33b8afdf
JH
647 if (!s) {
648 if (last1 == last) {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
650 ", giving up...\n"));
651 goto fail_finish;
2c2d71f5 652 }
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 654 ", trying anchored starting at offset %ld...\n",
be8e71aa 655 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
656 other_last = last;
657 s = HOP3c(t, 1, strend);
658 goto restart;
659 }
660 else {
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
662 (long)(s - i_strpos)));
663 other_last = s; /* Fix this later. --Hugo */
be8e71aa 664 s = saved_s;
33b8afdf
JH
665 if (t == strpos)
666 goto try_at_start;
667 goto try_at_offset;
668 }
2c2d71f5 669 }
cad2e5aa 670 }
2c2d71f5
JH
671
672 t = s - prog->check_offset_max;
2c2d71f5 673 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 674 && (!do_utf8
0ce71af7 675 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 676 && t > strpos))) {
2c2d71f5
JH
677 /* Fixed substring is found far enough so that the match
678 cannot start at strpos. */
679 try_at_offset:
cad2e5aa 680 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
681 /* Eventually fbm_*() should handle this, but often
682 anchored_offset is not 0, so this check will not be wasted. */
683 /* XXXX In the code below we prefer to look for "^" even in
684 presence of anchored substrings. And we search even
685 beyond the found float position. These pessimizations
686 are historical artefacts only. */
687 find_anchor:
2c2d71f5 688 while (t < strend - prog->minlen) {
cad2e5aa 689 if (*t == '\n') {
4ee3650e 690 if (t < check_at - prog->check_offset_min) {
33b8afdf 691 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
692 /* Since we moved from the found position,
693 we definitely contradict the found anchored
30944b6d
IZ
694 substr. Due to the above check we do not
695 contradict "check" substr.
696 Thus we can arrive here only if check substr
697 is float. Redo checking for "other"=="fixed".
698 */
9041c2e3 699 strpos = t + 1;
a3621e74 700 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 701 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
702 goto do_other_anchored;
703 }
4ee3650e
GS
704 /* We don't contradict the found floating substring. */
705 /* XXXX Why not check for STCLASS? */
cad2e5aa 706 s = t + 1;
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 708 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
709 goto set_useful;
710 }
4ee3650e
GS
711 /* Position contradicts check-string */
712 /* XXXX probably better to look for check-string
713 than for "\n", so one should lower the limit for t? */
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 715 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 716 other_last = strpos = s = t + 1;
cad2e5aa
JH
717 goto restart;
718 }
719 t++;
720 }
a3621e74 721 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 722 PL_colors[0], PL_colors[1]));
2c2d71f5 723 goto fail_finish;
cad2e5aa 724 }
f5952150 725 else {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 727 PL_colors[0], PL_colors[1]));
f5952150 728 }
cad2e5aa
JH
729 s = t;
730 set_useful:
33b8afdf 731 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
732 }
733 else {
f5952150 734 /* The found string does not prohibit matching at strpos,
2c2d71f5 735 - no optimization of calling REx engine can be performed,
f5952150
GS
736 unless it was an MBOL and we are not after MBOL,
737 or a future STCLASS check will fail this. */
2c2d71f5
JH
738 try_at_start:
739 /* Even in this situation we may use MBOL flag if strpos is offset
740 wrt the start of the string. */
05b4157f 741 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 742 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
743 /* May be due to an implicit anchor of m{.*foo} */
744 && !(prog->reganch & ROPT_IMPLICIT))
745 {
cad2e5aa
JH
746 t = strpos;
747 goto find_anchor;
748 }
a3621e74 749 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 750 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 751 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 752 );
2c2d71f5 753 success_at_start:
30944b6d 754 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
755 && (do_utf8 ? (
756 prog->check_utf8 /* Could be deleted already */
757 && --BmUSEFUL(prog->check_utf8) < 0
758 && (prog->check_utf8 == prog->float_utf8)
759 ) : (
760 prog->check_substr /* Could be deleted already */
761 && --BmUSEFUL(prog->check_substr) < 0
762 && (prog->check_substr == prog->float_substr)
763 )))
66e933ab 764 {
cad2e5aa 765 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
767 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
768 if (do_utf8 ? prog->check_substr : prog->check_utf8)
769 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
770 prog->check_substr = prog->check_utf8 = NULL; /* disable */
771 prog->float_substr = prog->float_utf8 = NULL; /* clear */
772 check = NULL; /* abort */
cad2e5aa 773 s = strpos;
3cf5c195
IZ
774 /* XXXX This is a remnant of the old implementation. It
775 looks wasteful, since now INTUIT can use many
6eb5f6b9 776 other heuristics. */
cad2e5aa
JH
777 prog->reganch &= ~RE_USE_INTUIT;
778 }
779 else
780 s = strpos;
781 }
782
6eb5f6b9
JH
783 /* Last resort... */
784 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 785 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
786 /* minlen == 0 is possible if regstclass is \b or \B,
787 and the fixed substr is ''$.
788 Since minlen is already taken into account, s+1 is before strend;
789 accidentally, minlen >= 1 guaranties no false positives at s + 1
790 even for \b or \B. But (minlen? 1 : 0) below assumes that
791 regstclass does not come from lookahead... */
792 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
793 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 794 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 795 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 796 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 797 : 1);
07be1b83 798 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 799 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 800 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
801 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
802 cl_l, strend)
803 : strend);
07be1b83
YO
804 /*if (OP(prog->regstclass) == TRIE)
805 endpos++;*/
6eb5f6b9 806 t = s;
3b0527fe 807 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
808 if (!s) {
809#ifdef DEBUGGING
cbbf8932 810 const char *what = NULL;
6eb5f6b9
JH
811#endif
812 if (endpos == strend) {
a3621e74 813 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
814 "Could not match STCLASS...\n") );
815 goto fail;
816 }
a3621e74 817 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 818 "This position contradicts STCLASS...\n") );
653099ff
GS
819 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
820 goto fail;
6eb5f6b9 821 /* Contradict one of substrings */
33b8afdf
JH
822 if (prog->anchored_substr || prog->anchored_utf8) {
823 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 824 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 825 hop_and_restart:
1aa99e6b 826 s = HOP3c(t, 1, strend);
66e933ab
GS
827 if (s + start_shift + end_shift > strend) {
828 /* XXXX Should be taken into account earlier? */
a3621e74 829 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
830 "Could not match STCLASS...\n") );
831 goto fail;
832 }
5e39e1e5
HS
833 if (!check)
834 goto giveup;
a3621e74 835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 836 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
837 what, (long)(s + start_shift - i_strpos)) );
838 goto restart;
839 }
66e933ab 840 /* Have both, check_string is floating */
6eb5f6b9
JH
841 if (t + start_shift >= check_at) /* Contradicts floating=check */
842 goto retry_floating_check;
843 /* Recheck anchored substring, but not floating... */
9041c2e3 844 s = check_at;
5e39e1e5
HS
845 if (!check)
846 goto giveup;
a3621e74 847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 848 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
849 (long)(other_last - i_strpos)) );
850 goto do_other_anchored;
851 }
60e71179
GS
852 /* Another way we could have checked stclass at the
853 current position only: */
854 if (ml_anch) {
855 s = t = t + 1;
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 860 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 861 goto try_at_offset;
66e933ab 862 }
33b8afdf 863 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 864 goto fail;
6eb5f6b9
JH
865 /* Check is floating subtring. */
866 retry_floating_check:
867 t = check_at - start_shift;
a3621e74 868 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
869 goto hop_and_restart;
870 }
b7953727 871 if (t != s) {
a3621e74 872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 873 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
874 (long)(t - i_strpos), (long)(s - i_strpos))
875 );
876 }
877 else {
a3621e74 878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
879 "Does not contradict STCLASS...\n");
880 );
881 }
6eb5f6b9 882 }
5e39e1e5 883 giveup:
a3621e74 884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
885 PL_colors[4], (check ? "Guessed" : "Giving up"),
886 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 887 return s;
2c2d71f5
JH
888
889 fail_finish: /* Substring not found */
33b8afdf
JH
890 if (prog->check_substr || prog->check_utf8) /* could be removed already */
891 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 892 fail:
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 894 PL_colors[4], PL_colors[5]));
bd61b366 895 return NULL;
cad2e5aa 896}
9661b544 897
6eb5f6b9 898/* We know what class REx starts with. Try to find this position... */
3b0527fe 899/* if reginfo is NULL, its a dryrun */
07be1b83
YO
900/* annoyingly all the vars in this routine have different names from their counterparts
901 in regmatch. /grrr */
3b0527fe 902
4cadc6a9
YO
903#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
904foldlen, foldbuf, uniflags) STMT_START { \
905 switch (trie_type) { \
906 case trie_utf8_fold: \
907 if ( foldlen>0 ) { \
908 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
909 foldlen -= len; \
910 uscan += len; \
911 len=0; \
912 } else { \
913 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
914 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
915 foldlen -= UNISKIP( uvc ); \
916 uscan = foldbuf + UNISKIP( uvc ); \
917 } \
918 break; \
919 case trie_utf8: \
920 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
921 break; \
922 case trie_plain: \
923 uvc = (UV)*uc; \
924 len = 1; \
925 } \
926 \
927 if (uvc < 256) { \
928 charid = trie->charmap[ uvc ]; \
929 } \
930 else { \
931 charid = 0; \
932 if (trie->widecharmap) { \
933 SV** const svpp = hv_fetch(trie->widecharmap, \
934 (char*)&uvc, sizeof(UV), 0); \
935 if (svpp) \
936 charid = (U16)SvIV(*svpp); \
937 } \
938 } \
939} STMT_END
940
941#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
942 if ( (CoNd) \
943 && (ln == len || \
944 ibcmp_utf8(s, NULL, 0, do_utf8, \
945 m, NULL, ln, (bool)UTF)) \
946 && (!reginfo || regtry(reginfo, s)) ) \
947 goto got_it; \
948 else { \
949 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
950 uvchr_to_utf8(tmpbuf, c); \
951 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
952 if ( f != c \
953 && (f == c1 || f == c2) \
954 && (ln == foldlen || \
955 !ibcmp_utf8((char *) foldbuf, \
956 NULL, foldlen, do_utf8, \
957 m, \
958 NULL, ln, (bool)UTF)) \
959 && (!reginfo || regtry(reginfo, s)) ) \
960 goto got_it; \
961 } \
962 s += len
963
964#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
965STMT_START { \
966 while (s <= e) { \
967 if ( (CoNd) \
968 && (ln == 1 || !(OP(c) == EXACTF \
969 ? ibcmp(s, m, ln) \
970 : ibcmp_locale(s, m, ln))) \
971 && (!reginfo || regtry(reginfo, s)) ) \
972 goto got_it; \
973 s++; \
974 } \
975} STMT_END
976
977#define REXEC_FBC_UTF8_SCAN(CoDe) \
978STMT_START { \
979 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
980 CoDe \
981 s += uskip; \
982 } \
983} STMT_END
984
985#define REXEC_FBC_SCAN(CoDe) \
986STMT_START { \
987 while (s < strend) { \
988 CoDe \
989 s++; \
990 } \
991} STMT_END
992
993#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
994REXEC_FBC_UTF8_SCAN( \
995 if (CoNd) { \
996 if (tmp && (!reginfo || regtry(reginfo, s))) \
997 goto got_it; \
998 else \
999 tmp = doevery; \
1000 } \
1001 else \
1002 tmp = 1; \
1003)
1004
1005#define REXEC_FBC_CLASS_SCAN(CoNd) \
1006REXEC_FBC_SCAN( \
1007 if (CoNd) { \
1008 if (tmp && (!reginfo || regtry(reginfo, s))) \
1009 goto got_it; \
1010 else \
1011 tmp = doevery; \
1012 } \
1013 else \
1014 tmp = 1; \
1015)
1016
1017#define REXEC_FBC_TRYIT \
1018if ((!reginfo || regtry(reginfo, s))) \
1019 goto got_it
1020
1021#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1022 if (do_utf8) { \
1023 UtFpReLoAd; \
1024 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1025 } \
1026 else { \
1027 REXEC_FBC_CLASS_SCAN(CoNd); \
1028 } \
1029 break
1030
1031#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1032 PL_reg_flags |= RF_tainted; \
1033 if (do_utf8) { \
1034 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1035 } \
1036 else { \
1037 REXEC_FBC_CLASS_SCAN(CoNd); \
1038 } \
1039 break
1040
3c3eec57 1041STATIC char *
07be1b83
YO
1042S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1043 const char *strend, const regmatch_info *reginfo)
a687059c 1044{
27da23d5 1045 dVAR;
1df70142 1046 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1047 char *m;
d8093b23 1048 STRLEN ln;
5dab1207 1049 STRLEN lnc;
078c425b 1050 register STRLEN uskip;
d8093b23
G
1051 unsigned int c1;
1052 unsigned int c2;
6eb5f6b9
JH
1053 char *e;
1054 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1055 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1056
6eb5f6b9
JH
1057 /* We know what class it must start with. */
1058 switch (OP(c)) {
6eb5f6b9 1059 case ANYOF:
388cc4de 1060 if (do_utf8) {
4cadc6a9 1061 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1062 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1063 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1064 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1065 }
1066 else {
1067 while (s < strend) {
1068 STRLEN skip = 1;
1069
32fc9b6a 1070 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1071 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1072 /* The assignment of 2 is intentional:
1073 * for the folded sharp s, the skip is 2. */
1074 (skip = SHARP_S_SKIP))) {
3b0527fe 1075 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1076 goto got_it;
1077 else
1078 tmp = doevery;
1079 }
1080 else
1081 tmp = 1;
1082 s += skip;
1083 }
a0d0e21e 1084 }
6eb5f6b9 1085 break;
f33976b4 1086 case CANY:
4cadc6a9 1087 REXEC_FBC_SCAN(
3b0527fe 1088 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1089 goto got_it;
1090 else
1091 tmp = doevery;
4cadc6a9 1092 );
f33976b4 1093 break;
6eb5f6b9 1094 case EXACTF:
5dab1207
NIS
1095 m = STRING(c);
1096 ln = STR_LEN(c); /* length to match in octets/bytes */
1097 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1098 if (UTF) {
a2a2844f 1099 STRLEN ulen1, ulen2;
5dab1207 1100 U8 *sm = (U8 *) m;
89ebb4a3
JH
1101 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1102 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1103 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1104
1105 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1106 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1107
89ebb4a3 1108 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1109 0, uniflags);
89ebb4a3 1110 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1111 0, uniflags);
5dab1207
NIS
1112 lnc = 0;
1113 while (sm < ((U8 *) m + ln)) {
1114 lnc++;
1115 sm += UTF8SKIP(sm);
1116 }
1aa99e6b
IH
1117 }
1118 else {
1119 c1 = *(U8*)m;
1120 c2 = PL_fold[c1];
1121 }
6eb5f6b9
JH
1122 goto do_exactf;
1123 case EXACTFL:
5dab1207
NIS
1124 m = STRING(c);
1125 ln = STR_LEN(c);
1126 lnc = (I32) ln;
d8093b23 1127 c1 = *(U8*)m;
6eb5f6b9
JH
1128 c2 = PL_fold_locale[c1];
1129 do_exactf:
db12adc6 1130 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1131
3b0527fe 1132 if (!reginfo && e < s)
6eb5f6b9 1133 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1134
60a8b682
JH
1135 /* The idea in the EXACTF* cases is to first find the
1136 * first character of the EXACTF* node and then, if
1137 * necessary, case-insensitively compare the full
1138 * text of the node. The c1 and c2 are the first
1139 * characters (though in Unicode it gets a bit
1140 * more complicated because there are more cases
7f16dd3d
JH
1141 * than just upper and lower: one needs to use
1142 * the so-called folding case for case-insensitive
1143 * matching (called "loose matching" in Unicode).
1144 * ibcmp_utf8() will do just that. */
60a8b682 1145
1aa99e6b 1146 if (do_utf8) {
575cac57 1147 UV c, f;
89ebb4a3 1148 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1149 STRLEN len, foldlen;
4ad0818d 1150 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1151 if (c1 == c2) {
5dab1207
NIS
1152 /* Upper and lower of 1st char are equal -
1153 * probably not a "letter". */
1aa99e6b 1154 while (s <= e) {
89ebb4a3 1155 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1156 uniflags);
4cadc6a9 1157 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1158 }
09091399
JH
1159 }
1160 else {
1aa99e6b 1161 while (s <= e) {
89ebb4a3 1162 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1163 uniflags);
80aecb99 1164
60a8b682 1165 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1166 * Note that not all the possible combinations
1167 * are handled here: some of them are handled
1168 * by the standard folding rules, and some of
1169 * them (the character class or ANYOF cases)
1170 * are handled during compiletime in
1171 * regexec.c:S_regclass(). */
880bd946
JH
1172 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1173 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1174 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1175
4cadc6a9 1176 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1177 }
09091399 1178 }
1aa99e6b
IH
1179 }
1180 else {
1181 if (c1 == c2)
4cadc6a9 1182 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1183 else
4cadc6a9 1184 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1185 }
1186 break;
bbce6d69 1187 case BOUNDL:
3280af22 1188 PL_reg_flags |= RF_tainted;
bbce6d69 1189 /* FALL THROUGH */
a0d0e21e 1190 case BOUND:
ffc61ed2 1191 if (do_utf8) {
12d33761 1192 if (s == PL_bostr)
ffc61ed2
JH
1193 tmp = '\n';
1194 else {
6136c704 1195 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1196 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1197 }
1198 tmp = ((OP(c) == BOUND ?
9041c2e3 1199 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1200 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1201 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1202 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1203 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1204 isALNUM_LC_utf8((U8*)s)))
1205 {
1206 tmp = !tmp;
4cadc6a9 1207 REXEC_FBC_TRYIT;
a687059c 1208 }
4cadc6a9 1209 );
a0d0e21e 1210 }
667bb95a 1211 else {
12d33761 1212 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1213 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1214 REXEC_FBC_SCAN(
ffc61ed2
JH
1215 if (tmp ==
1216 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1217 tmp = !tmp;
4cadc6a9 1218 REXEC_FBC_TRYIT;
a0ed51b3 1219 }
4cadc6a9 1220 );
a0ed51b3 1221 }
3b0527fe 1222 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1223 goto got_it;
1224 break;
bbce6d69 1225 case NBOUNDL:
3280af22 1226 PL_reg_flags |= RF_tainted;
bbce6d69 1227 /* FALL THROUGH */
a0d0e21e 1228 case NBOUND:
ffc61ed2 1229 if (do_utf8) {
12d33761 1230 if (s == PL_bostr)
ffc61ed2
JH
1231 tmp = '\n';
1232 else {
6136c704 1233 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1234 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1235 }
1236 tmp = ((OP(c) == NBOUND ?
9041c2e3 1237 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1238 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1239 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1240 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1241 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1242 isALNUM_LC_utf8((U8*)s)))
1243 tmp = !tmp;
4cadc6a9
YO
1244 else REXEC_FBC_TRYIT;
1245 );
a0d0e21e 1246 }
667bb95a 1247 else {
12d33761 1248 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1249 tmp = ((OP(c) == NBOUND ?
1250 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1251 REXEC_FBC_SCAN(
ffc61ed2
JH
1252 if (tmp ==
1253 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1254 tmp = !tmp;
4cadc6a9
YO
1255 else REXEC_FBC_TRYIT;
1256 );
a0ed51b3 1257 }
3b0527fe 1258 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1259 goto got_it;
1260 break;
a0d0e21e 1261 case ALNUM:
4cadc6a9
YO
1262 REXEC_FBC_CSCAN_PRELOAD(
1263 LOAD_UTF8_CHARCLASS_ALNUM(),
1264 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1265 isALNUM(*s)
1266 );
bbce6d69 1267 case ALNUML:
4cadc6a9
YO
1268 REXEC_FBC_CSCAN_TAINT(
1269 isALNUM_LC_utf8((U8*)s),
1270 isALNUM_LC(*s)
1271 );
a0d0e21e 1272 case NALNUM:
4cadc6a9
YO
1273 REXEC_FBC_CSCAN_PRELOAD(
1274 LOAD_UTF8_CHARCLASS_ALNUM(),
1275 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1276 !isALNUM(*s)
1277 );
bbce6d69 1278 case NALNUML:
4cadc6a9
YO
1279 REXEC_FBC_CSCAN_TAINT(
1280 !isALNUM_LC_utf8((U8*)s),
1281 !isALNUM_LC(*s)
1282 );
a0d0e21e 1283 case SPACE:
4cadc6a9
YO
1284 REXEC_FBC_CSCAN_PRELOAD(
1285 LOAD_UTF8_CHARCLASS_SPACE(),
1286 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1287 isSPACE(*s)
1288 );
bbce6d69 1289 case SPACEL:
4cadc6a9
YO
1290 REXEC_FBC_CSCAN_TAINT(
1291 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1292 isSPACE_LC(*s)
1293 );
a0d0e21e 1294 case NSPACE:
4cadc6a9
YO
1295 REXEC_FBC_CSCAN_PRELOAD(
1296 LOAD_UTF8_CHARCLASS_SPACE(),
1297 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1298 !isSPACE(*s)
1299 );
bbce6d69 1300 case NSPACEL:
4cadc6a9
YO
1301 REXEC_FBC_CSCAN_TAINT(
1302 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1303 !isSPACE_LC(*s)
1304 );
a0d0e21e 1305 case DIGIT:
4cadc6a9
YO
1306 REXEC_FBC_CSCAN_PRELOAD(
1307 LOAD_UTF8_CHARCLASS_DIGIT(),
1308 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1309 isDIGIT(*s)
1310 );
b8c5462f 1311 case DIGITL:
4cadc6a9
YO
1312 REXEC_FBC_CSCAN_TAINT(
1313 isDIGIT_LC_utf8((U8*)s),
1314 isDIGIT_LC(*s)
1315 );
a0d0e21e 1316 case NDIGIT:
4cadc6a9
YO
1317 REXEC_FBC_CSCAN_PRELOAD(
1318 LOAD_UTF8_CHARCLASS_DIGIT(),
1319 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1320 !isDIGIT(*s)
1321 );
b8c5462f 1322 case NDIGITL:
4cadc6a9
YO
1323 REXEC_FBC_CSCAN_TAINT(
1324 !isDIGIT_LC_utf8((U8*)s),
1325 !isDIGIT_LC(*s)
1326 );
07be1b83
YO
1327 case TRIE:
1328 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1329 {
1330 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1331 trie_type = do_utf8 ?
1332 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1333 : trie_plain;
1334 /* what trie are we using right now */
1335 reg_ac_data *aho
1336 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1337 reg_trie_data *trie=aho->trie;
1338
1339 const char *last_start = strend - trie->minlen;
6148ee25 1340#ifdef DEBUGGING
07be1b83 1341 const char *real_start = s;
6148ee25 1342#endif
07be1b83 1343 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1344 SV *sv_points;
1345 U8 **points; /* map of where we were in the input string
1346 when reading a given string. For ASCII this
1347 is unnecessary overhead as the relationship
1348 is always 1:1, but for unicode, especially
1349 case folded unicode this is not true. */
f9e705e8 1350 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1351
1352 GET_RE_DEBUG_FLAGS_DECL;
1353
be8e71aa
YO
1354 /* We can't just allocate points here. We need to wrap it in
1355 * an SV so it gets freed properly if there is a croak while
1356 * running the match */
1357 ENTER;
1358 SAVETMPS;
1359 sv_points=newSV(maxlen * sizeof(U8 *));
1360 SvCUR_set(sv_points,
1361 maxlen * sizeof(U8 *));
1362 SvPOK_on(sv_points);
1363 sv_2mortal(sv_points);
1364 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1365
1366 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1367 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1368 s++;
1369 }
1370 }
1371
1372 while (s <= last_start) {
1373 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1374 U8 *uc = (U8*)s;
1375 U16 charid = 0;
1376 U32 base = 1;
1377 U32 state = 1;
1378 UV uvc = 0;
1379 STRLEN len = 0;
1380 STRLEN foldlen = 0;
1381 U8 *uscan = (U8*)NULL;
1382 U8 *leftmost = NULL;
1383
1384 U32 pointpos = 0;
1385
1386 while ( state && uc <= (U8*)strend ) {
1387 int failed=0;
1388 if (aho->states[ state ].wordnum) {
1389 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1390 if (!leftmost || lpos < leftmost)
1391 leftmost= lpos;
1392 if (base==0) break;
1393 }
1394 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1395 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1396 uvc, charid, foldlen, foldbuf, uniflags);
07be1b83
YO
1397 DEBUG_TRIE_EXECUTE_r(
1398 PerlIO_printf(Perl_debug_log,
1399 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1400 (int)((const char*)uc - real_start), charid, uvc)
1401 );
1402 uc += len;
1403
1404 do {
6148ee25 1405#ifdef DEBUGGING
07be1b83 1406 U32 word = aho->states[ state ].wordnum;
6148ee25 1407#endif
07be1b83
YO
1408 base = aho->states[ state ].trans.base;
1409
1410 DEBUG_TRIE_EXECUTE_r(
1411 PerlIO_printf( Perl_debug_log,
1412 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1413 failed ? "Fail transition to " : "",
e4f74956 1414 (UV)state, (UV)base, (UV)uvc, (UV)word)
07be1b83
YO
1415 );
1416 if ( base ) {
1417 U32 tmp;
1418 if (charid &&
1419 (base + charid > trie->uniquecharcount )
1420 && (base + charid - 1 - trie->uniquecharcount
1421 < trie->lasttrans)
1422 && trie->trans[base + charid - 1 -
1423 trie->uniquecharcount].check == state
1424 && (tmp=trie->trans[base + charid - 1 -
1425 trie->uniquecharcount ].next))
1426 {
1427 state = tmp;
1428 break;
1429 }
1430 else {
1431 failed++;
1432 if ( state == 1 )
1433 break;
1434 else
1435 state = aho->fail[state];
1436 }
1437 }
1438 else {
1439 /* we must be accepting here */
1440 failed++;
1441 break;
1442 }
1443 } while(state);
1444 if (failed) {
1445 if (leftmost)
1446 break;
1447 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1448 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1449 uc++;
1450 }
1451 }
1452 }
1453 }
1454 if ( aho->states[ state ].wordnum ) {
1455 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1456 if (!leftmost || lpos < leftmost)
1457 leftmost = lpos;
1458 }
1459 DEBUG_TRIE_EXECUTE_r(
1460 PerlIO_printf( Perl_debug_log,
1461 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1462 "All done: ",
e4f74956 1463 (UV)state, (UV)base, (UV)uvc)
07be1b83
YO
1464 );
1465 if (leftmost) {
1466 s = (char*)leftmost;
be8e71aa
YO
1467 if (!reginfo || regtry(reginfo, s)) {
1468 FREETMPS;
1469 LEAVE;
07be1b83 1470 goto got_it;
be8e71aa 1471 }
07be1b83
YO
1472 s = HOPc(s,1);
1473 } else {
1474 break;
1475 }
1476 }
be8e71aa
YO
1477 FREETMPS;
1478 LEAVE;
07be1b83
YO
1479 }
1480 break;
b3c9acc1 1481 default:
3c3eec57
GS
1482 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1483 break;
d6a28714 1484 }
6eb5f6b9
JH
1485 return 0;
1486 got_it:
1487 return s;
1488}
1489
1490/*
1491 - regexec_flags - match a regexp against a string
1492 */
1493I32
1494Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1495 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1496/* strend: pointer to null at end of string */
1497/* strbeg: real beginning of string */
1498/* minend: end of match must be >=minend after stringarg. */
1499/* data: May be used for some additional optimizations. */
1500/* nosave: For optimizations. */
1501{
97aff369 1502 dVAR;
6eb5f6b9
JH
1503 register char *s;
1504 register regnode *c;
1505 register char *startpos = stringarg;
6eb5f6b9
JH
1506 I32 minlen; /* must match at least this many chars */
1507 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1508 I32 end_shift = 0; /* Same for the end. */ /* CC */
1509 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1510 char *scream_olds = NULL;
3dab1dad 1511 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1512 const bool do_utf8 = DO_UTF8(sv);
2757e526 1513 I32 multiline;
0df25f3d 1514
3b0527fe 1515 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1516
1517 GET_RE_DEBUG_FLAGS_DECL;
1518
9d4ba2ae 1519 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1520
1521 /* Be paranoid... */
1522 if (prog == NULL || startpos == NULL) {
1523 Perl_croak(aTHX_ "NULL regexp parameter");
1524 return 0;
1525 }
1526
2757e526 1527 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1528 reginfo.prog = prog;
2757e526 1529
bac06658
JH
1530 RX_MATCH_UTF8_set(prog, do_utf8);
1531
6eb5f6b9 1532 minlen = prog->minlen;
61a36c01 1533 if (strend - startpos < minlen) {
a3621e74 1534 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1535 "String too short [regexec_flags]...\n"));
1536 goto phooey;
1aa99e6b 1537 }
6eb5f6b9 1538
6eb5f6b9
JH
1539 /* Check validity of program. */
1540 if (UCHARAT(prog->program) != REG_MAGIC) {
1541 Perl_croak(aTHX_ "corrupted regexp program");
1542 }
1543
1544 PL_reg_flags = 0;
1545 PL_reg_eval_set = 0;
1546 PL_reg_maxiter = 0;
1547
1548 if (prog->reganch & ROPT_UTF8)
1549 PL_reg_flags |= RF_utf8;
1550
1551 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1552 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1553 PL_bostr = strbeg;
3b0527fe 1554 reginfo.sv = sv;
6eb5f6b9
JH
1555
1556 /* Mark end of line for $ (and such) */
1557 PL_regeol = strend;
1558
1559 /* see how far we have to get to not match where we matched before */
3b0527fe 1560 reginfo.till = startpos+minend;
6eb5f6b9 1561
6eb5f6b9
JH
1562 /* If there is a "must appear" string, look for it. */
1563 s = startpos;
1564
3b0527fe 1565 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1566 MAGIC *mg;
1567
1568 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1569 reginfo.ganch = startpos;
6eb5f6b9
JH
1570 else if (sv && SvTYPE(sv) >= SVt_PVMG
1571 && SvMAGIC(sv)
14befaf4
DM
1572 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1573 && mg->mg_len >= 0) {
3b0527fe 1574 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1575 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1576 if (s > reginfo.ganch)
6eb5f6b9 1577 goto phooey;
3b0527fe 1578 s = reginfo.ganch;
6eb5f6b9
JH
1579 }
1580 }
1581 else /* pos() not defined */
3b0527fe 1582 reginfo.ganch = strbeg;
6eb5f6b9
JH
1583 }
1584
a0714e2c 1585 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1586 re_scream_pos_data d;
1587
1588 d.scream_olds = &scream_olds;
1589 d.scream_pos = &scream_pos;
1590 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1591 if (!s) {
a3621e74 1592 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1593 goto phooey; /* not present */
3fa9c3d7 1594 }
6eb5f6b9
JH
1595 }
1596
ab3bbdeb
YO
1597 DEBUG_EXECUTE_r(
1598 debug_start_match(prog, do_utf8, startpos, strend,
1599 "Matching");
2a782b5b 1600 );
6eb5f6b9
JH
1601
1602 /* Simplest case: anchored match need be tried only once. */
1603 /* [unless only anchor is BOL and multiline is set] */
1604 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1605 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1606 goto got_it;
7fba1cd6 1607 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1608 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1609 {
1610 char *end;
1611
1612 if (minlen)
1613 dontbother = minlen - 1;
1aa99e6b 1614 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1615 /* for multiline we only have to try after newlines */
33b8afdf 1616 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1617 if (s == startpos)
1618 goto after_try;
1619 while (1) {
3b0527fe 1620 if (regtry(&reginfo, s))
6eb5f6b9
JH
1621 goto got_it;
1622 after_try:
1623 if (s >= end)
1624 goto phooey;
1625 if (prog->reganch & RE_USE_INTUIT) {
1626 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1627 if (!s)
1628 goto phooey;
1629 }
1630 else
1631 s++;
1632 }
1633 } else {
1634 if (s > startpos)
1635 s--;
1636 while (s < end) {
1637 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1638 if (regtry(&reginfo, s))
6eb5f6b9
JH
1639 goto got_it;
1640 }
1641 }
1642 }
1643 }
1644 goto phooey;
1645 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1646 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1647 goto got_it;
1648 goto phooey;
1649 }
1650
1651 /* Messy cases: unanchored match. */
33b8afdf 1652 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1653 /* we have /x+whatever/ */
1654 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1655 char ch;
bf93d4cc
GS
1656#ifdef DEBUGGING
1657 int did_match = 0;
1658#endif
33b8afdf
JH
1659 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1660 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1661 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1662
1aa99e6b 1663 if (do_utf8) {
4cadc6a9 1664 REXEC_FBC_SCAN(
6eb5f6b9 1665 if (*s == ch) {
a3621e74 1666 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1667 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1668 s += UTF8SKIP(s);
1669 while (s < strend && *s == ch)
1670 s += UTF8SKIP(s);
1671 }
4cadc6a9 1672 );
6eb5f6b9
JH
1673 }
1674 else {
4cadc6a9 1675 REXEC_FBC_SCAN(
6eb5f6b9 1676 if (*s == ch) {
a3621e74 1677 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1678 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1679 s++;
1680 while (s < strend && *s == ch)
1681 s++;
1682 }
4cadc6a9 1683 );
6eb5f6b9 1684 }
a3621e74 1685 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1686 PerlIO_printf(Perl_debug_log,
b7953727
JH
1687 "Did not find anchored character...\n")
1688 );
6eb5f6b9 1689 }
a0714e2c
SS
1690 else if (prog->anchored_substr != NULL
1691 || prog->anchored_utf8 != NULL
1692 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1693 && prog->float_max_offset < strend - s)) {
1694 SV *must;
1695 I32 back_max;
1696 I32 back_min;
1697 char *last;
6eb5f6b9 1698 char *last1; /* Last position checked before */
bf93d4cc
GS
1699#ifdef DEBUGGING
1700 int did_match = 0;
1701#endif
33b8afdf
JH
1702 if (prog->anchored_substr || prog->anchored_utf8) {
1703 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1704 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1705 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1706 back_max = back_min = prog->anchored_offset;
1707 } else {
1708 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1709 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1710 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1711 back_max = prog->float_max_offset;
1712 back_min = prog->float_min_offset;
1713 }
1714 if (must == &PL_sv_undef)
1715 /* could not downgrade utf8 check substring, so must fail */
1716 goto phooey;
1717
1718 last = HOP3c(strend, /* Cannot start after this */
1719 -(I32)(CHR_SVLEN(must)
1720 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1721
1722 if (s > PL_bostr)
1723 last1 = HOPc(s, -1);
1724 else
1725 last1 = s - 1; /* bogus */
1726
a0288114 1727 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1728 check_substr==must. */
1729 scream_pos = -1;
1730 dontbother = end_shift;
1731 strend = HOPc(strend, -dontbother);
1732 while ( (s <= last) &&
9041c2e3 1733 ((flags & REXEC_SCREAM)
1aa99e6b 1734 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1735 end_shift, &scream_pos, 0))
1aa99e6b 1736 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1737 (unsigned char*)strend, must,
7fba1cd6 1738 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1739 /* we may be pointing at the wrong string */
1740 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1741 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1742 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1743 if (HOPc(s, -back_max) > last1) {
1744 last1 = HOPc(s, -back_min);
1745 s = HOPc(s, -back_max);
1746 }
1747 else {
52657f30 1748 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1749
1750 last1 = HOPc(s, -back_min);
52657f30 1751 s = t;
6eb5f6b9 1752 }
1aa99e6b 1753 if (do_utf8) {
6eb5f6b9 1754 while (s <= last1) {
3b0527fe 1755 if (regtry(&reginfo, s))
6eb5f6b9
JH
1756 goto got_it;
1757 s += UTF8SKIP(s);
1758 }
1759 }
1760 else {
1761 while (s <= last1) {
3b0527fe 1762 if (regtry(&reginfo, s))
6eb5f6b9
JH
1763 goto got_it;
1764 s++;
1765 }
1766 }
1767 }
ab3bbdeb
YO
1768 DEBUG_EXECUTE_r(if (!did_match) {
1769 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1770 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1771 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1772 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1773 ? "anchored" : "floating"),
ab3bbdeb
YO
1774 quoted, RE_SV_TAIL(must));
1775 });
6eb5f6b9
JH
1776 goto phooey;
1777 }
155aba94 1778 else if ((c = prog->regstclass)) {
f14c76ed 1779 if (minlen) {
be8e71aa 1780 const OPCODE op = OP(prog->regstclass);
66e933ab 1781 /* don't bother with what can't match */
07be1b83 1782 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
1783 strend = HOPc(strend, -(minlen - 1));
1784 }
a3621e74 1785 DEBUG_EXECUTE_r({
be8e71aa 1786 SV * const prop = sv_newmortal();
32fc9b6a 1787 regprop(prog, prop, c);
0df25f3d 1788 {
ab3bbdeb
YO
1789 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1790 s,strend-s,60);
0df25f3d 1791 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1792 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1793 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1794 quoted, (int)(strend - s));
0df25f3d 1795 }
ffc61ed2 1796 });
3b0527fe 1797 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1798 goto got_it;
07be1b83 1799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1800 }
1801 else {
1802 dontbother = 0;
a0714e2c 1803 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1804 /* Trim the end. */
d6a28714 1805 char *last;
33b8afdf
JH
1806 SV* float_real;
1807
1808 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1809 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1810 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1811
1812 if (flags & REXEC_SCREAM) {
33b8afdf 1813 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1814 end_shift, &scream_pos, 1); /* last one */
1815 if (!last)
ffc61ed2 1816 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1817 /* we may be pointing at the wrong string */
1818 else if (RX_MATCH_COPIED(prog))
3f7c398e 1819 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1820 }
d6a28714
JH
1821 else {
1822 STRLEN len;
cfd0369c 1823 const char * const little = SvPV_const(float_real, len);
d6a28714 1824
33b8afdf 1825 if (SvTAIL(float_real)) {
d6a28714
JH
1826 if (memEQ(strend - len + 1, little, len - 1))
1827 last = strend - len + 1;
7fba1cd6 1828 else if (!multiline)
9041c2e3 1829 last = memEQ(strend - len, little, len)
bd61b366 1830 ? strend - len : NULL;
b8c5462f 1831 else
d6a28714
JH
1832 goto find_last;
1833 } else {
1834 find_last:
9041c2e3 1835 if (len)
d6a28714 1836 last = rninstr(s, strend, little, little + len);
b8c5462f 1837 else
a0288114 1838 last = strend; /* matching "$" */
b8c5462f 1839 }
b8c5462f 1840 }
bf93d4cc 1841 if (last == NULL) {
a3621e74 1842 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1843 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1844 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1845 goto phooey; /* Should not happen! */
1846 }
d6a28714
JH
1847 dontbother = strend - last + prog->float_min_offset;
1848 }
1849 if (minlen && (dontbother < minlen))
1850 dontbother = minlen - 1;
1851 strend -= dontbother; /* this one's always in bytes! */
1852 /* We don't know much -- general case. */
1aa99e6b 1853 if (do_utf8) {
d6a28714 1854 for (;;) {
3b0527fe 1855 if (regtry(&reginfo, s))
d6a28714
JH
1856 goto got_it;
1857 if (s >= strend)
1858 break;
b8c5462f 1859 s += UTF8SKIP(s);
d6a28714
JH
1860 };
1861 }
1862 else {
1863 do {
3b0527fe 1864 if (regtry(&reginfo, s))
d6a28714
JH
1865 goto got_it;
1866 } while (s++ < strend);
1867 }
1868 }
1869
1870 /* Failure. */
1871 goto phooey;
1872
1873got_it:
1874 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1875
1876 if (PL_reg_eval_set) {
1877 /* Preserve the current value of $^R */
1878 if (oreplsv != GvSV(PL_replgv))
1879 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1880 restored, the value remains
1881 the same. */
4f639d21 1882 restore_pos(aTHX_ prog);
d6a28714
JH
1883 }
1884
1885 /* make sure $`, $&, $', and $digit will work later */
1886 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 1887 RX_MATCH_COPY_FREE(prog);
d6a28714 1888 if (flags & REXEC_COPY_STR) {
be8e71aa 1889 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 1890#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
1891 if ((SvIsCOW(sv)
1892 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1893 if (DEBUG_C_TEST) {
1894 PerlIO_printf(Perl_debug_log,
1895 "Copy on write: regexp capture, type %d\n",
1896 (int) SvTYPE(sv));
1897 }
1898 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 1899 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
1900 assert (SvPOKp(prog->saved_copy));
1901 } else
1902#endif
1903 {
1904 RX_MATCH_COPIED_on(prog);
1905 s = savepvn(strbeg, i);
1906 prog->subbeg = s;
1907 }
d6a28714 1908 prog->sublen = i;
d6a28714
JH
1909 }
1910 else {
1911 prog->subbeg = strbeg;
1912 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1913 }
1914 }
9041c2e3 1915
d6a28714
JH
1916 return 1;
1917
1918phooey:
a3621e74 1919 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 1920 PL_colors[4], PL_colors[5]));
d6a28714 1921 if (PL_reg_eval_set)
4f639d21 1922 restore_pos(aTHX_ prog);
d6a28714
JH
1923 return 0;
1924}
1925
1926/*
1927 - regtry - try match at specific point
1928 */
1929STATIC I32 /* 0 failure, 1 success */
3b0527fe 1930S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 1931{
97aff369 1932 dVAR;
d6a28714
JH
1933 register I32 *sp;
1934 register I32 *ep;
1935 CHECKPOINT lastcp;
3b0527fe 1936 regexp *prog = reginfo->prog;
a3621e74 1937 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 1938
02db2b7b
IZ
1939#ifdef DEBUGGING
1940 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1941#endif
d6a28714
JH
1942 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1943 MAGIC *mg;
1944
1945 PL_reg_eval_set = RS_init;
a3621e74 1946 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
1947 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1948 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 1949 ));
e8347627 1950 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
1951 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1952 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1953 SAVETMPS;
1954 /* Apparently this is not needed, judging by wantarray. */
e8347627 1955 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
1956 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1957
3b0527fe 1958 if (reginfo->sv) {
d6a28714 1959 /* Make $_ available to executed code. */
3b0527fe 1960 if (reginfo->sv != DEFSV) {
59f00321 1961 SAVE_DEFSV;
3b0527fe 1962 DEFSV = reginfo->sv;
b8c5462f 1963 }
d6a28714 1964
3b0527fe
DM
1965 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1966 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 1967 /* prepare for quick setting of pos */
d300d9fa
NC
1968#ifdef PERL_OLD_COPY_ON_WRITE
1969 if (SvIsCOW(sv))
1970 sv_force_normal_flags(sv, 0);
1971#endif
3dab1dad 1972 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 1973 &PL_vtbl_mglob, NULL, 0);
d6a28714 1974 mg->mg_len = -1;
b8c5462f 1975 }
d6a28714
JH
1976 PL_reg_magic = mg;
1977 PL_reg_oldpos = mg->mg_len;
4f639d21 1978 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 1979 }
09687e5a 1980 if (!PL_reg_curpm) {
a02a5408 1981 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
1982#ifdef USE_ITHREADS
1983 {
be8e71aa 1984 SV* const repointer = newSViv(0);
577e12cc 1985 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 1986 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
1987 av_push(PL_regex_padav,repointer);
1988 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1989 PL_regex_pad = AvARRAY(PL_regex_padav);
1990 }
1991#endif
1992 }
aaa362c4 1993 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
1994 PL_reg_oldcurpm = PL_curpm;
1995 PL_curpm = PL_reg_curpm;
1996 if (RX_MATCH_COPIED(prog)) {
1997 /* Here is a serious problem: we cannot rewrite subbeg,
1998 since it may be needed if this match fails. Thus
1999 $` inside (?{}) could fail... */
2000 PL_reg_oldsaved = prog->subbeg;
2001 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2002#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2003 PL_nrs = prog->saved_copy;
2004#endif
d6a28714
JH
2005 RX_MATCH_COPIED_off(prog);
2006 }
2007 else
bd61b366 2008 PL_reg_oldsaved = NULL;
d6a28714
JH
2009 prog->subbeg = PL_bostr;
2010 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2011 }
973dddac 2012 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2013 PL_reginput = startpos;
2014 PL_regstartp = prog->startp;
2015 PL_regendp = prog->endp;
2016 PL_reglastparen = &prog->lastparen;
a01268b5 2017 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2018 prog->lastparen = 0;
03994de8 2019 prog->lastcloseparen = 0;
d6a28714 2020 PL_regsize = 0;
a3621e74 2021 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2022 if (PL_reg_start_tmpl <= prog->nparens) {
2023 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2024 if(PL_reg_start_tmp)
2025 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2026 else
a02a5408 2027 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2028 }
2029
2030 /* XXXX What this code is doing here?!!! There should be no need
2031 to do this again and again, PL_reglastparen should take care of
3dd2943c 2032 this! --ilya*/
dafc8851
JH
2033
2034 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2035 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2036 * PL_reglastparen), is not needed at all by the test suite
2037 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2038 * enough, for building DynaLoader, or otherwise this
2039 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2040 * will happen. Meanwhile, this code *is* needed for the
2041 * above-mentioned test suite tests to succeed. The common theme
2042 * on those tests seems to be returning null fields from matches.
2043 * --jhi */
dafc8851 2044#if 1
d6a28714
JH
2045 sp = prog->startp;
2046 ep = prog->endp;
2047 if (prog->nparens) {
097eb12c 2048 register I32 i;
eb160463 2049 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2050 *++sp = -1;
2051 *++ep = -1;
2052 }
2053 }
dafc8851 2054#endif
02db2b7b 2055 REGCP_SET(lastcp);
3b0527fe 2056 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2057 prog->endp[0] = PL_reginput - PL_bostr;
2058 return 1;
2059 }
02db2b7b 2060 REGCP_UNWIND(lastcp);
d6a28714
JH
2061 return 0;
2062}
2063
02db2b7b 2064
8ba1375e
MJD
2065#define sayYES goto yes
2066#define sayNO goto no
e0f9d4a8 2067#define sayNO_ANYOF goto no_anyof
8ba1375e 2068#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2069#define sayNO_FINAL goto no_final
2070#define sayNO_SILENT goto do_no
2071#define saySAME(x) if (x) goto yes; else goto no
2072
3ab3c9b4 2073#define CACHEsayNO STMT_START { \
3298f257
DM
2074 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2075 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2076 (1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2077 sayNO; \
2078} STMT_END
2079
3298f257 2080
a3621e74
YO
2081/* this is used to determine how far from the left messages like
2082 'failed...' are printed. Currently 29 makes these messages line
2083 up with the opcode they refer to. Earlier perls used 25 which
2084 left these messages outdented making reviewing a debug output
2085 quite difficult.
2086*/
2087#define REPORT_CODE_OFF 29
2088
2089
2090/* Make sure there is a test for this +1 options in re_tests */
2091#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2092
40a82448
DM
2093#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2094#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2095
86545054
DM
2096#define SLAB_FIRST(s) (&(s)->states[0])
2097#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2098
5d9a96ca
DM
2099/* grab a new slab and return the first slot in it */
2100
2101STATIC regmatch_state *
2102S_push_slab(pTHX)
2103{
54df2634
NC
2104#if PERL_VERSION < 9
2105 dMY_CXT;
2106#endif
5d9a96ca
DM
2107 regmatch_slab *s = PL_regmatch_slab->next;
2108 if (!s) {
2109 Newx(s, 1, regmatch_slab);
2110 s->prev = PL_regmatch_slab;
2111 s->next = NULL;
2112 PL_regmatch_slab->next = s;
2113 }
2114 PL_regmatch_slab = s;
86545054 2115 return SLAB_FIRST(s);
5d9a96ca 2116}
5b47454d 2117
95b24440
DM
2118/* simulate a recursive call to regmatch */
2119
2120#define REGMATCH(ns, where) \
5d9a96ca
DM
2121 st->scan = scan; \
2122 scan = (ns); \
2123 st->resume_state = resume_##where; \
95b24440
DM
2124 goto start_recurse; \
2125 resume_point_##where:
2126
40a82448
DM
2127/* push a new state then goto it */
2128
2129#define PUSH_STATE_GOTO(state, node) \
2130 scan = node; \
2131 st->resume_state = state; \
2132 goto push_state;
2133
2134/* push a new state with success backtracking, then goto it */
2135
2136#define PUSH_YES_STATE_GOTO(state, node) \
2137 scan = node; \
2138 st->resume_state = state; \
2139 goto push_yes_state;
2140
aa283a38 2141
aa283a38 2142
d6a28714
JH
2143/*
2144 - regmatch - main matching routine
2145 *
2146 * Conceptually the strategy is simple: check to see whether the current
2147 * node matches, call self recursively to see whether the rest matches,
2148 * and then act accordingly. In practice we make some effort to avoid
2149 * recursion, in particular by going through "ordinary" nodes (that don't
2150 * need to know whether the rest of the match failed) by a loop instead of
2151 * by recursion.
2152 */
2153/* [lwall] I've hoisted the register declarations to the outer block in order to
2154 * maybe save a little bit of pushing and popping on the stack. It also takes
2155 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2156 *
2157 * This function used to be heavily recursive, but since this had the
2158 * effect of blowing the CPU stack on complex regexes, it has been
2159 * restructured to be iterative, and to save state onto the heap rather
2160 * than the stack. Essentially whereever regmatch() used to be called, it
2161 * pushes the current state, notes where to return, then jumps back into
2162 * the main loop.
2163 *
2164 * Originally the structure of this function used to look something like
2165
2166 S_regmatch() {
2167 int a = 1, b = 2;
2168 ...
2169 while (scan != NULL) {
5d9a96ca 2170 a++; // do stuff with a and b
95b24440
DM
2171 ...
2172 switch (OP(scan)) {
2173 case FOO: {
2174 int local = 3;
2175 ...
2176 if (regmatch(...)) // recurse
2177 goto yes;
2178 }
2179 ...
2180 }
2181 }
2182 yes:
2183 return 1;
2184 }
2185
2186 * Now it looks something like this:
2187
5d9a96ca 2188 typedef struct {
95b24440
DM
2189 int a, b, local;
2190 int resume_state;
5d9a96ca 2191 } regmatch_state;
95b24440
DM
2192
2193 S_regmatch() {
5d9a96ca
DM
2194 regmatch_state *st = new();
2195 int depth=0;
2196 st->a++; // do stuff with a and b
95b24440
DM
2197 ...
2198 while (scan != NULL) {
2199 ...
2200 switch (OP(scan)) {
2201 case FOO: {
5d9a96ca 2202 st->local = 3;
95b24440 2203 ...
5d9a96ca
DM
2204 st->scan = scan;
2205 scan = ...;
2206 st->resume_state = resume_FOO;
2207 goto start_recurse; // recurse
95b24440 2208
5d9a96ca
DM
2209 resume_point_FOO:
2210 if (result)
95b24440
DM
2211 goto yes;
2212 }
2213 ...
2214 }
5d9a96ca
DM
2215 start_recurse:
2216 st = new(); push a new state
2217 st->a = 1; st->b = 2;
2218 depth++;
95b24440 2219 }
5d9a96ca 2220 yes:
95b24440 2221 result = 1;
5d9a96ca
DM
2222 if (depth--) {
2223 st = pop();
95b24440
DM
2224 switch (resume_state) {
2225 case resume_FOO:
2226 goto resume_point_FOO;
2227 ...
2228 }
2229 }
2230 return result
2231 }
2232
2233 * WARNING: this means that any line in this function that contains a
2234 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2235 * regmatch() using gotos instead. Thus the values of any local variables
2236 * not saved in the regmatch_state structure will have been lost when
2237 * execution resumes on the next line .
5d9a96ca
DM
2238 *
2239 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2240 * PL_regmatch_state always points to the currently active state, and
2241 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2242 * The first time regmatch is called, the first slab is allocated, and is
2243 * never freed until interpreter desctruction. When the slab is full,
2244 * a new one is allocated chained to the end. At exit from regmatch, slabs
2245 * allocated since entry are freed.
d6a28714 2246 */
95b24440 2247
40a82448 2248/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2249#define TRIE_next (REGNODE_MAX+1)
2250#define TRIE_next_fail (REGNODE_MAX+2)
2251#define EVAL_A (REGNODE_MAX+3)
2252#define EVAL_A_fail (REGNODE_MAX+4)
2253#define resume_CURLYX (REGNODE_MAX+5)
2254#define resume_WHILEM1 (REGNODE_MAX+6)
2255#define resume_WHILEM2 (REGNODE_MAX+7)
2256#define resume_WHILEM3 (REGNODE_MAX+8)
2257#define resume_WHILEM4 (REGNODE_MAX+9)
2258#define resume_WHILEM5 (REGNODE_MAX+10)
2259#define resume_WHILEM6 (REGNODE_MAX+11)
2260#define BRANCH_next (REGNODE_MAX+12)
2261#define BRANCH_next_fail (REGNODE_MAX+13)
2262#define CURLYM_A (REGNODE_MAX+14)
2263#define CURLYM_A_fail (REGNODE_MAX+15)
2264#define CURLYM_B (REGNODE_MAX+16)
2265#define CURLYM_B_fail (REGNODE_MAX+17)
2266#define IFMATCH_A (REGNODE_MAX+18)
2267#define IFMATCH_A_fail (REGNODE_MAX+19)
2268#define CURLY_B_min_known (REGNODE_MAX+20)
2269#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2270#define CURLY_B_min (REGNODE_MAX+22)
2271#define CURLY_B_min_fail (REGNODE_MAX+23)
2272#define CURLY_B_max (REGNODE_MAX+24)
2273#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2274
2275
3dab1dad 2276#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2277
3df15adc 2278#ifdef DEBUGGING
ab3bbdeb
YO
2279STATIC void
2280S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2281 const char *start, const char *end, const char *blurb)
2282{
2283 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2284 if (!PL_colorset)
2285 reginitcolors();
2286 {
2287 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2288 prog->precomp, prog->prelen, 60);
2289
2290 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2291 start, end - start, 60);
2292
2293 PerlIO_printf(Perl_debug_log,
2294 "%s%s REx%s %s against %s\n",
2295 PL_colors[4], blurb, PL_colors[5], s0, s1);
2296
2297 if (do_utf8||utf8_pat)
2298 PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
2299 !do_utf8 ? "pattern" : !utf8_pat ? "string" :
2300 "pattern and string"
2301 );
2302 }
2303}
3df15adc
YO
2304
2305STATIC void
07be1b83
YO
2306S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2307{
2308 const int docolor = *PL_colors[0];
2309 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2310 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2311 /* The part of the string before starttry has one color
2312 (pref0_len chars), between starttry and current
2313 position another one (pref_len - pref0_len chars),
2314 after the current position the third one.
2315 We assume that pref0_len <= pref_len, otherwise we
2316 decrease pref0_len. */
2317 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2318 ? (5 + taill) - l : locinput - PL_bostr;
2319 int pref0_len;
2320
2321 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2322 pref_len++;
2323 pref0_len = pref_len - (locinput - PL_reg_starttry);
2324 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2325 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2326 ? (5 + taill) - pref_len : PL_regeol - locinput);
2327 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2328 l--;
2329 if (pref0_len < 0)
2330 pref0_len = 0;
2331 if (pref0_len > pref_len)
2332 pref0_len = pref_len;
2333 {
3df15adc 2334 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2335
ab3bbdeb
YO
2336 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2337 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2338
ab3bbdeb 2339 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2340 (locinput - pref_len + pref0_len),
ab3bbdeb 2341 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2342
ab3bbdeb
YO
2343 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2344 locinput, PL_regeol - locinput, 60, 0, 1);
0df25f3d 2345
3df15adc 2346 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2347 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
07be1b83 2348 (IV)(locinput - PL_bostr),
07be1b83 2349 len0, s0,
07be1b83 2350 len1, s1,
07be1b83 2351 (docolor ? "" : "> <"),
07be1b83 2352 len2, s2,
07be1b83
YO
2353 15 - l - pref_len + 1,
2354 "");
2355 }
2356}
3df15adc 2357
07be1b83
YO
2358#endif
2359
d6a28714 2360STATIC I32 /* 0 failure, 1 success */
3b0527fe 2361S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2362{
54df2634
NC
2363#if PERL_VERSION < 9
2364 dMY_CXT;
2365#endif
27da23d5 2366 dVAR;
95b24440 2367 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2368 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2369
3b0527fe
DM
2370 regexp *rex = reginfo->prog;
2371
5d9a96ca
DM
2372 regmatch_slab *orig_slab;
2373 regmatch_state *orig_state;
a3621e74 2374
5d9a96ca
DM
2375 /* the current state. This is a cached copy of PL_regmatch_state */
2376 register regmatch_state *st;
95b24440 2377
5d9a96ca
DM
2378 /* cache heavy used fields of st in registers */
2379 register regnode *scan;
2380 register regnode *next;
2381 register I32 n = 0; /* initialize to shut up compiler warning */
2382 register char *locinput = PL_reginput;
95b24440 2383
5d9a96ca
DM
2384 /* these variables are NOT saved during a recusive RFEGMATCH: */
2385 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2386 bool result; /* return value of S_regmatch */
5d9a96ca 2387 int depth = 0; /* depth of recursion */
77cb431f
DM
2388 regmatch_state *yes_state = NULL; /* state to pop to on success of
2389 subpattern */
40a82448 2390 U32 state_num;
95b24440
DM
2391
2392#ifdef DEBUGGING
e68ec53f 2393 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2394 PL_regindent++;
2395#endif
2396
5d9a96ca
DM
2397 /* on first ever call to regmatch, allocate first slab */
2398 if (!PL_regmatch_slab) {
2399 Newx(PL_regmatch_slab, 1, regmatch_slab);
2400 PL_regmatch_slab->prev = NULL;
2401 PL_regmatch_slab->next = NULL;
86545054 2402 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2403 }
2404
2405 /* remember current high-water mark for exit */
2406 /* XXX this should be done with SAVE* instead */
2407 orig_slab = PL_regmatch_slab;
2408 orig_state = PL_regmatch_state;
2409
2410 /* grab next free state slot */
2411 st = ++PL_regmatch_state;
86545054 2412 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2413 st = PL_regmatch_state = S_push_slab(aTHX);
2414
2415 st->minmod = 0;
2416 st->sw = 0;
2417 st->logical = 0;
5d9a96ca 2418 st->cc = NULL;
d6a28714
JH
2419 /* Note that nextchr is a byte even in UTF */
2420 nextchr = UCHARAT(locinput);
2421 scan = prog;
2422 while (scan != NULL) {
8ba1375e 2423
a3621e74 2424 DEBUG_EXECUTE_r( {
6136c704 2425 SV * const prop = sv_newmortal();
07be1b83 2426 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2427 regprop(rex, prop, scan);
07be1b83
YO
2428
2429 PerlIO_printf(Perl_debug_log,
2430 "%3"IVdf":%*s%s(%"IVdf")\n",
2431 (IV)(scan - rex->program), PL_regindent*2, "",
2432 SvPVX_const(prop),
2433 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2434 });
d6a28714
JH
2435
2436 next = scan + NEXT_OFF(scan);
2437 if (next == scan)
2438 next = NULL;
40a82448 2439 state_num = OP(scan);
d6a28714 2440
40a82448
DM
2441 reenter_switch:
2442 switch (state_num) {
d6a28714 2443 case BOL:
7fba1cd6 2444 if (locinput == PL_bostr)
d6a28714 2445 {
3b0527fe 2446 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2447 break;
2448 }
d6a28714
JH
2449 sayNO;
2450 case MBOL:
12d33761
HS
2451 if (locinput == PL_bostr ||
2452 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2453 {
b8c5462f
JH
2454 break;
2455 }
d6a28714
JH
2456 sayNO;
2457 case SBOL:
c2a73568 2458 if (locinput == PL_bostr)
b8c5462f 2459 break;
d6a28714
JH
2460 sayNO;
2461 case GPOS:
3b0527fe 2462 if (locinput == reginfo->ganch)
d6a28714
JH
2463 break;
2464 sayNO;
2465 case EOL:
d6a28714
JH
2466 goto seol;
2467 case MEOL:
d6a28714 2468 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2469 sayNO;
b8c5462f 2470 break;
d6a28714
JH
2471 case SEOL:
2472 seol:
2473 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2474 sayNO;
d6a28714 2475 if (PL_regeol - locinput > 1)
b8c5462f 2476 sayNO;
b8c5462f 2477 break;
d6a28714
JH
2478 case EOS:
2479 if (PL_regeol != locinput)
b8c5462f 2480 sayNO;
d6a28714 2481 break;
ffc61ed2 2482 case SANY:
d6a28714 2483 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2484 sayNO;
f33976b4
DB
2485 if (do_utf8) {
2486 locinput += PL_utf8skip[nextchr];
2487 if (locinput > PL_regeol)
2488 sayNO;
2489 nextchr = UCHARAT(locinput);
2490 }
2491 else
2492 nextchr = UCHARAT(++locinput);
2493 break;
2494 case CANY:
2495 if (!nextchr && locinput >= PL_regeol)
2496 sayNO;
b8c5462f 2497 nextchr = UCHARAT(++locinput);
a0d0e21e 2498 break;
ffc61ed2 2499 case REG_ANY:
1aa99e6b
IH
2500 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2501 sayNO;
2502 if (do_utf8) {
b8c5462f 2503 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2504 if (locinput > PL_regeol)
2505 sayNO;
a0ed51b3 2506 nextchr = UCHARAT(locinput);
a0ed51b3 2507 }
1aa99e6b
IH
2508 else
2509 nextchr = UCHARAT(++locinput);
a0ed51b3 2510 break;
166ba7cd
DM
2511
2512#undef ST
2513#define ST st->u.trie
2514
5b47454d 2515 case TRIE:
3dab1dad 2516 {
07be1b83 2517 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2518 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2519 trie_type = do_utf8 ?
2520 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2521 : trie_plain;
2522
2523 /* what trie are we using right now */
be8e71aa 2524 reg_trie_data * const trie
3dab1dad
YO
2525 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2526 U32 state = trie->startstate;
166ba7cd
DM
2527
2528 U8 *uc = ( U8* )locinput;
2529 U16 charid = 0;
2530 U32 base = 0;
2531 UV uvc = 0;
2532 STRLEN len = 0;
2533 STRLEN foldlen = 0;
2534 U8 *uscan = (U8*)NULL;
2535 STRLEN bufflen=0;
2536 SV *sv_accept_buff = NULL;
2537 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2538
2539 ST.accepted = 0; /* how many accepting states we have seen */
2540 ST.B = next;
2541#ifdef DEBUGGING
2542 ST.me = scan;
2543#endif
3dab1dad
YO
2544
2545 if (trie->bitmap && trie_type != trie_utf8_fold &&
2546 !TRIE_BITMAP_TEST(trie,*locinput)
2547 ) {
2548 if (trie->states[ state ].wordnum) {
2549 DEBUG_EXECUTE_r(
2550 PerlIO_printf(Perl_debug_log,
2551 "%*s %smatched empty string...%s\n",
2552 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2553 );
2554 break;
2555 } else {
2556 DEBUG_EXECUTE_r(
2557 PerlIO_printf(Perl_debug_log,
2558 "%*s %sfailed to match start class...%s\n",
2559 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2560 );
2561 sayNO_SILENT;
2562 }
2563 }
166ba7cd 2564
07be1b83
YO
2565 /*
2566 traverse the TRIE keeping track of all accepting states
2567 we transition through until we get to a failing node.
2568 */
2569
a3621e74
YO
2570 while ( state && uc <= (U8*)PL_regeol ) {
2571
5b47454d 2572 if (trie->states[ state ].wordnum) {
166ba7cd 2573 if (!ST.accepted ) {
5b47454d
DM
2574 ENTER;
2575 SAVETMPS;
2576 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2577 sv_accept_buff=newSV(bufflen *
2578 sizeof(reg_trie_accepted) - 1);
2579 SvCUR_set(sv_accept_buff,
2580 sizeof(reg_trie_accepted));
2581 SvPOK_on(sv_accept_buff);
2582 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2583 SAVETMPS;
2584 ST.accept_buff =
5b47454d
DM
2585 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2586 }
2587 else {
166ba7cd 2588 if (ST.accepted >= bufflen) {
5b47454d 2589 bufflen *= 2;
166ba7cd 2590 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2591 SvGROW(sv_accept_buff,
2592 bufflen * sizeof(reg_trie_accepted));
2593 }
2594 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2595 + sizeof(reg_trie_accepted));
2596 }
166ba7cd
DM
2597 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2598 ST.accept_buff[ST.accepted].endpos = uc;
2599 ++ST.accepted;
5b47454d 2600 }
a3621e74
YO
2601
2602 base = trie->states[ state ].trans.base;
2603
07be1b83
YO
2604 DEBUG_TRIE_EXECUTE_r({
2605 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2606 PerlIO_printf( Perl_debug_log,
e4584336 2607 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2608 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2609 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2610 });
a3621e74
YO
2611
2612 if ( base ) {
4cadc6a9
YO
2613 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2614 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2615
5b47454d
DM
2616 if (charid &&
2617 (base + charid > trie->uniquecharcount )
2618 && (base + charid - 1 - trie->uniquecharcount
2619 < trie->lasttrans)
2620 && trie->trans[base + charid - 1 -
2621 trie->uniquecharcount].check == state)
2622 {
2623 state = trie->trans[base + charid - 1 -
2624 trie->uniquecharcount ].next;
2625 }
2626 else {
2627 state = 0;
2628 }
2629 uc += len;
2630
2631 }
2632 else {
a3621e74
YO
2633 state = 0;
2634 }
2635 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2636 PerlIO_printf( Perl_debug_log,
2637 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2638 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2639 );
2640 }
166ba7cd 2641 if (!ST.accepted )
a3621e74 2642 sayNO;
a3621e74 2643
166ba7cd
DM
2644 DEBUG_EXECUTE_r(
2645 PerlIO_printf( Perl_debug_log,
2646 "%*s %sgot %"IVdf" possible matches%s\n",
2647 REPORT_CODE_OFF + PL_regindent * 2, "",
2648 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2649 );
2650 }
2651
2652 /* FALL THROUGH */
2653
2654 case TRIE_next_fail: /* we failed - try next alterative */
2655
2656 if ( ST.accepted == 1 ) {
2657 /* only one choice left - just continue */
2658 DEBUG_EXECUTE_r({
2659 reg_trie_data * const trie
2660 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2661 SV ** const tmp = RX_DEBUG(reginfo->prog)
2662 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2663 : NULL;
2664 PerlIO_printf( Perl_debug_log,
2665 "%*s %sonly one match left: #%d <%s>%s\n",
2666 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2667 ST.accept_buff[ 0 ].wordnum,
2668 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2669 PL_colors[5] );
2670 });
2671 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2672 /* in this case we free tmps/leave before we call regmatch
2673 as we wont be using accept_buff again. */
2674 FREETMPS;
2675 LEAVE;
2676 locinput = PL_reginput;
2677 nextchr = UCHARAT(locinput);
2678 scan = ST.B;
2679 continue; /* execute rest of RE */
2680 }
2681
2682 if (!ST.accepted-- ) {
2683 FREETMPS;
2684 LEAVE;
2685 sayNO;
2686 }
2687
a3621e74 2688 /*
166ba7cd
DM
2689 There are at least two accepting states left. Presumably
2690 the number of accepting states is going to be low,
2691 typically two. So we simply scan through to find the one
2692 with lowest wordnum. Once we find it, we swap the last
2693 state into its place and decrement the size. We then try to
2694 match the rest of the pattern at the point where the word
2695 ends. If we succeed, control just continues along the
2696 regex; if we fail we return here to try the next accepting
2697 state
2698 */
a3621e74 2699
166ba7cd
DM
2700 {
2701 U32 best = 0;
2702 U32 cur;
2703 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2704 DEBUG_TRIE_EXECUTE_r(
f2278c82 2705 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
2706 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2707 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2708 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2709 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2710 );
2711
2712 if (ST.accept_buff[cur].wordnum <
2713 ST.accept_buff[best].wordnum)
2714 best = cur;
a3621e74 2715 }
166ba7cd
DM
2716
2717 DEBUG_EXECUTE_r({
2718 reg_trie_data * const trie
2719 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2720 SV ** const tmp = RX_DEBUG(reginfo->prog)
2721 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2722 : NULL;
2723 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2724 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2725 ST.accept_buff[best].wordnum,
2726 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2727 PL_colors[5] );
2728 });
2729
2730 if ( best<ST.accepted ) {
2731 reg_trie_accepted tmp = ST.accept_buff[ best ];
2732 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2733 ST.accept_buff[ ST.accepted ] = tmp;
2734 best = ST.accepted;
a3621e74 2735 }
166ba7cd
DM
2736 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2737 }
2738 PUSH_STATE_GOTO(TRIE_next, ST.B);
2739 /* NOTREACHED */
2740
2741#undef ST
2742
95b24440
DM
2743 case EXACT: {
2744 char *s = STRING(scan);
5d9a96ca 2745 st->ln = STR_LEN(scan);
eb160463 2746 if (do_utf8 != UTF) {
bc517b45 2747 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2748 char *l = locinput;
be8e71aa 2749 const char * const e = s + st->ln;
a72c7584 2750
5ff6fc6d
JH
2751 if (do_utf8) {
2752 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2753 while (s < e) {
a3b680e6 2754 STRLEN ulen;
1aa99e6b 2755 if (l >= PL_regeol)
5ff6fc6d
JH
2756 sayNO;
2757 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2758 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2759 uniflags))
5ff6fc6d 2760 sayNO;
bc517b45 2761 l += ulen;
5ff6fc6d 2762 s ++;
1aa99e6b 2763 }
5ff6fc6d
JH
2764 }
2765 else {
2766 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2767 while (s < e) {
a3b680e6 2768 STRLEN ulen;
1aa99e6b
IH
2769 if (l >= PL_regeol)
2770 sayNO;
5ff6fc6d 2771 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2772 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2773 uniflags))
1aa99e6b 2774 sayNO;
bc517b45 2775 s += ulen;
a72c7584 2776 l ++;
1aa99e6b 2777 }
5ff6fc6d 2778 }
1aa99e6b
IH
2779 locinput = l;
2780 nextchr = UCHARAT(locinput);
2781 break;
2782 }
bc517b45 2783 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2784 /* Inline the first character, for speed. */
2785 if (UCHARAT(s) != nextchr)
2786 sayNO;
5d9a96ca 2787 if (PL_regeol - locinput < st->ln)
d6a28714 2788 sayNO;
5d9a96ca 2789 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2790 sayNO;
5d9a96ca 2791 locinput += st->ln;
d6a28714
JH
2792 nextchr = UCHARAT(locinput);
2793 break;
95b24440 2794 }
d6a28714 2795 case EXACTFL:
b8c5462f
JH
2796 PL_reg_flags |= RF_tainted;
2797 /* FALL THROUGH */
95b24440 2798 case EXACTF: {
be8e71aa 2799 char * const s = STRING(scan);
5d9a96ca 2800 st->ln = STR_LEN(scan);
d6a28714 2801
d07ddd77
JH
2802 if (do_utf8 || UTF) {
2803 /* Either target or the pattern are utf8. */
be8e71aa 2804 const char * const l = locinput;
d07ddd77 2805 char *e = PL_regeol;
bc517b45 2806
5d9a96ca 2807 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2808 l, &e, 0, do_utf8)) {
5486206c
JH
2809 /* One more case for the sharp s:
2810 * pack("U0U*", 0xDF) =~ /ss/i,
2811 * the 0xC3 0x9F are the UTF-8
2812 * byte sequence for the U+00DF. */
2813 if (!(do_utf8 &&
2814 toLOWER(s[0]) == 's' &&
5d9a96ca 2815 st->ln >= 2 &&
5486206c
JH
2816 toLOWER(s[1]) == 's' &&
2817 (U8)l[0] == 0xC3 &&
2818 e - l >= 2 &&
2819 (U8)l[1] == 0x9F))
2820 sayNO;
2821 }
d07ddd77
JH
2822 locinput = e;
2823 nextchr = UCHARAT(locinput);
2824 break;
a0ed51b3 2825 }
d6a28714 2826
bc517b45
JH
2827 /* Neither the target and the pattern are utf8. */
2828
d6a28714
JH
2829 /* Inline the first character, for speed. */
2830 if (UCHARAT(s) != nextchr &&
2831 UCHARAT(s) != ((OP(scan) == EXACTF)
2832 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2833 sayNO;
5d9a96ca 2834 if (PL_regeol - locinput < st->ln)
b8c5462f 2835 sayNO;
5d9a96ca
DM
2836 if (st->ln > 1 && (OP(scan) == EXACTF
2837 ? ibcmp(s, locinput, st->ln)
2838 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2839 sayNO;
5d9a96ca 2840 locinput += st->ln;
d6a28714 2841 nextchr = UCHARAT(locinput);
a0d0e21e 2842 break;
95b24440 2843 }
d6a28714 2844 case ANYOF:
ffc61ed2 2845 if (do_utf8) {
9e55ce06
JH
2846 STRLEN inclasslen = PL_regeol - locinput;
2847
32fc9b6a 2848 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2849 sayNO_ANYOF;
ffc61ed2
JH
2850 if (locinput >= PL_regeol)
2851 sayNO;
0f0076b4 2852 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2853 nextchr = UCHARAT(locinput);
e0f9d4a8 2854 break;
ffc61ed2
JH
2855 }
2856 else {
2857 if (nextchr < 0)
2858 nextchr = UCHARAT(locinput);
32fc9b6a 2859 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 2860 sayNO_ANYOF;
ffc61ed2
JH
2861 if (!nextchr && locinput >= PL_regeol)
2862 sayNO;
2863 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2864 break;
2865 }
2866 no_anyof:
2867 /* If we might have the case of the German sharp s
2868 * in a casefolding Unicode character class. */
2869
ebc501f0
JH
2870 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2871 locinput += SHARP_S_SKIP;
e0f9d4a8 2872 nextchr = UCHARAT(locinput);
ffc61ed2 2873 }
e0f9d4a8
JH
2874 else
2875 sayNO;
b8c5462f 2876 break;
d6a28714 2877 case ALNUML:
b8c5462f
JH
2878 PL_reg_flags |= RF_tainted;
2879 /* FALL THROUGH */
d6a28714 2880 case ALNUM:
b8c5462f 2881 if (!nextchr)
4633a7c4 2882 sayNO;
ffc61ed2 2883 if (do_utf8) {
1a4fad37 2884 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2885 if (!(OP(scan) == ALNUM
bb7a0f54 2886 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2887 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2888 {
2889 sayNO;
a0ed51b3 2890 }
b8c5462f 2891 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2892 nextchr = UCHARAT(locinput);
2893 break;
2894 }
ffc61ed2 2895 if (!(OP(scan) == ALNUM
d6a28714 2896 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2897 sayNO;
b8c5462f 2898 nextchr = UCHARAT(++locinput);
a0d0e21e 2899 break;
d6a28714 2900 case NALNUML:
b8c5462f
JH
2901 PL_reg_flags |= RF_tainted;
2902 /* FALL THROUGH */
d6a28714
JH
2903 case NALNUM:
2904 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2905 sayNO;
ffc61ed2 2906 if (do_utf8) {
1a4fad37 2907 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2908 if (OP(scan) == NALNUM
bb7a0f54 2909 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2910 : isALNUM_LC_utf8((U8*)locinput))
2911 {
b8c5462f 2912 sayNO;
d6a28714 2913 }
b8c5462f
JH
2914 locinput += PL_utf8skip[nextchr];
2915 nextchr = UCHARAT(locinput);
2916 break;
2917 }
ffc61ed2 2918 if (OP(scan) == NALNUM
d6a28714 2919 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2920 sayNO;
76e3520e 2921 nextchr = UCHARAT(++locinput);
a0d0e21e 2922 break;
d6a28714
JH
2923 case BOUNDL:
2924 case NBOUNDL:
3280af22 2925 PL_reg_flags |= RF_tainted;
bbce6d69 2926 /* FALL THROUGH */
d6a28714
JH
2927 case BOUND:
2928 case NBOUND:
2929 /* was last char in word? */
ffc61ed2 2930 if (do_utf8) {
12d33761 2931 if (locinput == PL_bostr)
5d9a96ca 2932 st->ln = '\n';
ffc61ed2 2933 else {
a3b680e6 2934 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2935
4ad0818d 2936 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
2937 }
2938 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2939 st->ln = isALNUM_uni(st->ln);
1a4fad37 2940 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2941 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2942 }
2943 else {
5d9a96ca 2944 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
2945 n = isALNUM_LC_utf8((U8*)locinput);
2946 }
a0ed51b3 2947 }
d6a28714 2948 else {
5d9a96ca 2949 st->ln = (locinput != PL_bostr) ?
12d33761 2950 UCHARAT(locinput - 1) : '\n';
ffc61ed2 2951 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2952 st->ln = isALNUM(st->ln);
ffc61ed2
JH
2953 n = isALNUM(nextchr);
2954 }
2955 else {
5d9a96ca 2956 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
2957 n = isALNUM_LC(nextchr);
2958 }
d6a28714 2959 }
5d9a96ca 2960 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
2961 OP(scan) == BOUNDL))
2962 sayNO;
a0ed51b3 2963 break;
d6a28714 2964 case SPACEL:
3280af22 2965 PL_reg_flags |= RF_tainted;
bbce6d69 2966 /* FALL THROUGH */
d6a28714 2967 case SPACE:
9442cb0e 2968 if (!nextchr)
4633a7c4 2969 sayNO;
1aa99e6b 2970 if (do_utf8) {
fd400ab9 2971 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 2972 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 2973 if (!(OP(scan) == SPACE
bb7a0f54 2974 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2975 : isSPACE_LC_utf8((U8*)locinput)))
2976 {
2977 sayNO;
2978 }
2979 locinput += PL_utf8skip[nextchr];
2980 nextchr = UCHARAT(locinput);
2981 break;
d6a28714 2982 }
ffc61ed2
JH
2983 if (!(OP(scan) == SPACE
2984 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2985 sayNO;
2986 nextchr = UCHARAT(++locinput);
2987 }
2988 else {
2989 if (!(OP(scan) == SPACE
2990 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2991 sayNO;
2992 nextchr = UCHARAT(++locinput);
a0ed51b3 2993 }
a0ed51b3 2994 break;
d6a28714 2995 case NSPACEL:
3280af22 2996 PL_reg_flags |= RF_tainted;
bbce6d69 2997 /* FALL THROUGH */
d6a28714 2998 case NSPACE:
9442cb0e 2999 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3000 sayNO;
1aa99e6b 3001 if (do_utf8) {
1a4fad37 3002 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3003 if (OP(scan) == NSPACE
bb7a0f54 3004 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3005 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3006 {
3007 sayNO;
3008 }
3009 locinput += PL_utf8skip[nextchr];
3010 nextchr = UCHARAT(locinput);
3011 break;
a0ed51b3 3012 }
ffc61ed2 3013 if (OP(scan) == NSPACE
d6a28714 3014 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3015 sayNO;
76e3520e 3016 nextchr = UCHARAT(++locinput);
a0d0e21e 3017 break;
d6a28714 3018 case DIGITL:
a0ed51b3
LW
3019 PL_reg_flags |= RF_tainted;
3020 /* FALL THROUGH */
d6a28714 3021 case DIGIT:
9442cb0e 3022 if (!nextchr)
a0ed51b3 3023 sayNO;
1aa99e6b 3024 if (do_utf8) {
1a4fad37 3025 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3026 if (!(OP(scan) == DIGIT
bb7a0f54 3027 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3028 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3029 {
a0ed51b3 3030 sayNO;
dfe13c55 3031 }
6f06b55f 3032 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3033 nextchr = UCHARAT(locinput);
3034 break;
3035 }
ffc61ed2 3036 if (!(OP(scan) == DIGIT
9442cb0e 3037 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3038 sayNO;
3039 nextchr = UCHARAT(++locinput);
3040 break;
d6a28714 3041 case NDIGITL:
b8c5462f
JH
3042 PL_reg_flags |= RF_tainted;
3043 /* FALL THROUGH */
d6a28714 3044 case NDIGIT:
9442cb0e 3045 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3046 sayNO;
1aa99e6b 3047 if (do_utf8) {
1a4fad37 3048 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3049 if (OP(scan) == NDIGIT
bb7a0f54 3050 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3051 : isDIGIT_LC_utf8((U8*)locinput))
3052 {
a0ed51b3 3053 sayNO;
9442cb0e 3054 }
6f06b55f 3055 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3056 nextchr = UCHARAT(locinput);
3057 break;
3058 }
ffc61ed2 3059 if (OP(scan) == NDIGIT
9442cb0e 3060 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3061 sayNO;
3062 nextchr = UCHARAT(++locinput);
3063 break;
3064 case CLUMP:
b7c83a7e 3065 if (locinput >= PL_regeol)
a0ed51b3 3066 sayNO;
b7c83a7e 3067 if (do_utf8) {
1a4fad37 3068 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3069 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3070 sayNO;
3071 locinput += PL_utf8skip[nextchr];
3072 while (locinput < PL_regeol &&
3073 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3074 locinput += UTF8SKIP(locinput);
3075 if (locinput > PL_regeol)
3076 sayNO;
eb08e2da
JH
3077 }
3078 else
3079 locinput++;
a0ed51b3
LW
3080 nextchr = UCHARAT(locinput);
3081 break;
c8756f30 3082 case REFFL:
3280af22 3083 PL_reg_flags |= RF_tainted;
c8756f30 3084 /* FALL THROUGH */
c277df42 3085 case REF:
95b24440
DM
3086 case REFF: {
3087 char *s;
c277df42 3088 n = ARG(scan); /* which paren pair */
5d9a96ca 3089 st->ln = PL_regstartp[n];
2c2d71f5 3090 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3091 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3092 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3093 if (st->ln == PL_regendp[n])
a0d0e21e 3094 break;
a0ed51b3 3095
5d9a96ca 3096 s = PL_bostr + st->ln;
1aa99e6b 3097 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3098 char *l = locinput;
a3b680e6 3099 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3100 /*
3101 * Note that we can't do the "other character" lookup trick as
3102 * in the 8-bit case (no pun intended) because in Unicode we
3103 * have to map both upper and title case to lower case.
3104 */
3105 if (OP(scan) == REFF) {
3106 while (s < e) {
a3b680e6
AL
3107 STRLEN ulen1, ulen2;
3108 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3109 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3110
a0ed51b3
LW
3111 if (l >= PL_regeol)
3112 sayNO;
a2a2844f
JH
3113 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3114 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3115 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3116 sayNO;
a2a2844f
JH
3117 s += ulen1;
3118 l += ulen2;
a0ed51b3
LW
3119 }
3120 }
3121 locinput = l;
3122 nextchr = UCHARAT(locinput);
3123 break;
3124 }
3125
a0d0e21e 3126 /* Inline the first character, for speed. */
76e3520e 3127 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3128 (OP(scan) == REF ||
3129 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3130 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3131 sayNO;
5d9a96ca
DM
3132 st->ln = PL_regendp[n] - st->ln;
3133 if (locinput + st->ln > PL_regeol)
4633a7c4 3134 sayNO;
5d9a96ca
DM
3135 if (st->ln > 1 && (OP(scan) == REF
3136 ? memNE(s, locinput, st->ln)
c8756f30 3137 : (OP(scan) == REFF
5d9a96ca
DM
3138 ? ibcmp(s, locinput, st->ln)
3139 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3140 sayNO;
5d9a96ca 3141 locinput += st->ln;
76e3520e 3142 nextchr = UCHARAT(locinput);
a0d0e21e 3143 break;
95b24440 3144 }
a0d0e21e
LW
3145
3146 case NOTHING:
c277df42 3147 case TAIL:
a0d0e21e
LW
3148 break;
3149 case BACK:
3150 break;
40a82448
DM
3151
3152#undef ST
3153#define ST st->u.eval
3154
3155 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3156 {
c277df42 3157 SV *ret;
8e5e9ebe 3158 {
4aabdb9b
DM
3159 /* execute the code in the {...} */
3160 dSP;
6136c704 3161 SV ** const before = SP;
4aabdb9b
DM
3162 OP_4tree * const oop = PL_op;
3163 COP * const ocurcop = PL_curcop;
3164 PAD *old_comppad;
4aabdb9b
DM
3165
3166 n = ARG(scan);
32fc9b6a 3167 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3168 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3169 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3170 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3171
8e5e9ebe
RGS
3172 CALLRUNOPS(aTHX); /* Scalar context. */
3173 SPAGAIN;
3174 if (SP == before)
075aa684 3175 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3176 else {
3177 ret = POPs;
3178 PUTBACK;
3179 }
4aabdb9b
DM
3180
3181 PL_op = oop;
3182 PAD_RESTORE_LOCAL(old_comppad);
3183 PL_curcop = ocurcop;
3184 if (!st->logical) {
3185 /* /(?{...})/ */
3186 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3187 break;
3188 }
8e5e9ebe 3189 }
4aabdb9b
DM
3190 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3191 regexp *re;
4aabdb9b 3192 {
4f639d21
DM
3193 /* extract RE object from returned value; compiling if
3194 * necessary */
3195
6136c704 3196 MAGIC *mg = NULL;
be8e71aa 3197 const SV *sv;
faf82a0b
AE
3198 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3199 mg = mg_find(sv, PERL_MAGIC_qr);
3200 else if (SvSMAGICAL(ret)) {
3201 if (SvGMAGICAL(ret))
3202 sv_unmagic(ret, PERL_MAGIC_qr);
3203 else
3204 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3205 }
faf82a0b 3206
0f5d15d6
IZ
3207 if (mg) {
3208 re = (regexp *)mg->mg_obj;
df0003d4 3209 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3210 }
3211 else {
3212 STRLEN len;
6136c704 3213 const char * const t = SvPV_const(ret, len);
0f5d15d6 3214 PMOP pm;
a3b680e6 3215 const I32 osize = PL_regsize;
0f5d15d6 3216
5fcd1c1b 3217 Zero(&pm, 1, PMOP);
4aabdb9b 3218 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3219 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3220 if (!(SvFLAGS(ret)
faf82a0b
AE
3221 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3222 | SVs_GMG)))
14befaf4
DM
3223 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3224 PERL_MAGIC_qr,0,0);
0f5d15d6 3225 PL_regsize = osize;
0f5d15d6 3226 }
4aabdb9b 3227 }
aa283a38
DM
3228
3229 /* run the pattern returned from (??{...}) */
4aabdb9b 3230 DEBUG_EXECUTE_r(
ab3bbdeb
YO
3231 debug_start_match(re, do_utf8, locinput, PL_regeol,
3232 "Matching embedded");
4aabdb9b 3233 );
2c2d71f5 3234
40a82448
DM
3235 ST.cp = regcppush(0); /* Save *all* the positions. */
3236 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3237 *PL_reglastparen = 0;
3238 *PL_reglastcloseparen = 0;
4aabdb9b 3239 PL_reginput = locinput;
4aabdb9b
DM
3240
3241 /* XXXX This is too dramatic a measure... */
3242 PL_reg_maxiter = 0;
3243
5d9a96ca 3244 st->logical = 0;
40a82448 3245 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3246 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3247 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3248 ST.prev_rex = rex;
aa283a38 3249 rex = re;
aa283a38 3250
40a82448 3251 ST.B = next;
aa283a38 3252 /* now continue from first node in postoned RE */
40a82448 3253 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3254 /* NOTREACHED */
a0ed51b3 3255 }
4aabdb9b
DM
3256 /* /(?(?{...})X|Y)/ */
3257 st->sw = SvTRUE(ret);
3258 st->logical = 0;
c277df42
IZ
3259 break;
3260 }
40a82448
DM
3261
3262 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3263 if (ST.toggleutf)
3264 PL_reg_flags ^= RF_utf8;
3265 ReREFCNT_dec(rex);
3266 rex = ST.prev_rex;
3267 /* XXXX This is too dramatic a measure... */
3268 PL_reg_maxiter = 0;
3269 /* Restore parens of the caller without popping the
3270 * savestack */
3271 {
3272 const I32 tmp = PL_savestack_ix;
3273 PL_savestack_ix = ST.lastcp;
3274 regcppop(rex);
3275 PL_savestack_ix = tmp;
3276 }
3277 PL_reginput = locinput;
3278 /* continue at the node following the (??{...}) */
3279 scan = ST.B;
3280 continue;
3281
3282 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3283 /* Restore state to the outer re then re-throw the failure */
3284 if (ST.toggleutf)
3285 PL_reg_flags ^= RF_utf8;
3286 ReREFCNT_dec(rex);
3287 rex = ST.prev_rex;
3288
3289 /* XXXX This is too dramatic a measure... */
3290 PL_reg_maxiter = 0;
3291
3292 PL_reginput = locinput;
3293 REGCP_UNWIND(ST.lastcp);
3294 regcppop(rex);
3295 sayNO_SILENT;
3296
3297#undef ST
3298
a0d0e21e 3299 case OPEN:
c277df42 3300 n = ARG(scan); /* which paren pair */
3280af22
NIS
3301 PL_reg_start_tmp[n] = locinput;
3302 if (n > PL_regsize)
3303 PL_regsize = n;
a0d0e21e
LW
3304 break;
3305 case CLOSE:
c277df42 3306 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3307 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3308 PL_regendp[n] = locinput - PL_bostr;
eb160463 3309 if (n > (I32)*PL_reglastparen)
3280af22 3310 *PL_reglastparen = n;
a01268b5 3311 *PL_reglastcloseparen = n;
a0d0e21e 3312 break;
c277df42
IZ
3313 case GROUPP:
3314 n = ARG(scan); /* which paren pair */
5d9a96ca 3315 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3316 break;
3317 case IFTHEN:
2c2d71f5 3318 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3319 if (st->sw)
c277df42
IZ
3320 next = NEXTOPER(NEXTOPER(scan));
3321 else {
3322 next = scan + ARG(scan);
3323 if (OP(next) == IFTHEN) /* Fake one. */
3324 next = NEXTOPER(NEXTOPER(next));
3325 }
3326 break;
3327 case LOGICAL:
5d9a96ca 3328 st->logical = scan->flags;
c277df42 3329 break;
2ab05381 3330/*******************************************************************
a0374537
DM
3331 cc points to the regmatch_state associated with the most recent CURLYX.
3332 This struct contains info about the innermost (...)* loop (an
3333 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3334
3335 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3336
95b24440 3337 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3338
a0374537 3339 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3340 with the starting point at WHILEM node;
2ab05381
IZ
3341
3342 3) Each hit of WHILEM node tries to match A and Z (in the order
3343 depending on the current iteration, min/max of {min,max} and
3344 greediness). The information about where are nodes for "A"
a0374537 3345 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3346 was already matched, and greediness.
3347
3348 4) After A matches, the same WHILEM node is hit again.
3349
95b24440 3350 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3351 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3352 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3353 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3354 of the external loop.
3355
a0374537 3356 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3357 and whatever it mentions via ->next, and additional attached trees
3358 corresponding to temporarily unset infoblocks as in "5" above.
3359
95b24440 3360 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3361 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3362 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3363 infoblocks are drawn below the "reset" infoblock.
3364
3365 In fact in the picture below we do not show failed matches for Z and T
3366 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3367 more obvious *why* one needs to *temporary* unset infoblocks.]
3368
3369 Matched REx position InfoBlocks Comment
3370 (Y(A)*?Z)*?T x
3371 Y(A)*?Z)*?T x <- O
3372 Y (A)*?Z)*?T x <- O
3373 Y A)*?Z)*?T x <- O <- I
3374 YA )*?Z)*?T x <- O <- I
3375 YA A)*?Z)*?T x <- O <- I
3376 YAA )*?Z)*?T x <- O <- I
3377 YAA Z)*?T x <- O # Temporary unset I
3378 I
3379
3380 YAAZ Y(A)*?Z)*?T x <- O
3381 I
3382
3383 YAAZY (A)*?Z)*?T x <- O
3384 I
3385
3386 YAAZY A)*?Z)*?T x <- O <- I
3387 I
3388
3389 YAAZYA )*?Z)*?T x <- O <- I
3390 I
3391
3392 YAAZYA Z)*?T x <- O # Temporary unset I
3393 I,I
3394
3395 YAAZYAZ )*?T x <- O
3396 I,I
3397
3398 YAAZYAZ T x # Temporary unset O
3399 O
3400 I,I
3401
3402 YAAZYAZT x
3403 O
3404 I,I
3405 *******************************************************************/
95b24440 3406
a0d0e21e 3407 case CURLYX: {
cb434fcc
IZ
3408 /* No need to save/restore up to this paren */
3409 I32 parenfloor = scan->flags;
c277df42 3410
c2b7afd3
NC
3411 /* Dave says:
3412
3413 CURLYX and WHILEM are always paired: they're the moral
3414 equivalent of pp_enteriter anbd pp_iter.
3415
3416 The only time next could be null is if the node tree is
3417 corrupt. This was mentioned on p5p a few days ago.
3418
3419 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3420 So we'll assert that this is true:
3421 */
3422 assert(next);
30b2893d 3423 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3424 next += ARG(next);
cb434fcc
IZ
3425 /* XXXX Probably it is better to teach regpush to support
3426 parenfloor > PL_regsize... */
eb160463 3427 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3428 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3429
d8319b27
DM
3430 st->u.curlyx.cp = PL_savestack_ix;
3431 st->u.curlyx.outercc = st->cc;
a0374537
DM
3432 st->cc = st;
3433 /* these fields contain the state of the current curly.
3434 * they are accessed by subsequent WHILEMs;
3435 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3436 st->u.curlyx.parenfloor = parenfloor;
3437 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3438 st->u.curlyx.min = ARG1(scan);
3439 st->u.curlyx.max = ARG2(scan);
3440 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3441 st->u.curlyx.lastloc = 0;
a0374537
DM
3442 /* st->next and st->minmod are also read by WHILEM */
3443
3280af22 3444 PL_reginput = locinput;
95b24440
DM
3445 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3446 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3447 regcpblow(st->u.curlyx.cp);
3448 st->cc = st->u.curlyx.outercc;
95b24440 3449 saySAME(result);
a0d0e21e 3450 }
5f66b61c 3451 /* NOTREACHED */
a0d0e21e
LW
3452 case WHILEM: {
3453 /*
3454 * This is really hard to understand, because after we match
3455 * what we're trying to match, we must make sure the rest of
2c2d71f5 3456 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3457 * to go back UP the parse tree by recursing ever deeper. And
3458 * if it fails, we have to reset our parent's current state
3459 * that we can try again after backing off.
3460 */
3461
c2b7afd3
NC
3462 /* Dave says:
3463
3464 st->cc gets initialised by CURLYX ready for use by WHILEM.
3465 So again, unless somethings been corrupted, st->cc cannot
3466 be null at that point in WHILEM.
3467
3468 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3469 So we'll assert that this is true:
3470 */
3471 assert(st->cc);
d8319b27
DM
3472 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3473 st->u.whilem.cache_offset = 0;
3474 st->u.whilem.cache_bit = 0;
c277df42 3475
d8319b27 3476 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3477 PL_reginput = locinput;
a0d0e21e 3478
a3621e74 3479 DEBUG_EXECUTE_r(
9041c2e3 3480 PerlIO_printf(Perl_debug_log,
91f3b821 3481 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3482 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3483 (long)n, (long)st->cc->u.curlyx.min,
3484 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3485 );
4633a7c4 3486
a0d0e21e
LW
3487 /* If degenerate scan matches "", assume scan done. */
3488
d8319b27
DM
3489 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3490 st->u.whilem.savecc = st->cc;
3491 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3492 if (st->cc)
d8319b27 3493 st->ln = st->cc->u.curlyx.cur;
a3621e74 3494 DEBUG_EXECUTE_r(
c3464db5
DD
3495 PerlIO_printf(Perl_debug_log,
3496 "%*s empty match detected, try continuation...\n",
3280af22 3497 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3498 );
d8319b27 3499 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3500 /*** all unsaved local vars undefined at this point */
d8319b27 3501 st->cc = st->u.whilem.savecc;
95b24440 3502 if (result)
4633a7c4 3503 sayYES;
d8319b27
DM
3504 if (st->cc->u.curlyx.outercc)
3505 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3506 sayNO;
a0d0e21e
LW
3507 }
3508
3509 /* First just match a string of min scans. */
3510
d8319b27
DM
3511 if (n < st->cc->u.curlyx.min) {
3512 st->cc->u.curlyx.cur = n;
3513 st->cc->u.curlyx.lastloc = locinput;
3514 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3515 /*** all unsaved local vars undefined at this point */
3516 if (result)
4633a7c4 3517 sayYES;
d8319b27
DM
3518 st->cc->u.curlyx.cur = n - 1;
3519 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3520 sayNO;
a0d0e21e
LW
3521 }
3522
2c2d71f5
JH
3523 if (scan->flags) {
3524 /* Check whether we already were at this position.
3525 Postpone detection until we know the match is not
3526 *that* much linear. */
3527 if (!PL_reg_maxiter) {
3528 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3529 /* possible overflow for long strings and many CURLYX's */
3530 if (PL_reg_maxiter < 0)
3531 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3532 PL_reg_leftiter = PL_reg_maxiter;
3533 }
3534 if (PL_reg_leftiter-- == 0) {
3298f257 3535 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3536 if (PL_reg_poscache) {
eb160463 3537 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3538 Renew(PL_reg_poscache, size, char);
3539 PL_reg_poscache_size = size;
3540 }
3541 Zero(PL_reg_poscache, size, char);
3542 }
3543 else {
3544 PL_reg_poscache_size = size;
a02a5408 3545 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3546 }
a3621e74 3547 DEBUG_EXECUTE_r(
2c2d71f5
JH
3548 PerlIO_printf(Perl_debug_log,
3549 "%sDetected a super-linear match, switching on caching%s...\n",
3550 PL_colors[4], PL_colors[5])
3551 );
3552 }
3553 if (PL_reg_leftiter < 0) {
d8319b27 3554 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3555
3298f257 3556 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
d8319b27
DM
3557 + st->u.whilem.cache_offset * (scan->flags>>4);
3558 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3559 st->u.whilem.cache_offset /= 8;
3560 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3561 DEBUG_EXECUTE_r(
2c2d71f5
JH
3562 PerlIO_printf(Perl_debug_log,
3563 "%*s already tried at this position...\n",
3564 REPORT_CODE_OFF+PL_regindent*2, "")
3565 );
3298f257 3566 sayNO; /* cache records failure */
2c2d71f5 3567 }
2c2d71f5
JH
3568 }
3569 }
3570
a0d0e21e
LW
3571 /* Prefer next over scan for minimal matching. */
3572
5d9a96ca 3573 if (st->cc->minmod) {
d8319b27
DM
3574 st->u.whilem.savecc = st->cc;
3575 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3576 if (st->cc)
d8319b27
DM
3577 st->ln = st->cc->u.curlyx.cur;
3578 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3579 REGCP_SET(st->u.whilem.lastcp);
3580 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3581 /*** all unsaved local vars undefined at this point */
d8319b27 3582 st->cc = st->u.whilem.savecc;
95b24440 3583 if (result) {
d8319b27 3584 regcpblow(st->u.whilem.cp);
3298f257 3585 sayYES; /* All done. */
5f05dabc 3586 }
d8319b27 3587 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3588 regcppop(rex);
d8319b27
DM
3589 if (st->cc->u.curlyx.outercc)
3590 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3591
d8319b27 3592 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3593 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3594 && !(PL_reg_flags & RF_warned)) {
3595 PL_reg_flags |= RF_warned;
9014280d 3596 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3597 "Complex regular subexpression recursion",
3598 REG_INFTY - 1);
c277df42 3599 }
3ab3c9b4 3600 CACHEsayNO;
c277df42 3601 }
a687059c 3602
a3621e74 3603 DEBUG_EXECUTE_r(
c3464db5
DD
3604 PerlIO_printf(Perl_debug_log,
3605 "%*s trying longer...\n",
3280af22 3606 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3607 );
a0d0e21e 3608 /* Try scanning more and see if it helps. */
3280af22 3609 PL_reginput = locinput;
d8319b27
DM
3610 st->cc->u.curlyx.cur = n;
3611 st->cc->u.curlyx.lastloc = locinput;
3612 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3613 REGCP_SET(st->u.whilem.lastcp);
3614 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3615 /*** all unsaved local vars undefined at this point */
3616 if (result) {
d8319b27 3617 regcpblow(st->u.whilem.cp);
3298f257 3618 sayYES;
5f05dabc 3619 }
d8319b27 3620 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3621 regcppop(rex);
d8319b27
DM
3622 st->cc->u.curlyx.cur = n - 1;
3623 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3624 CACHEsayNO;
a0d0e21e
LW
3625 }
3626
3627 /* Prefer scan over next for maximal matching. */
3628
d8319b27
DM
3629 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3630 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3631 st->cc->u.curlyx.cur = n;
3632 st->cc->u.curlyx.lastloc = locinput;
3633 REGCP_SET(st->u.whilem.lastcp);
3634 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3635 /*** all unsaved local vars undefined at this point */
3636 if (result) {
d8319b27 3637 regcpblow(st->u.whilem.cp);
3298f257 3638 sayYES;
5f05dabc 3639 }
d8319b27 3640 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3641 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3642 PL_reginput = locinput;
a3621e74 3643 DEBUG_EXECUTE_r(
c3464db5
DD
3644 PerlIO_printf(Perl_debug_log,
3645 "%*s failed, try continuation...\n",
3280af22 3646 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3647 );
3648 }
9041c2e3 3649 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3650 && !(PL_reg_flags & RF_warned)) {
3280af22 3651 PL_reg_flags |= RF_warned;
9014280d 3652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3653 "Complex regular subexpression recursion",
3654 REG_INFTY - 1);
a0d0e21e
LW
3655 }
3656
3657 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
3658 st->u.whilem.savecc = st->cc;
3659 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3660 if (st->cc)
d8319b27
DM
3661 st->ln = st->cc->u.curlyx.cur;
3662 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3663 /*** all unsaved local vars undefined at this point */
d8319b27 3664 st->cc = st->u.whilem.savecc;
95b24440 3665 if (result)
3298f257 3666 sayYES;
d8319b27
DM
3667 if (st->cc->u.curlyx.outercc)
3668 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3669 st->cc->u.curlyx.cur = n - 1;
3670 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3671 CACHEsayNO;
a0d0e21e 3672 }
5f66b61c 3673 /* NOTREACHED */
40a82448
DM
3674
3675#undef ST
3676#define ST st->u.branch
3677
3678 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3679 next = scan + ARG(scan);
3680 if (next == scan)
3681 next = NULL;
40a82448
DM
3682 scan = NEXTOPER(scan);
3683 /* FALL THROUGH */
c277df42 3684
40a82448
DM
3685 case BRANCH: /* /(...|A|...)/ */
3686 scan = NEXTOPER(scan); /* scan now points to inner node */
3687 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3688 /* last branch; skip state push and jump direct to node */
3689 continue;
3690 ST.lastparen = *PL_reglastparen;
3691 ST.next_branch = next;
3692 REGCP_SET(ST.cp);
3693 PL_reginput = locinput;
02db2b7b 3694
40a82448
DM
3695 /* Now go into the branch */
3696 PUSH_STATE_GOTO(BRANCH_next, scan);
3697 /* NOTREACHED */
3698
3699 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3700 REGCP_UNWIND(ST.cp);
3701 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3702 PL_regendp[n] = -1;
3703 *PL_reglastparen = n;
3704 scan = ST.next_branch;
3705 /* no more branches? */
3706 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3707 sayNO;
3708 continue; /* execute next BRANCH[J] op */
3709 /* NOTREACHED */
3710
a0d0e21e 3711 case MINMOD:
5d9a96ca 3712 st->minmod = 1;
a0d0e21e 3713 break;
40a82448
DM
3714
3715#undef ST
3716#define ST st->u.curlym
3717
3718 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3719
3720 /* This is an optimisation of CURLYX that enables us to push
3721 * only a single backtracking state, no matter now many matches
3722 * there are in {m,n}. It relies on the pattern being constant
3723 * length, with no parens to influence future backrefs
3724 */
3725
3726 ST.me = scan;
dc45a647 3727 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
3728
3729 /* if paren positive, emulate an OPEN/CLOSE around A */
3730 if (ST.me->flags) {
3731 I32 paren = ST.me->flags;
3732 if (paren > PL_regsize)
3733 PL_regsize = paren;
3734 if (paren > (I32)*PL_reglastparen)
3735 *PL_reglastparen = paren;
c277df42 3736 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 3737 }
40a82448
DM
3738 ST.A = scan;
3739 ST.B = next;
3740 ST.alen = 0;
3741 ST.count = 0;
3742 ST.minmod = st->minmod;
3743 st->minmod = 0;
3744 ST.c1 = CHRTEST_UNINIT;
3745 REGCP_SET(ST.cp);
6407bf3b 3746
40a82448
DM
3747 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3748 goto curlym_do_B;
3749
3750 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 3751 PL_reginput = locinput;
40a82448
DM
3752 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3753 /* NOTREACHED */
5f80c4cf 3754
40a82448
DM
3755 case CURLYM_A: /* we've just matched an A */
3756 locinput = st->locinput;
3757 nextchr = UCHARAT(locinput);
3758
3759 ST.count++;
3760 /* after first match, determine A's length: u.curlym.alen */
3761 if (ST.count == 1) {
3762 if (PL_reg_match_utf8) {
3763 char *s = locinput;
3764 while (s < PL_reginput) {
3765 ST.alen++;
3766 s += UTF8SKIP(s);
3767 }
3768 }
3769 else {
3770 ST.alen = PL_reginput - locinput;
3771 }
3772 if (ST.alen == 0)
3773 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3774 }
0cadcf80
DM
3775 DEBUG_EXECUTE_r(
3776 PerlIO_printf(Perl_debug_log,
40a82448 3777 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
ab3bbdeb 3778 (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
40a82448 3779 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
3780 );
3781
40a82448
DM
3782 locinput = PL_reginput;
3783 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3784 goto curlym_do_A; /* try to match another A */
3785 goto curlym_do_B; /* try to match B */
5f80c4cf 3786
40a82448
DM
3787 case CURLYM_A_fail: /* just failed to match an A */
3788 REGCP_UNWIND(ST.cp);
3789 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3790 sayNO;
0cadcf80 3791
40a82448
DM
3792 curlym_do_B: /* execute the B in /A{m,n}B/ */
3793 PL_reginput = locinput;
3794 if (ST.c1 == CHRTEST_UNINIT) {
3795 /* calculate c1 and c2 for possible match of 1st char
3796 * following curly */
3797 ST.c1 = ST.c2 = CHRTEST_VOID;
3798 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3799 regnode *text_node = ST.B;
3800 if (! HAS_TEXT(text_node))
3801 FIND_NEXT_IMPT(text_node);
3802 if (HAS_TEXT(text_node)
3803 && PL_regkind[OP(text_node)] != REF)
3804 {
3805 ST.c1 = (U8)*STRING(text_node);
3806 ST.c2 =
3807 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3808 ? PL_fold[ST.c1]
3809 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3810 ? PL_fold_locale[ST.c1]
3811 : ST.c1;
c277df42 3812 }
c277df42 3813 }
40a82448
DM
3814 }
3815
3816 DEBUG_EXECUTE_r(
3817 PerlIO_printf(Perl_debug_log,
3818 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
ab3bbdeb 3819 (int)(REPORT_CODE_OFF+(PL_regindent*2)),
40a82448
DM
3820 "", (IV)ST.count)
3821 );
3822 if (ST.c1 != CHRTEST_VOID
3823 && UCHARAT(PL_reginput) != ST.c1
3824 && UCHARAT(PL_reginput) != ST.c2)
3825 {
3826 /* simulate B failing */
3827 state_num = CURLYM_B_fail;
3828 goto reenter_switch;
3829 }
3830
3831 if (ST.me->flags) {
3832 /* mark current A as captured */
3833 I32 paren = ST.me->flags;
3834 if (ST.count) {
3835 PL_regstartp[paren]
3836 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3837 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3838 }
40a82448
DM
3839 else
3840 PL_regendp[paren] = -1;
c277df42 3841 }
40a82448 3842 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 3843 /* NOTREACHED */
40a82448
DM
3844
3845 case CURLYM_B_fail: /* just failed to match a B */
3846 REGCP_UNWIND(ST.cp);
3847 if (ST.minmod) {
3848 if (ST.count == ARG2(ST.me) /* max */)
3849 sayNO;
3850 goto curlym_do_A; /* try to match a further A */
3851 }
3852 /* backtrack one A */
3853 if (ST.count == ARG1(ST.me) /* min */)
3854 sayNO;
3855 ST.count--;
3856 locinput = HOPc(locinput, -ST.alen);
3857 goto curlym_do_B; /* try to match B */
3858
c255a977
DM
3859#undef ST
3860#define ST st->u.curly
40a82448 3861
c255a977
DM
3862#define CURLY_SETPAREN(paren, success) \
3863 if (paren) { \
3864 if (success) { \
3865 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3866 PL_regendp[paren] = locinput - PL_bostr; \
3867 } \
3868 else \
3869 PL_regendp[paren] = -1; \
3870 }
3871
3872 case STAR: /* /A*B/ where A is width 1 */
3873 ST.paren = 0;
3874 ST.min = 0;
3875 ST.max = REG_INFTY;
a0d0e21e
LW
3876 scan = NEXTOPER(scan);
3877 goto repeat;
c255a977
DM
3878 case PLUS: /* /A+B/ where A is width 1 */
3879 ST.paren = 0;
3880 ST.min = 1;
3881 ST.max = REG_INFTY;
c277df42 3882 scan = NEXTOPER(scan);
c255a977
DM
3883 goto repeat;
3884 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3885 ST.paren = scan->flags; /* Which paren to set */
3886 if (ST.paren > PL_regsize)
3887 PL_regsize = ST.paren;
3888 if (ST.paren > (I32)*PL_reglastparen)
3889 *PL_reglastparen = ST.paren;
3890 ST.min = ARG1(scan); /* min to match */
3891 ST.max = ARG2(scan); /* max to match */
3892 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3893 goto repeat;
3894 case CURLY: /* /A{m,n}B/ where A is width 1 */
3895 ST.paren = 0;
3896 ST.min = ARG1(scan); /* min to match */
3897 ST.max = ARG2(scan); /* max to match */
3898 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 3899 repeat:
a0d0e21e
LW
3900 /*
3901 * Lookahead to avoid useless match attempts
3902 * when we know what character comes next.
c255a977 3903 *
5f80c4cf
JP
3904 * Used to only do .*x and .*?x, but now it allows
3905 * for )'s, ('s and (?{ ... })'s to be in the way
3906 * of the quantifier and the EXACT-like node. -- japhy
3907 */
3908
c255a977
DM
3909 if (ST.min > ST.max) /* XXX make this a compile-time check? */
3910 sayNO;
cca55fe3 3911 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3912 U8 *s;
3913 regnode *text_node = next;
3914
3dab1dad
YO
3915 if (! HAS_TEXT(text_node))
3916 FIND_NEXT_IMPT(text_node);
5f80c4cf 3917
9e137952 3918 if (! HAS_TEXT(text_node))
c255a977 3919 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 3920 else {
3dab1dad 3921 if (PL_regkind[OP(text_node)] == REF) {
c255a977 3922 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 3923 goto assume_ok_easy;
cca55fe3 3924 }
be8e71aa
YO
3925 else
3926 s = (U8*)STRING(text_node);
5f80c4cf
JP
3927
3928 if (!UTF) {
c255a977 3929 ST.c2 = ST.c1 = *s;
f65d3ee7 3930 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 3931 ST.c2 = PL_fold[ST.c1];
f65d3ee7 3932 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 3933 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 3934 }
5f80c4cf 3935 else { /* UTF */
f65d3ee7 3936 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3937 STRLEN ulen1, ulen2;
89ebb4a3
JH
3938 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3939 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3940
3941 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3942 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
3943#ifdef EBCDIC
3944 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
3945 ckWARN(WARN_UTF8) ?
3946 0 : UTF8_ALLOW_ANY);
3947 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
3948 ckWARN(WARN_UTF8) ?
3949 0 : UTF8_ALLOW_ANY);
3950#else
c255a977 3951 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 3952 uniflags);
c255a977 3953 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
3954 uniflags);
3955#endif
5f80c4cf
JP
3956 }
3957 else {
c255a977 3958 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 3959 uniflags);
5f80c4cf 3960 }
1aa99e6b
IH
3961 }
3962 }
bbce6d69 3963 }
a0d0e21e 3964 else
c255a977 3965 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 3966 assume_ok_easy:
c255a977
DM
3967
3968 ST.A = scan;
3969 ST.B = next;
3280af22 3970 PL_reginput = locinput;
5d9a96ca
DM
3971 if (st->minmod) {
3972 st->minmod = 0;
c255a977 3973 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 3974 sayNO;
c255a977 3975 ST.count = ST.min;
a0ed51b3 3976 locinput = PL_reginput;
c255a977
DM
3977 REGCP_SET(ST.cp);
3978 if (ST.c1 == CHRTEST_VOID)
3979 goto curly_try_B_min;
3980
3981 ST.oldloc = locinput;
3982
3983 /* set ST.maxpos to the furthest point along the
3984 * string that could possibly match */
3985 if (ST.max == REG_INFTY) {
3986 ST.maxpos = PL_regeol - 1;
3987 if (do_utf8)
3988 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
3989 ST.maxpos--;
3990 }
3991 else if (do_utf8) {
3992 int m = ST.max - ST.min;
3993 for (ST.maxpos = locinput;
3994 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
3995 ST.maxpos += UTF8SKIP(ST.maxpos);
3996 }
3997 else {
3998 ST.maxpos = locinput + ST.max - ST.min;
3999 if (ST.maxpos >= PL_regeol)
4000 ST.maxpos = PL_regeol - 1;
4001 }
4002 goto curly_try_B_min_known;
4003
4004 }
4005 else {
4006 ST.count = regrepeat(rex, ST.A, ST.max);
4007 locinput = PL_reginput;
4008 if (ST.count < ST.min)
4009 sayNO;
4010 if ((ST.count > ST.min)
4011 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4012 {
4013 /* A{m,n} must come at the end of the string, there's
4014 * no point in backing off ... */
4015 ST.min = ST.count;
4016 /* ...except that $ and \Z can match before *and* after
4017 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4018 We may back off by one in this case. */
4019 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4020 ST.min--;
4021 }
4022 REGCP_SET(ST.cp);
4023 goto curly_try_B_max;
4024 }
4025 /* NOTREACHED */
4026
4027
4028 case CURLY_B_min_known_fail:
4029 /* failed to find B in a non-greedy match where c1,c2 valid */
4030 if (ST.paren && ST.count)
4031 PL_regendp[ST.paren] = -1;
4032
4033 PL_reginput = locinput; /* Could be reset... */
4034 REGCP_UNWIND(ST.cp);
4035 /* Couldn't or didn't -- move forward. */
4036 ST.oldloc = locinput;
4037 if (do_utf8)
4038 locinput += UTF8SKIP(locinput);
4039 else
4040 locinput++;
4041 ST.count++;
4042 curly_try_B_min_known:
4043 /* find the next place where 'B' could work, then call B */
4044 {
4045 int n;
4046 if (do_utf8) {
4047 n = (ST.oldloc == locinput) ? 0 : 1;
4048 if (ST.c1 == ST.c2) {
4049 STRLEN len;
4050 /* set n to utf8_distance(oldloc, locinput) */
4051 while (locinput <= ST.maxpos &&
4052 utf8n_to_uvchr((U8*)locinput,
4053 UTF8_MAXBYTES, &len,
4054 uniflags) != (UV)ST.c1) {
4055 locinput += len;
4056 n++;
4057 }
1aa99e6b
IH
4058 }
4059 else {
c255a977
DM
4060 /* set n to utf8_distance(oldloc, locinput) */
4061 while (locinput <= ST.maxpos) {
4062 STRLEN len;
4063 const UV c = utf8n_to_uvchr((U8*)locinput,
4064 UTF8_MAXBYTES, &len,
4065 uniflags);
4066 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4067 break;
4068 locinput += len;
4069 n++;
1aa99e6b 4070 }
0fe9bf95
IZ
4071 }
4072 }
c255a977
DM
4073 else {
4074 if (ST.c1 == ST.c2) {
4075 while (locinput <= ST.maxpos &&
4076 UCHARAT(locinput) != ST.c1)
4077 locinput++;
bbce6d69 4078 }
c255a977
DM
4079 else {
4080 while (locinput <= ST.maxpos
4081 && UCHARAT(locinput) != ST.c1
4082 && UCHARAT(locinput) != ST.c2)
4083 locinput++;
a0ed51b3 4084 }
c255a977
DM
4085 n = locinput - ST.oldloc;
4086 }
4087 if (locinput > ST.maxpos)
4088 sayNO;
4089 /* PL_reginput == oldloc now */
4090 if (n) {
4091 ST.count += n;
4092 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4093 sayNO;
a0d0e21e 4094 }
c255a977
DM
4095 PL_reginput = locinput;
4096 CURLY_SETPAREN(ST.paren, ST.count);
4097 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4098 }
c255a977
DM
4099 /* NOTREACHED */
4100
4101
4102 case CURLY_B_min_fail:
4103 /* failed to find B in a non-greedy match where c1,c2 invalid */
4104 if (ST.paren && ST.count)
4105 PL_regendp[ST.paren] = -1;
4106
4107 REGCP_UNWIND(ST.cp);
4108 /* failed -- move forward one */
4109 PL_reginput = locinput;
4110 if (regrepeat(rex, ST.A, 1)) {
4111 ST.count++;
a0ed51b3 4112 locinput = PL_reginput;
c255a977
DM
4113 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4114 ST.count > 0)) /* count overflow ? */
15272685 4115 {
c255a977
DM
4116 curly_try_B_min:
4117 CURLY_SETPAREN(ST.paren, ST.count);
4118 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4119 }
4120 }
4633a7c4 4121 sayNO;
c255a977
DM
4122 /* NOTREACHED */
4123
4124
4125 curly_try_B_max:
4126 /* a successful greedy match: now try to match B */
4127 {
4128 UV c = 0;
4129 if (ST.c1 != CHRTEST_VOID)
4130 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4131 UTF8_MAXBYTES, 0, uniflags)
466787eb 4132 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4133 /* If it could work, try it. */
4134 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4135 CURLY_SETPAREN(ST.paren, ST.count);
4136 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4137 /* NOTREACHED */
4138 }
4139 }
4140 /* FALL THROUGH */
4141 case CURLY_B_max_fail:
4142 /* failed to find B in a greedy match */
4143 if (ST.paren && ST.count)
4144 PL_regendp[ST.paren] = -1;
4145
4146 REGCP_UNWIND(ST.cp);
4147 /* back up. */
4148 if (--ST.count < ST.min)
4149 sayNO;
4150 PL_reginput = locinput = HOPc(locinput, -1);
4151 goto curly_try_B_max;
4152
4153#undef ST
4154
4155
a0d0e21e 4156 case END:
3b0527fe 4157 if (locinput < reginfo->till) {
a3621e74 4158 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4159 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4160 PL_colors[4],
4161 (long)(locinput - PL_reg_starttry),
3b0527fe 4162 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4163 PL_colors[5]));
4164 sayNO_FINAL; /* Cannot match: too short. */
4165 }
4166 PL_reginput = locinput; /* put where regtry can find it */
4167 sayYES_FINAL; /* Success! */
dad79028
DM
4168
4169 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4170 DEBUG_EXECUTE_r(
4171 PerlIO_printf(Perl_debug_log,
4172 "%*s %ssubpattern success...%s\n",
4173 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4174 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4175 sayYES_FINAL; /* Success! */
4176
40a82448
DM
4177#undef ST
4178#define ST st->u.ifmatch
4179
4180 case SUSPEND: /* (?>A) */
4181 ST.wanted = 1;
9fe1d20c 4182 PL_reginput = locinput;
9041c2e3 4183 goto do_ifmatch;
dad79028 4184
40a82448
DM
4185 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4186 ST.wanted = 0;
dad79028
DM
4187 goto ifmatch_trivial_fail_test;
4188
40a82448
DM
4189 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4190 ST.wanted = 1;
dad79028 4191 ifmatch_trivial_fail_test:
a0ed51b3 4192 if (scan->flags) {
52657f30 4193 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4194 if (!s) {
4195 /* trivial fail */
4196 if (st->logical) {
4197 st->logical = 0;
40a82448 4198 st->sw = 1 - (bool)ST.wanted;
dad79028 4199 }
40a82448 4200 else if (ST.wanted)
dad79028
DM
4201 sayNO;
4202 next = scan + ARG(scan);
4203 if (next == scan)
4204 next = NULL;
4205 break;
4206 }
efb30f32 4207 PL_reginput = s;
a0ed51b3
LW
4208 }
4209 else
4210 PL_reginput = locinput;
4211
c277df42 4212 do_ifmatch:
40a82448
DM
4213 ST.me = scan;
4214 /* execute body of (?...A) */
4215 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4216 /* NOTREACHED */
4217
4218 case IFMATCH_A_fail: /* body of (?...A) failed */
4219 ST.wanted = !ST.wanted;
4220 /* FALL THROUGH */
4221
4222 case IFMATCH_A: /* body of (?...A) succeeded */
4223 if (st->logical) {
4224 st->logical = 0;
4225 st->sw = (bool)ST.wanted;
4226 }
4227 else if (!ST.wanted)
4228 sayNO;
4229
4230 if (OP(ST.me) == SUSPEND)
4231 locinput = PL_reginput;
4232 else {
4233 locinput = PL_reginput = st->locinput;
4234 nextchr = UCHARAT(locinput);
4235 }
4236 scan = ST.me + ARG(ST.me);
4237 if (scan == ST.me)
4238 scan = NULL;
4239 continue; /* execute B */
4240
4241#undef ST
dad79028 4242
c277df42 4243 case LONGJMP:
c277df42
IZ
4244 next = scan + ARG(scan);
4245 if (next == scan)
4246 next = NULL;
a0d0e21e
LW
4247 break;
4248 default:
b900a521 4249 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4250 PTR2UV(scan), OP(scan));
cea2e8a9 4251 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4252 }
95b24440 4253
a0d0e21e 4254 scan = next;
95b24440
DM
4255 continue;
4256 /* NOTREACHED */
4257
40a82448
DM
4258 push_yes_state:
4259 /* push a state that backtracks on success */
4260 st->u.yes.prev_yes_state = yes_state;
4261 yes_state = st;
4262 /* FALL THROUGH */
4263 push_state:
4264 /* push a new regex state, then continue at scan */
4265 {
4266 regmatch_state *newst;
4267
4268 depth++;
ab3bbdeb 4269 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log,
40a82448
DM
4270 "PUSH STATE(%d)\n", depth));
4271 st->locinput = locinput;
4272 newst = st+1;
4273 if (newst > SLAB_LAST(PL_regmatch_slab))
4274 newst = S_push_slab(aTHX);
4275 PL_regmatch_state = newst;
4276 newst->cc = st->cc;
4277 /* XXX probably don't need to initialise these */
4278 newst->minmod = 0;
4279 newst->sw = 0;
4280 newst->logical = 0;
4281
4282 locinput = PL_reginput;
4283 nextchr = UCHARAT(locinput);
4284 st = newst;
4285 continue;
4286 /* NOTREACHED */
4287 }
4288
95b24440
DM
4289 /* simulate recursively calling regmatch(), but without actually
4290 * recursing - ie save the current state on the heap rather than on
4291 * the stack, then re-enter the loop. This avoids complex regexes
4292 * blowing the processor stack */
4293
4294 start_recurse:
4295 {
5d9a96ca
DM
4296 /* push new state */
4297 regmatch_state *oldst = st;
4298
4299 depth++;
40a82448 4300 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4301
4302 /* grab the next free state slot */
4303 st++;
86545054 4304 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4305 st = S_push_slab(aTHX);
4306 PL_regmatch_state = st;
4307
4308 oldst->next = next;
4309 oldst->n = n;
4310 oldst->locinput = locinput;
5d9a96ca
DM
4311
4312 st->cc = oldst->cc;
95b24440
DM
4313 locinput = PL_reginput;
4314 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4315 st->minmod = 0;
4316 st->sw = 0;
4317 st->logical = 0;
95b24440
DM
4318#ifdef DEBUGGING
4319 PL_regindent++;
4320#endif
4321 }
a0d0e21e 4322 }
a687059c 4323
aa283a38
DM
4324
4325
a0d0e21e
LW
4326 /*
4327 * We get here only if there's trouble -- normally "case END" is
4328 * the terminating point.
4329 */
cea2e8a9 4330 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4331 /*NOTREACHED*/
4633a7c4
LW
4332 sayNO;
4333
7821416a 4334yes_final:
77cb431f
DM
4335
4336 if (yes_state) {
4337 /* we have successfully completed a subexpression, but we must now
4338 * pop to the state marked by yes_state and continue from there */
4339
4340 assert(st != yes_state);
4341 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4342 || yes_state > SLAB_LAST(PL_regmatch_slab))
4343 {
4344 /* not in this slab, pop slab */
4345 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4346 PL_regmatch_slab = PL_regmatch_slab->prev;
4347 st = SLAB_LAST(PL_regmatch_slab);
4348 }
4349 depth -= (st - yes_state);
e4f74956
YO
4350 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%"UVuf"..%"UVuf")\n",
4351 (UV)(depth+1), (UV)(depth+(st - yes_state))));
77cb431f
DM
4352 st = yes_state;
4353 yes_state = st->u.yes.prev_yes_state;
4354 PL_regmatch_state = st;
4355
4356 switch (st->resume_state) {
40a82448
DM
4357 case IFMATCH_A:
4358 case CURLYM_A:
40a82448
DM
4359 case EVAL_A:
4360 state_num = st->resume_state;
4361 goto reenter_switch;
4362
4363 case CURLYM_B:
166ba7cd
DM
4364 case BRANCH_next:
4365 case TRIE_next:
c255a977 4366 case CURLY_B_max:
77cb431f 4367 default:
40a82448 4368 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4369 }
4370 }
4371
a3621e74 4372 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4373 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4374yes:
4375#ifdef DEBUGGING
3280af22 4376 PL_regindent--;
4633a7c4 4377#endif
02db2b7b 4378
95b24440 4379 result = 1;
aa283a38 4380 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4381 * will disappear when REGFMATCH goes */
aa283a38
DM
4382 if (depth) {
4383 /* restore previous state and re-enter */
ab3bbdeb 4384 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
40a82448
DM
4385 depth--;
4386 st--;
4387 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4388 PL_regmatch_slab = PL_regmatch_slab->prev;
4389 st = SLAB_LAST(PL_regmatch_slab);
4390 }
4391 PL_regmatch_state = st;
4392 scan = st->scan;
4393 next = st->next;
4394 n = st->n;
4395 locinput= st->locinput;
4396 nextchr = UCHARAT(locinput);
aa283a38
DM
4397
4398 switch (st->resume_state) {
aa283a38
DM
4399 case resume_CURLYX:
4400 goto resume_point_CURLYX;
4401 case resume_WHILEM1:
4402 goto resume_point_WHILEM1;
4403 case resume_WHILEM2:
4404 goto resume_point_WHILEM2;
4405 case resume_WHILEM3:
4406 goto resume_point_WHILEM3;
4407 case resume_WHILEM4:
4408 goto resume_point_WHILEM4;
4409 case resume_WHILEM5:
4410 goto resume_point_WHILEM5;
4411 case resume_WHILEM6:
4412 goto resume_point_WHILEM6;
77cb431f 4413
166ba7cd 4414 case TRIE_next:
40a82448
DM
4415 case CURLYM_A:
4416 case CURLYM_B:
4417 case EVAL_A:
4418 case IFMATCH_A:
4419 case BRANCH_next:
c255a977
DM
4420 case CURLY_B_max:
4421 case CURLY_B_min:
4422 case CURLY_B_min_known:
40a82448
DM
4423 break;
4424
aa283a38
DM
4425 default:
4426 Perl_croak(aTHX_ "regexp resume memory corruption");
4427 }
4428 }
4429 goto final_exit;
4633a7c4
LW
4430
4431no:
a3621e74 4432 DEBUG_EXECUTE_r(
7821416a
IZ
4433 PerlIO_printf(Perl_debug_log,
4434 "%*s %sfailed...%s\n",
e4584336 4435 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4436 );
7821416a
IZ
4437no_final:
4438do_no:
aa283a38 4439
4633a7c4 4440#ifdef DEBUGGING
3280af22 4441 PL_regindent--;
4633a7c4 4442#endif
95b24440 4443 result = 0;
5d9a96ca 4444
aa283a38
DM
4445 if (depth) {
4446 /* there's a previous state to backtrack to */
ab3bbdeb 4447 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
40a82448
DM
4448 depth--;
4449 st--;
4450 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4451 PL_regmatch_slab = PL_regmatch_slab->prev;
4452 st = SLAB_LAST(PL_regmatch_slab);
4453 }
4454 PL_regmatch_state = st;
4455 scan = st->scan;
4456 next = st->next;
4457 n = st->n;
4458 locinput= st->locinput;
4459 nextchr = UCHARAT(locinput);
4460
5d9a96ca 4461 switch (st->resume_state) {
95b24440
DM
4462 case resume_CURLYX:
4463 goto resume_point_CURLYX;
4464 case resume_WHILEM1:
4465 goto resume_point_WHILEM1;
4466 case resume_WHILEM2:
4467 goto resume_point_WHILEM2;
4468 case resume_WHILEM3:
4469 goto resume_point_WHILEM3;
4470 case resume_WHILEM4:
4471 goto resume_point_WHILEM4;
4472 case resume_WHILEM5:
4473 goto resume_point_WHILEM5;
4474 case resume_WHILEM6:
4475 goto resume_point_WHILEM6;
dad79028 4476
166ba7cd 4477 case TRIE_next:
40a82448
DM
4478 case EVAL_A:
4479 case BRANCH_next:
4480 case CURLYM_A:
4481 case CURLYM_B:
4482 case IFMATCH_A:
c255a977
DM
4483 case CURLY_B_max:
4484 case CURLY_B_min:
4485 case CURLY_B_min_known:
40a82448
DM
4486 if (yes_state == st)
4487 yes_state = st->u.yes.prev_yes_state;
4488 state_num = st->resume_state + 1; /* failure = success + 1 */
4489 goto reenter_switch;
dad79028 4490
95b24440
DM
4491 default:
4492 Perl_croak(aTHX_ "regexp resume memory corruption");
4493 }
95b24440 4494 }
aa283a38
DM
4495
4496final_exit:
4497
5d9a96ca
DM
4498 /* restore original high-water mark */
4499 PL_regmatch_slab = orig_slab;
4500 PL_regmatch_state = orig_state;
4501
4502 /* free all slabs above current one */
4503 if (orig_slab->next) {
c4fd8992 4504 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4505 orig_slab->next = NULL;
4506 while (sl) {
c4fd8992 4507 regmatch_slab * const osl = sl;
5d9a96ca 4508 sl = sl->next;
ad65c075 4509 Safefree(osl);
5d9a96ca
DM
4510 }
4511 }
4512
95b24440
DM
4513 return result;
4514
a687059c
LW
4515}
4516
4517/*
4518 - regrepeat - repeatedly match something simple, report how many
4519 */
4520/*
4521 * [This routine now assumes that it will only match on things of length 1.
4522 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4523 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4524 */
76e3520e 4525STATIC I32
32fc9b6a 4526S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4527{
27da23d5 4528 dVAR;
a0d0e21e 4529 register char *scan;
a0d0e21e 4530 register I32 c;
3280af22 4531 register char *loceol = PL_regeol;
a0ed51b3 4532 register I32 hardcount = 0;
53c4c00c 4533 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4534
3280af22 4535 scan = PL_reginput;
faf11cac
HS
4536 if (max == REG_INFTY)
4537 max = I32_MAX;
4538 else if (max < loceol - scan)
7f596f4c 4539 loceol = scan + max;
a0d0e21e 4540 switch (OP(p)) {
22c35a8c 4541 case REG_ANY:
1aa99e6b 4542 if (do_utf8) {
ffc61ed2 4543 loceol = PL_regeol;
1aa99e6b 4544 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4545 scan += UTF8SKIP(scan);
4546 hardcount++;
4547 }
4548 } else {
4549 while (scan < loceol && *scan != '\n')
4550 scan++;
a0ed51b3
LW
4551 }
4552 break;
ffc61ed2 4553 case SANY:
def8e4ea
JH
4554 if (do_utf8) {
4555 loceol = PL_regeol;
a0804c9e 4556 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4557 scan += UTF8SKIP(scan);
4558 hardcount++;
4559 }
4560 }
4561 else
4562 scan = loceol;
a0ed51b3 4563 break;
f33976b4
DB
4564 case CANY:
4565 scan = loceol;
4566 break;
090f7165
JH
4567 case EXACT: /* length of string is 1 */
4568 c = (U8)*STRING(p);
4569 while (scan < loceol && UCHARAT(scan) == c)
4570 scan++;
bbce6d69 4571 break;
4572 case EXACTF: /* length of string is 1 */
cd439c50 4573 c = (U8)*STRING(p);
bbce6d69 4574 while (scan < loceol &&
22c35a8c 4575 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4576 scan++;
4577 break;
4578 case EXACTFL: /* length of string is 1 */
3280af22 4579 PL_reg_flags |= RF_tainted;
cd439c50 4580 c = (U8)*STRING(p);
bbce6d69 4581 while (scan < loceol &&
22c35a8c 4582 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4583 scan++;
4584 break;
4585 case ANYOF:
ffc61ed2
JH
4586 if (do_utf8) {
4587 loceol = PL_regeol;
cfc92286 4588 while (hardcount < max && scan < loceol &&
32fc9b6a 4589 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4590 scan += UTF8SKIP(scan);
4591 hardcount++;
4592 }
4593 } else {
32fc9b6a 4594 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4595 scan++;
4596 }
a0d0e21e
LW
4597 break;
4598 case ALNUM:
1aa99e6b 4599 if (do_utf8) {
ffc61ed2 4600 loceol = PL_regeol;
1a4fad37 4601 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4602 while (hardcount < max && scan < loceol &&
3568d838 4603 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4604 scan += UTF8SKIP(scan);
4605 hardcount++;
4606 }
4607 } else {
4608 while (scan < loceol && isALNUM(*scan))
4609 scan++;
a0ed51b3
LW
4610 }
4611 break;
bbce6d69 4612 case ALNUML:
3280af22 4613 PL_reg_flags |= RF_tainted;
1aa99e6b 4614 if (do_utf8) {
ffc61ed2 4615 loceol = PL_regeol;
1aa99e6b
IH
4616 while (hardcount < max && scan < loceol &&
4617 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4618 scan += UTF8SKIP(scan);
4619 hardcount++;
4620 }
4621 } else {
4622 while (scan < loceol && isALNUM_LC(*scan))
4623 scan++;
a0ed51b3
LW
4624 }
4625 break;
a0d0e21e 4626 case NALNUM:
1aa99e6b 4627 if (do_utf8) {
ffc61ed2 4628 loceol = PL_regeol;
1a4fad37 4629 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4630 while (hardcount < max && scan < loceol &&
3568d838 4631 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4632 scan += UTF8SKIP(scan);
4633 hardcount++;
4634 }
4635 } else {
4636 while (scan < loceol && !isALNUM(*scan))
4637 scan++;
a0ed51b3
LW
4638 }
4639 break;
bbce6d69 4640 case NALNUML:
3280af22 4641 PL_reg_flags |= RF_tainted;
1aa99e6b 4642 if (do_utf8) {
ffc61ed2 4643 loceol = PL_regeol;
1aa99e6b
IH
4644 while (hardcount < max && scan < loceol &&
4645 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4646 scan += UTF8SKIP(scan);
4647 hardcount++;
4648 }
4649 } else {
4650 while (scan < loceol && !isALNUM_LC(*scan))
4651 scan++;
a0ed51b3
LW
4652 }
4653 break;
a0d0e21e 4654 case SPACE:
1aa99e6b 4655 if (do_utf8) {
ffc61ed2 4656 loceol = PL_regeol;
1a4fad37 4657 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4658 while (hardcount < max && scan < loceol &&
3568d838
JH
4659 (*scan == ' ' ||
4660 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4661 scan += UTF8SKIP(scan);
4662 hardcount++;
4663 }
4664 } else {
4665 while (scan < loceol && isSPACE(*scan))
4666 scan++;
a0ed51b3
LW
4667 }
4668 break;
bbce6d69 4669 case SPACEL:
3280af22 4670 PL_reg_flags |= RF_tainted;
1aa99e6b 4671 if (do_utf8) {
ffc61ed2 4672 loceol = PL_regeol;
1aa99e6b 4673 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4674 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4675 scan += UTF8SKIP(scan);
4676 hardcount++;
4677 }
4678 } else {
4679 while (scan < loceol && isSPACE_LC(*scan))
4680 scan++;
a0ed51b3
LW
4681 }
4682 break;
a0d0e21e 4683 case NSPACE:
1aa99e6b 4684 if (do_utf8) {
ffc61ed2 4685 loceol = PL_regeol;
1a4fad37 4686 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4687 while (hardcount < max && scan < loceol &&
3568d838
JH
4688 !(*scan == ' ' ||
4689 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4690 scan += UTF8SKIP(scan);
4691 hardcount++;
4692 }
4693 } else {
4694 while (scan < loceol && !isSPACE(*scan))
4695 scan++;
4696 break;
a0ed51b3 4697 }
bbce6d69 4698 case NSPACEL:
3280af22 4699 PL_reg_flags |= RF_tainted;
1aa99e6b 4700 if (do_utf8) {
ffc61ed2 4701 loceol = PL_regeol;
1aa99e6b 4702 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4703 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4704 scan += UTF8SKIP(scan);
4705 hardcount++;
4706 }
4707 } else {
4708 while (scan < loceol && !isSPACE_LC(*scan))
4709 scan++;
a0ed51b3
LW
4710 }
4711 break;
a0d0e21e 4712 case DIGIT:
1aa99e6b 4713 if (do_utf8) {
ffc61ed2 4714 loceol = PL_regeol;
1a4fad37 4715 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4716 while (hardcount < max && scan < loceol &&
3568d838 4717 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4718 scan += UTF8SKIP(scan);
4719 hardcount++;
4720 }
4721 } else {
4722 while (scan < loceol && isDIGIT(*scan))
4723 scan++;
a0ed51b3
LW
4724 }
4725 break;
a0d0e21e 4726 case NDIGIT:
1aa99e6b 4727 if (do_utf8) {
ffc61ed2 4728 loceol = PL_regeol;
1a4fad37 4729 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4730 while (hardcount < max && scan < loceol &&
3568d838 4731 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4732 scan += UTF8SKIP(scan);
4733 hardcount++;
4734 }
4735 } else {
4736 while (scan < loceol && !isDIGIT(*scan))
4737 scan++;
a0ed51b3
LW
4738 }
4739 break;
a0d0e21e
LW
4740 default: /* Called on something of 0 width. */
4741 break; /* So match right here or not at all. */
4742 }
a687059c 4743
a0ed51b3
LW
4744 if (hardcount)
4745 c = hardcount;
4746 else
4747 c = scan - PL_reginput;
3280af22 4748 PL_reginput = scan;
a687059c 4749
a3621e74 4750 DEBUG_r({
e68ec53f 4751 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4752 DEBUG_EXECUTE_r({
e68ec53f
YO
4753 SV * const prop = sv_newmortal();
4754 regprop(prog, prop, p);
4755 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4756 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4757 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4758 });
be8e71aa 4759 });
9041c2e3 4760
a0d0e21e 4761 return(c);
a687059c
LW
4762}
4763
c277df42 4764
be8e71aa 4765#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4766/*
ffc61ed2
JH
4767- regclass_swash - prepare the utf8 swash
4768*/
4769
4770SV *
32fc9b6a 4771Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4772{
97aff369 4773 dVAR;
9e55ce06
JH
4774 SV *sw = NULL;
4775 SV *si = NULL;
4776 SV *alt = NULL;
3dab1dad 4777 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 4778
4f639d21 4779 if (data && data->count) {
a3b680e6 4780 const U32 n = ARG(node);
ffc61ed2 4781
4f639d21
DM
4782 if (data->what[n] == 's') {
4783 SV * const rv = (SV*)data->data[n];
890ce7af 4784 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4785 SV **const ary = AvARRAY(av);
9e55ce06 4786 SV **a, **b;
9041c2e3 4787
711a919c 4788 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4789 * documentation of these array elements. */
4790
b11f357e 4791 si = *ary;
8f7f7219 4792 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4793 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4794
ffc61ed2
JH
4795 if (a)
4796 sw = *a;
4797 else if (si && doinit) {
4798 sw = swash_init("utf8", "", si, 1, 0);
4799 (void)av_store(av, 1, sw);
4800 }
9e55ce06
JH
4801 if (b)
4802 alt = *b;
ffc61ed2
JH
4803 }
4804 }
4805
9e55ce06
JH
4806 if (listsvp)
4807 *listsvp = si;
4808 if (altsvp)
4809 *altsvp = alt;
ffc61ed2
JH
4810
4811 return sw;
4812}
76234dfb 4813#endif
ffc61ed2
JH
4814
4815/*
ba7b4546 4816 - reginclass - determine if a character falls into a character class
832705d4
JH
4817
4818 The n is the ANYOF regnode, the p is the target string, lenp
4819 is pointer to the maximum length of how far to go in the p
4820 (if the lenp is zero, UTF8SKIP(p) is used),
4821 do_utf8 tells whether the target string is in UTF-8.
4822
bbce6d69 4823 */
4824
76e3520e 4825STATIC bool
32fc9b6a 4826S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4827{
27da23d5 4828 dVAR;
a3b680e6 4829 const char flags = ANYOF_FLAGS(n);
bbce6d69 4830 bool match = FALSE;
cc07378b 4831 UV c = *p;
ae9ddab8 4832 STRLEN len = 0;
9e55ce06 4833 STRLEN plen;
1aa99e6b 4834
19f67299
TS
4835 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4836 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
4837 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4838 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
4839 if (len == (STRLEN)-1)
4840 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4841 }
bbce6d69 4842
0f0076b4 4843 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4844 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4845 if (lenp)
4846 *lenp = 0;
ffc61ed2 4847 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4848 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4849 match = TRUE;
bbce6d69 4850 }
3568d838 4851 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4852 match = TRUE;
ffc61ed2 4853 if (!match) {
9e55ce06 4854 AV *av;
32fc9b6a 4855 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4856
4857 if (sw) {
3568d838 4858 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4859 match = TRUE;
4860 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4861 if (!match && lenp && av) {
4862 I32 i;
9e55ce06 4863 for (i = 0; i <= av_len(av); i++) {
890ce7af 4864 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4865 STRLEN len;
890ce7af 4866 const char * const s = SvPV_const(sv, len);
9e55ce06 4867
061b10df 4868 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4869 *lenp = len;
4870 match = TRUE;
4871 break;
4872 }
4873 }
4874 }
4875 if (!match) {
89ebb4a3 4876 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4877 STRLEN tmplen;
4878
9e55ce06
JH
4879 to_utf8_fold(p, tmpbuf, &tmplen);
4880 if (swash_fetch(sw, tmpbuf, do_utf8))
4881 match = TRUE;
4882 }
ffc61ed2
JH
4883 }
4884 }
bbce6d69 4885 }
9e55ce06 4886 if (match && lenp && *lenp == 0)
0f0076b4 4887 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4888 }
1aa99e6b 4889 if (!match && c < 256) {
ffc61ed2
JH
4890 if (ANYOF_BITMAP_TEST(n, c))
4891 match = TRUE;
4892 else if (flags & ANYOF_FOLD) {
eb160463 4893 U8 f;
a0ed51b3 4894
ffc61ed2
JH
4895 if (flags & ANYOF_LOCALE) {
4896 PL_reg_flags |= RF_tainted;
4897 f = PL_fold_locale[c];
4898 }
4899 else
4900 f = PL_fold[c];
4901 if (f != c && ANYOF_BITMAP_TEST(n, f))
4902 match = TRUE;
4903 }
4904
4905 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4906 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4907 if (
4908 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4909 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4910 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4911 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4912 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4913 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4914 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4915 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4916 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4917 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4918 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4919 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4920 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4921 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4922 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4923 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4924 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4925 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4926 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4927 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4928 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4929 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4930 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4931 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4932 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4933 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4934 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4935 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4936 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4937 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4938 ) /* How's that for a conditional? */
4939 {
4940 match = TRUE;
4941 }
a0ed51b3 4942 }
a0ed51b3
LW
4943 }
4944
a0ed51b3
LW
4945 return (flags & ANYOF_INVERT) ? !match : match;
4946}
161b471a 4947
dfe13c55 4948STATIC U8 *
0ce71af7 4949S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 4950{
97aff369 4951 dVAR;
a0ed51b3 4952 if (off >= 0) {
1aa99e6b 4953 while (off-- && s < lim) {
ffc61ed2 4954 /* XXX could check well-formedness here */
a0ed51b3 4955 s += UTF8SKIP(s);
ffc61ed2 4956 }
a0ed51b3
LW
4957 }
4958 else {
4959 while (off++) {
1aa99e6b 4960 if (s > lim) {
a0ed51b3 4961 s--;
ffc61ed2 4962 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4963 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4964 s--;
ffc61ed2
JH
4965 }
4966 /* XXX could check well-formedness here */
a0ed51b3
LW
4967 }
4968 }
4969 }
4970 return s;
4971}
161b471a 4972
dfe13c55 4973STATIC U8 *
0ce71af7 4974S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 4975{
97aff369 4976 dVAR;
a0ed51b3 4977 if (off >= 0) {
1aa99e6b 4978 while (off-- && s < lim) {
ffc61ed2 4979 /* XXX could check well-formedness here */
a0ed51b3 4980 s += UTF8SKIP(s);
ffc61ed2 4981 }
a0ed51b3 4982 if (off >= 0)
3dab1dad 4983 return NULL;
a0ed51b3
LW
4984 }
4985 else {
4986 while (off++) {
1aa99e6b 4987 if (s > lim) {
a0ed51b3 4988 s--;
ffc61ed2 4989 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4990 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4991 s--;
ffc61ed2
JH
4992 }
4993 /* XXX could check well-formedness here */
a0ed51b3
LW
4994 }
4995 else
4996 break;
4997 }
4998 if (off <= 0)
3dab1dad 4999 return NULL;
a0ed51b3
LW
5000 }
5001 return s;
5002}
51371543 5003
51371543 5004static void
acfe0abc 5005restore_pos(pTHX_ void *arg)
51371543 5006{
97aff369 5007 dVAR;
097eb12c 5008 regexp * const rex = (regexp *)arg;
51371543
GS
5009 if (PL_reg_eval_set) {
5010 if (PL_reg_oldsaved) {
4f639d21
DM
5011 rex->subbeg = PL_reg_oldsaved;
5012 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5013#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5014 rex->saved_copy = PL_nrs;
ed252734 5015#endif
4f639d21 5016 RX_MATCH_COPIED_on(rex);
51371543
GS
5017 }
5018 PL_reg_magic->mg_len = PL_reg_oldpos;
5019 PL_reg_eval_set = 0;
5020 PL_curpm = PL_reg_oldcurpm;
5021 }
5022}
33b8afdf
JH
5023
5024STATIC void
5025S_to_utf8_substr(pTHX_ register regexp *prog)
5026{
33b8afdf 5027 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5028 SV* const sv = newSVsv(prog->float_substr);
5029 prog->float_utf8 = sv;
33b8afdf
JH
5030 sv_utf8_upgrade(sv);
5031 if (SvTAIL(prog->float_substr))
5032 SvTAIL_on(sv);
5033 if (prog->float_substr == prog->check_substr)
5034 prog->check_utf8 = sv;
5035 }
5036 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5037 SV* const sv = newSVsv(prog->anchored_substr);
5038 prog->anchored_utf8 = sv;
33b8afdf
JH
5039 sv_utf8_upgrade(sv);
5040 if (SvTAIL(prog->anchored_substr))
5041 SvTAIL_on(sv);
5042 if (prog->anchored_substr == prog->check_substr)
5043 prog->check_utf8 = sv;
5044 }
5045}
5046
5047STATIC void
5048S_to_byte_substr(pTHX_ register regexp *prog)
5049{
97aff369 5050 dVAR;
33b8afdf 5051 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5052 SV* sv = newSVsv(prog->float_utf8);
5053 prog->float_substr = sv;
33b8afdf
JH
5054 if (sv_utf8_downgrade(sv, TRUE)) {
5055 if (SvTAIL(prog->float_utf8))
5056 SvTAIL_on(sv);
5057 } else {
5058 SvREFCNT_dec(sv);
5059 prog->float_substr = sv = &PL_sv_undef;
5060 }
5061 if (prog->float_utf8 == prog->check_utf8)
5062 prog->check_substr = sv;
5063 }
5064 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5065 SV* sv = newSVsv(prog->anchored_utf8);
5066 prog->anchored_substr = sv;
33b8afdf
JH
5067 if (sv_utf8_downgrade(sv, TRUE)) {
5068 if (SvTAIL(prog->anchored_utf8))
5069 SvTAIL_on(sv);
5070 } else {
5071 SvREFCNT_dec(sv);
5072 prog->anchored_substr = sv = &PL_sv_undef;
5073 }
5074 if (prog->anchored_utf8 == prog->check_utf8)
5075 prog->check_substr = sv;
5076 }
5077}
66610fdd
RGS
5078
5079/*
5080 * Local variables:
5081 * c-indentation-style: bsd
5082 * c-basic-offset: 4
5083 * indent-tabs-mode: t
5084 * End:
5085 *
37442d52
RGS
5086 * ex: set ts=8 sts=4 sw=4 noet:
5087 */