This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #40239] New hints file for MidnightBSD
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
ce862d02 80#define RF_evaled 4 /* Did an EVAL with setting? */
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
53c4c00c 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
198#define REGCP_SET(cp) \
199 DEBUG_STATE_r( \
ab3bbdeb 200 PerlIO_printf(Perl_debug_log, \
e4f74956 201 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
202 (IV)PL_savestack_ix)); \
203 cp = PL_savestack_ix
c3464db5 204
ab3bbdeb 205#define REGCP_UNWIND(cp) \
e4f74956 206 DEBUG_STATE_r( \
ab3bbdeb 207 if (cp != PL_savestack_ix) \
e4f74956
YO
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
210 (IV)(cp), (IV)PL_savestack_ix)); \
211 regcpblow(cp)
c277df42 212
76e3520e 213STATIC char *
097eb12c 214S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 215{
97aff369 216 dVAR;
b1ce53c5 217 I32 i;
a0d0e21e 218 char *input;
b1ce53c5 219
a3621e74
YO
220 GET_RE_DEBUG_FLAGS_DECL;
221
b1ce53c5 222 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 223 i = SSPOPINT;
b1ce53c5
JH
224 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
225 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 226 input = (char *) SSPOPPTR;
a01268b5 227 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
228 *PL_reglastparen = SSPOPINT;
229 PL_regsize = SSPOPINT;
b1ce53c5
JH
230
231 /* Now restore the parentheses context. */
41123dfd
JH
232 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
233 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 234 I32 tmps;
097eb12c 235 U32 paren = (U32)SSPOPINT;
3280af22 236 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
237 PL_regstartp[paren] = SSPOPINT;
238 tmps = SSPOPINT;
3280af22
NIS
239 if (paren <= *PL_reglastparen)
240 PL_regendp[paren] = tmps;
a3621e74 241 DEBUG_EXECUTE_r(
c3464db5 242 PerlIO_printf(Perl_debug_log,
b900a521 243 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 244 (UV)paren, (IV)PL_regstartp[paren],
b900a521 245 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 246 (IV)PL_regendp[paren],
3280af22 247 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 248 );
a0d0e21e 249 }
a3621e74 250 DEBUG_EXECUTE_r(
bb7a0f54 251 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 252 PerlIO_printf(Perl_debug_log,
faccc32b 253 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 254 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
255 }
256 );
daf18116 257#if 1
dafc8851
JH
258 /* It would seem that the similar code in regtry()
259 * already takes care of this, and in fact it is in
260 * a better location to since this code can #if 0-ed out
261 * but the code in regtry() is needed or otherwise tests
262 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
263 * (as of patchlevel 7877) will fail. Then again,
264 * this code seems to be necessary or otherwise
265 * building DynaLoader will fail:
266 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
267 * --jhi */
bb7a0f54 268 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
269 if (i > PL_regsize)
270 PL_regstartp[i] = -1;
271 PL_regendp[i] = -1;
a0d0e21e 272 }
dafc8851 273#endif
a0d0e21e
LW
274 return input;
275}
276
02db2b7b 277#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 278
a687059c 279/*
e50aee73 280 * pregexec and friends
a687059c
LW
281 */
282
76234dfb 283#ifndef PERL_IN_XSUB_RE
a687059c 284/*
c277df42 285 - pregexec - match a regexp against a string
a687059c 286 */
c277df42 287I32
864dbfa3 288Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 289 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
290/* strend: pointer to null at end of string */
291/* strbeg: real beginning of string */
292/* minend: end of match must be >=minend after stringarg. */
293/* nosave: For optimizations. */
294{
295 return
9041c2e3 296 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
297 nosave ? 0 : REXEC_COPY_STR);
298}
76234dfb 299#endif
22e551b9 300
9041c2e3 301/*
cad2e5aa
JH
302 * Need to implement the following flags for reg_anch:
303 *
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * USE_INTUIT_ML
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
309 * INTUIT_ONCE_ML
310 *
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
313 */
314
315/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316
3f7c398e 317/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
319
320/* XXXX We assume that strpos is strbeg unless sv. */
321
6eb5f6b9
JH
322/* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
328
2c2d71f5
JH
329/* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
30944b6d
IZ
332 regtry() should be needed.
333
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
336 b) Fixed substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 339 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
340 string which does not contradict any of them.
341 */
2c2d71f5 342
6eb5f6b9
JH
343/* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
346
cad2e5aa
JH
347char *
348Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
350{
97aff369 351 dVAR;
b7953727 352 register I32 start_shift = 0;
cad2e5aa 353 /* Should be nonnegative! */
b7953727 354 register I32 end_shift = 0;
2c2d71f5
JH
355 register char *s;
356 register SV *check;
a1933d95 357 char *strbeg;
cad2e5aa 358 char *t;
a3b680e6 359 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 360 I32 ml_anch;
bd61b366
SS
361 register char *other_last = NULL; /* other substr checked before this */
362 char *check_at = NULL; /* check substr found at this pos */
1df70142 363 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 364#ifdef DEBUGGING
890ce7af 365 const char * const i_strpos = strpos;
30944b6d 366#endif
a3621e74
YO
367
368 GET_RE_DEBUG_FLAGS_DECL;
369
a30b2f1f 370 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 371
b8d68ded 372 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
373 PL_reg_flags |= RF_utf8;
374 }
ab3bbdeb
YO
375 DEBUG_EXECUTE_r(
376 debug_start_match(prog, do_utf8, strpos, strend,
377 "Guessing start of match for");
2a782b5b 378 );
cad2e5aa 379
c344f387
JH
380 /* CHR_DIST() would be more correct here but it makes things slow. */
381 if (prog->minlen > strend - strpos) {
a3621e74 382 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 383 "String too short... [re_intuit_start]\n"));
cad2e5aa 384 goto fail;
2c2d71f5 385 }
a1933d95 386 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 387 PL_regeol = strend;
33b8afdf
JH
388 if (do_utf8) {
389 if (!prog->check_utf8 && prog->check_substr)
390 to_utf8_substr(prog);
391 check = prog->check_utf8;
392 } else {
393 if (!prog->check_substr && prog->check_utf8)
394 to_byte_substr(prog);
395 check = prog->check_substr;
396 }
397 if (check == &PL_sv_undef) {
a3621e74 398 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
399 "Non-utf string cannot match utf check string\n"));
400 goto fail;
401 }
2c2d71f5 402 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
403 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
404 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 405 && !multiline ) ); /* Check after \n? */
cad2e5aa 406
7e25d62c
JH
407 if (!ml_anch) {
408 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
409 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 410 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
411 && sv && !SvROK(sv)
412 && (strpos != strbeg)) {
a3621e74 413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
414 goto fail;
415 }
416 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 417 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 418 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
419 I32 slen;
420
1aa99e6b 421 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
422 if (SvTAIL(check)) {
423 slen = SvCUR(check); /* >= 1 */
cad2e5aa 424
9041c2e3 425 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 426 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 428 goto fail_finish;
cad2e5aa
JH
429 }
430 /* Now should match s[0..slen-2] */
431 slen--;
3f7c398e 432 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 433 || (slen > 1
3f7c398e 434 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 435 report_neq:
a3621e74 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
437 goto fail_finish;
438 }
cad2e5aa 439 }
3f7c398e 440 else if (*SvPVX_const(check) != *s
653099ff 441 || ((slen = SvCUR(check)) > 1
3f7c398e 442 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 443 goto report_neq;
c315bfe8 444 check_at = s;
2c2d71f5 445 goto success_at_start;
7e25d62c 446 }
cad2e5aa 447 }
2c2d71f5 448 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 449 s = strpos;
2c2d71f5 450 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 451 end_shift = prog->minlen - start_shift -
653099ff 452 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 453 if (!ml_anch) {
a3b680e6 454 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 455 - (SvTAIL(check) != 0);
a3b680e6 456 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
457
458 if (end_shift < eshift)
459 end_shift = eshift;
460 }
cad2e5aa 461 }
2c2d71f5 462 else { /* Can match at random position */
cad2e5aa
JH
463 ml_anch = 0;
464 s = strpos;
2c2d71f5
JH
465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
466 /* Should be nonnegative! */
467 end_shift = prog->minlen - start_shift -
653099ff 468 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
469 }
470
2c2d71f5 471#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 472 if (end_shift < 0)
6bbae5e6 473 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
474#endif
475
2c2d71f5
JH
476 restart:
477 /* Find a possible match in the region s..strend by looking for
478 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 479 if (flags & REXEC_SCREAM) {
cad2e5aa 480 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 481 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 482
2c2d71f5
JH
483 if (PL_screamfirst[BmRARE(check)] >= 0
484 || ( BmRARE(check) == '\n'
485 && (BmPREVIOUS(check) == SvCUR(check) - 1)
486 && SvTAIL(check) ))
9041c2e3 487 s = screaminstr(sv, check,
2c2d71f5 488 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 489 else
2c2d71f5 490 goto fail_finish;
4addbd3b
HS
491 /* we may be pointing at the wrong string */
492 if (s && RX_MATCH_COPIED(prog))
3f7c398e 493 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
494 if (data)
495 *data->scream_olds = s;
496 }
f33976b4 497 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
498 s = fbm_instr((U8*)(s + start_shift),
499 (U8*)(strend - end_shift),
7fba1cd6 500 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 501 else
1aa99e6b
IH
502 s = fbm_instr(HOP3(s, start_shift, strend),
503 HOP3(strend, -end_shift, strbeg),
7fba1cd6 504 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
505
506 /* Update the count-of-usability, remove useless subpatterns,
507 unshift s. */
2c2d71f5 508
ab3bbdeb
YO
509 DEBUG_EXECUTE_r({
510 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
511 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
512 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 513 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
514 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
515 ? "anchored" : "floating"),
516 quoted,
517 RE_SV_TAIL(check),
518 (s ? " at offset " : "...\n") );
519 });
2c2d71f5
JH
520
521 if (!s)
522 goto fail_finish;
523
6eb5f6b9
JH
524 check_at = s;
525
2c2d71f5 526 /* Finish the diagnostic message */
a3621e74 527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
528
529 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
530 Start with the other substr.
531 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 532 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
533 *always* match. Probably should be marked during compile...
534 Probably it is right to do no SCREAM here...
535 */
536
33b8afdf 537 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 538 /* Take into account the "other" substring. */
2c2d71f5
JH
539 /* XXXX May be hopelessly wrong for UTF... */
540 if (!other_last)
6eb5f6b9 541 other_last = strpos;
33b8afdf 542 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
543 do_other_anchored:
544 {
890ce7af
AL
545 char * const last = HOP3c(s, -start_shift, strbeg);
546 char *last1, *last2;
be8e71aa 547 char * const saved_s = s;
33b8afdf 548 SV* must;
2c2d71f5 549
2c2d71f5
JH
550 t = s - prog->check_offset_max;
551 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 552 && (!do_utf8
0ce71af7 553 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 554 && t > strpos)))
6f207bd3 555 NOOP;
2c2d71f5
JH
556 else
557 t = strpos;
1aa99e6b 558 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
559 if (t < other_last) /* These positions already checked */
560 t = other_last;
1aa99e6b 561 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
562 if (last < last1)
563 last1 = last;
564 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
565 /* On end-of-str: see comment below. */
33b8afdf
JH
566 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
567 if (must == &PL_sv_undef) {
568 s = (char*)NULL;
a3621e74 569 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
570 }
571 else
572 s = fbm_instr(
573 (unsigned char*)t,
574 HOP3(HOP3(last1, prog->anchored_offset, strend)
575 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
576 must,
7fba1cd6 577 multiline ? FBMrf_MULTILINE : 0
33b8afdf 578 );
ab3bbdeb
YO
579 DEBUG_EXECUTE_r({
580 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
581 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
582 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 583 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
584 quoted, RE_SV_TAIL(must));
585 });
586
587
2c2d71f5
JH
588 if (!s) {
589 if (last1 >= last2) {
a3621e74 590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
591 ", giving up...\n"));
592 goto fail_finish;
593 }
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 595 ", trying floating at offset %ld...\n",
be8e71aa 596 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
597 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
598 s = HOP3c(last, 1, strend);
2c2d71f5
JH
599 goto restart;
600 }
601 else {
a3621e74 602 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 603 (long)(s - i_strpos)));
1aa99e6b
IH
604 t = HOP3c(s, -prog->anchored_offset, strbeg);
605 other_last = HOP3c(s, 1, strend);
be8e71aa 606 s = saved_s;
2c2d71f5
JH
607 if (t == strpos)
608 goto try_at_start;
2c2d71f5
JH
609 goto try_at_offset;
610 }
30944b6d 611 }
2c2d71f5
JH
612 }
613 else { /* Take into account the floating substring. */
33b8afdf 614 char *last, *last1;
be8e71aa 615 char * const saved_s = s;
33b8afdf
JH
616 SV* must;
617
618 t = HOP3c(s, -start_shift, strbeg);
619 last1 = last =
620 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
621 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
622 last = HOP3c(t, prog->float_max_offset, strend);
623 s = HOP3c(t, prog->float_min_offset, strend);
624 if (s < other_last)
625 s = other_last;
2c2d71f5 626 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
627 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
628 /* fbm_instr() takes into account exact value of end-of-str
629 if the check is SvTAIL(ed). Since false positives are OK,
630 and end-of-str is not later than strend we are OK. */
631 if (must == &PL_sv_undef) {
632 s = (char*)NULL;
a3621e74 633 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
634 }
635 else
2c2d71f5 636 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
637 (unsigned char*)last + SvCUR(must)
638 - (SvTAIL(must)!=0),
7fba1cd6 639 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
640 DEBUG_EXECUTE_r({
641 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
642 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
643 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 644 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
645 quoted, RE_SV_TAIL(must));
646 });
33b8afdf
JH
647 if (!s) {
648 if (last1 == last) {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
650 ", giving up...\n"));
651 goto fail_finish;
2c2d71f5 652 }
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 654 ", trying anchored starting at offset %ld...\n",
be8e71aa 655 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
656 other_last = last;
657 s = HOP3c(t, 1, strend);
658 goto restart;
659 }
660 else {
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
662 (long)(s - i_strpos)));
663 other_last = s; /* Fix this later. --Hugo */
be8e71aa 664 s = saved_s;
33b8afdf
JH
665 if (t == strpos)
666 goto try_at_start;
667 goto try_at_offset;
668 }
2c2d71f5 669 }
cad2e5aa 670 }
2c2d71f5
JH
671
672 t = s - prog->check_offset_max;
2c2d71f5 673 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 674 && (!do_utf8
0ce71af7 675 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 676 && t > strpos))) {
2c2d71f5
JH
677 /* Fixed substring is found far enough so that the match
678 cannot start at strpos. */
679 try_at_offset:
cad2e5aa 680 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
681 /* Eventually fbm_*() should handle this, but often
682 anchored_offset is not 0, so this check will not be wasted. */
683 /* XXXX In the code below we prefer to look for "^" even in
684 presence of anchored substrings. And we search even
685 beyond the found float position. These pessimizations
686 are historical artefacts only. */
687 find_anchor:
2c2d71f5 688 while (t < strend - prog->minlen) {
cad2e5aa 689 if (*t == '\n') {
4ee3650e 690 if (t < check_at - prog->check_offset_min) {
33b8afdf 691 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
692 /* Since we moved from the found position,
693 we definitely contradict the found anchored
30944b6d
IZ
694 substr. Due to the above check we do not
695 contradict "check" substr.
696 Thus we can arrive here only if check substr
697 is float. Redo checking for "other"=="fixed".
698 */
9041c2e3 699 strpos = t + 1;
a3621e74 700 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 701 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
702 goto do_other_anchored;
703 }
4ee3650e
GS
704 /* We don't contradict the found floating substring. */
705 /* XXXX Why not check for STCLASS? */
cad2e5aa 706 s = t + 1;
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 708 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
709 goto set_useful;
710 }
4ee3650e
GS
711 /* Position contradicts check-string */
712 /* XXXX probably better to look for check-string
713 than for "\n", so one should lower the limit for t? */
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 715 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 716 other_last = strpos = s = t + 1;
cad2e5aa
JH
717 goto restart;
718 }
719 t++;
720 }
a3621e74 721 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 722 PL_colors[0], PL_colors[1]));
2c2d71f5 723 goto fail_finish;
cad2e5aa 724 }
f5952150 725 else {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 727 PL_colors[0], PL_colors[1]));
f5952150 728 }
cad2e5aa
JH
729 s = t;
730 set_useful:
33b8afdf 731 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
732 }
733 else {
f5952150 734 /* The found string does not prohibit matching at strpos,
2c2d71f5 735 - no optimization of calling REx engine can be performed,
f5952150
GS
736 unless it was an MBOL and we are not after MBOL,
737 or a future STCLASS check will fail this. */
2c2d71f5
JH
738 try_at_start:
739 /* Even in this situation we may use MBOL flag if strpos is offset
740 wrt the start of the string. */
05b4157f 741 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 742 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
743 /* May be due to an implicit anchor of m{.*foo} */
744 && !(prog->reganch & ROPT_IMPLICIT))
745 {
cad2e5aa
JH
746 t = strpos;
747 goto find_anchor;
748 }
a3621e74 749 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 750 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 751 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 752 );
2c2d71f5 753 success_at_start:
30944b6d 754 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
755 && (do_utf8 ? (
756 prog->check_utf8 /* Could be deleted already */
757 && --BmUSEFUL(prog->check_utf8) < 0
758 && (prog->check_utf8 == prog->float_utf8)
759 ) : (
760 prog->check_substr /* Could be deleted already */
761 && --BmUSEFUL(prog->check_substr) < 0
762 && (prog->check_substr == prog->float_substr)
763 )))
66e933ab 764 {
cad2e5aa 765 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
767 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
768 if (do_utf8 ? prog->check_substr : prog->check_utf8)
769 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
770 prog->check_substr = prog->check_utf8 = NULL; /* disable */
771 prog->float_substr = prog->float_utf8 = NULL; /* clear */
772 check = NULL; /* abort */
cad2e5aa 773 s = strpos;
3cf5c195
IZ
774 /* XXXX This is a remnant of the old implementation. It
775 looks wasteful, since now INTUIT can use many
6eb5f6b9 776 other heuristics. */
cad2e5aa
JH
777 prog->reganch &= ~RE_USE_INTUIT;
778 }
779 else
780 s = strpos;
781 }
782
6eb5f6b9
JH
783 /* Last resort... */
784 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 785 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
786 /* minlen == 0 is possible if regstclass is \b or \B,
787 and the fixed substr is ''$.
788 Since minlen is already taken into account, s+1 is before strend;
789 accidentally, minlen >= 1 guaranties no false positives at s + 1
790 even for \b or \B. But (minlen? 1 : 0) below assumes that
791 regstclass does not come from lookahead... */
792 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
793 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 794 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 795 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 796 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 797 : 1);
07be1b83 798 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 799 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 800 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
801 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
802 cl_l, strend)
803 : strend);
07be1b83
YO
804 /*if (OP(prog->regstclass) == TRIE)
805 endpos++;*/
6eb5f6b9 806 t = s;
3b0527fe 807 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
808 if (!s) {
809#ifdef DEBUGGING
cbbf8932 810 const char *what = NULL;
6eb5f6b9
JH
811#endif
812 if (endpos == strend) {
a3621e74 813 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
814 "Could not match STCLASS...\n") );
815 goto fail;
816 }
a3621e74 817 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 818 "This position contradicts STCLASS...\n") );
653099ff
GS
819 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
820 goto fail;
6eb5f6b9 821 /* Contradict one of substrings */
33b8afdf
JH
822 if (prog->anchored_substr || prog->anchored_utf8) {
823 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 824 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 825 hop_and_restart:
1aa99e6b 826 s = HOP3c(t, 1, strend);
66e933ab
GS
827 if (s + start_shift + end_shift > strend) {
828 /* XXXX Should be taken into account earlier? */
a3621e74 829 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
830 "Could not match STCLASS...\n") );
831 goto fail;
832 }
5e39e1e5
HS
833 if (!check)
834 goto giveup;
a3621e74 835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 836 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
837 what, (long)(s + start_shift - i_strpos)) );
838 goto restart;
839 }
66e933ab 840 /* Have both, check_string is floating */
6eb5f6b9
JH
841 if (t + start_shift >= check_at) /* Contradicts floating=check */
842 goto retry_floating_check;
843 /* Recheck anchored substring, but not floating... */
9041c2e3 844 s = check_at;
5e39e1e5
HS
845 if (!check)
846 goto giveup;
a3621e74 847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 848 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
849 (long)(other_last - i_strpos)) );
850 goto do_other_anchored;
851 }
60e71179
GS
852 /* Another way we could have checked stclass at the
853 current position only: */
854 if (ml_anch) {
855 s = t = t + 1;
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 860 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 861 goto try_at_offset;
66e933ab 862 }
33b8afdf 863 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 864 goto fail;
6eb5f6b9
JH
865 /* Check is floating subtring. */
866 retry_floating_check:
867 t = check_at - start_shift;
a3621e74 868 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
869 goto hop_and_restart;
870 }
b7953727 871 if (t != s) {
a3621e74 872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 873 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
874 (long)(t - i_strpos), (long)(s - i_strpos))
875 );
876 }
877 else {
a3621e74 878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
879 "Does not contradict STCLASS...\n");
880 );
881 }
6eb5f6b9 882 }
5e39e1e5 883 giveup:
a3621e74 884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
885 PL_colors[4], (check ? "Guessed" : "Giving up"),
886 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 887 return s;
2c2d71f5
JH
888
889 fail_finish: /* Substring not found */
33b8afdf
JH
890 if (prog->check_substr || prog->check_utf8) /* could be removed already */
891 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 892 fail:
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 894 PL_colors[4], PL_colors[5]));
bd61b366 895 return NULL;
cad2e5aa 896}
9661b544 897
6eb5f6b9 898/* We know what class REx starts with. Try to find this position... */
3b0527fe 899/* if reginfo is NULL, its a dryrun */
07be1b83
YO
900/* annoyingly all the vars in this routine have different names from their counterparts
901 in regmatch. /grrr */
3b0527fe 902
4cadc6a9
YO
903#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
904foldlen, foldbuf, uniflags) STMT_START { \
905 switch (trie_type) { \
906 case trie_utf8_fold: \
907 if ( foldlen>0 ) { \
908 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
909 foldlen -= len; \
910 uscan += len; \
911 len=0; \
912 } else { \
913 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
914 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
915 foldlen -= UNISKIP( uvc ); \
916 uscan = foldbuf + UNISKIP( uvc ); \
917 } \
918 break; \
919 case trie_utf8: \
920 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
921 break; \
922 case trie_plain: \
923 uvc = (UV)*uc; \
924 len = 1; \
925 } \
926 \
927 if (uvc < 256) { \
928 charid = trie->charmap[ uvc ]; \
929 } \
930 else { \
931 charid = 0; \
932 if (trie->widecharmap) { \
933 SV** const svpp = hv_fetch(trie->widecharmap, \
934 (char*)&uvc, sizeof(UV), 0); \
935 if (svpp) \
936 charid = (U16)SvIV(*svpp); \
937 } \
938 } \
939} STMT_END
940
941#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
942 if ( (CoNd) \
943 && (ln == len || \
944 ibcmp_utf8(s, NULL, 0, do_utf8, \
945 m, NULL, ln, (bool)UTF)) \
946 && (!reginfo || regtry(reginfo, s)) ) \
947 goto got_it; \
948 else { \
949 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
950 uvchr_to_utf8(tmpbuf, c); \
951 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
952 if ( f != c \
953 && (f == c1 || f == c2) \
954 && (ln == foldlen || \
955 !ibcmp_utf8((char *) foldbuf, \
956 NULL, foldlen, do_utf8, \
957 m, \
958 NULL, ln, (bool)UTF)) \
959 && (!reginfo || regtry(reginfo, s)) ) \
960 goto got_it; \
961 } \
962 s += len
963
964#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
965STMT_START { \
966 while (s <= e) { \
967 if ( (CoNd) \
968 && (ln == 1 || !(OP(c) == EXACTF \
969 ? ibcmp(s, m, ln) \
970 : ibcmp_locale(s, m, ln))) \
971 && (!reginfo || regtry(reginfo, s)) ) \
972 goto got_it; \
973 s++; \
974 } \
975} STMT_END
976
977#define REXEC_FBC_UTF8_SCAN(CoDe) \
978STMT_START { \
979 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
980 CoDe \
981 s += uskip; \
982 } \
983} STMT_END
984
985#define REXEC_FBC_SCAN(CoDe) \
986STMT_START { \
987 while (s < strend) { \
988 CoDe \
989 s++; \
990 } \
991} STMT_END
992
993#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
994REXEC_FBC_UTF8_SCAN( \
995 if (CoNd) { \
996 if (tmp && (!reginfo || regtry(reginfo, s))) \
997 goto got_it; \
998 else \
999 tmp = doevery; \
1000 } \
1001 else \
1002 tmp = 1; \
1003)
1004
1005#define REXEC_FBC_CLASS_SCAN(CoNd) \
1006REXEC_FBC_SCAN( \
1007 if (CoNd) { \
1008 if (tmp && (!reginfo || regtry(reginfo, s))) \
1009 goto got_it; \
1010 else \
1011 tmp = doevery; \
1012 } \
1013 else \
1014 tmp = 1; \
1015)
1016
1017#define REXEC_FBC_TRYIT \
1018if ((!reginfo || regtry(reginfo, s))) \
1019 goto got_it
1020
1021#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1022 if (do_utf8) { \
1023 UtFpReLoAd; \
1024 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1025 } \
1026 else { \
1027 REXEC_FBC_CLASS_SCAN(CoNd); \
1028 } \
1029 break
1030
1031#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1032 PL_reg_flags |= RF_tainted; \
1033 if (do_utf8) { \
1034 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1035 } \
1036 else { \
1037 REXEC_FBC_CLASS_SCAN(CoNd); \
1038 } \
1039 break
1040
3c3eec57 1041STATIC char *
07be1b83
YO
1042S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1043 const char *strend, const regmatch_info *reginfo)
a687059c 1044{
27da23d5 1045 dVAR;
1df70142 1046 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1047 char *m;
d8093b23 1048 STRLEN ln;
5dab1207 1049 STRLEN lnc;
078c425b 1050 register STRLEN uskip;
d8093b23
G
1051 unsigned int c1;
1052 unsigned int c2;
6eb5f6b9
JH
1053 char *e;
1054 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1055 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1056
6eb5f6b9
JH
1057 /* We know what class it must start with. */
1058 switch (OP(c)) {
6eb5f6b9 1059 case ANYOF:
388cc4de 1060 if (do_utf8) {
4cadc6a9 1061 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1062 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1063 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1064 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1065 }
1066 else {
1067 while (s < strend) {
1068 STRLEN skip = 1;
1069
32fc9b6a 1070 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1071 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1072 /* The assignment of 2 is intentional:
1073 * for the folded sharp s, the skip is 2. */
1074 (skip = SHARP_S_SKIP))) {
3b0527fe 1075 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1076 goto got_it;
1077 else
1078 tmp = doevery;
1079 }
1080 else
1081 tmp = 1;
1082 s += skip;
1083 }
a0d0e21e 1084 }
6eb5f6b9 1085 break;
f33976b4 1086 case CANY:
4cadc6a9 1087 REXEC_FBC_SCAN(
3b0527fe 1088 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1089 goto got_it;
1090 else
1091 tmp = doevery;
4cadc6a9 1092 );
f33976b4 1093 break;
6eb5f6b9 1094 case EXACTF:
5dab1207
NIS
1095 m = STRING(c);
1096 ln = STR_LEN(c); /* length to match in octets/bytes */
1097 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1098 if (UTF) {
a2a2844f 1099 STRLEN ulen1, ulen2;
5dab1207 1100 U8 *sm = (U8 *) m;
89ebb4a3
JH
1101 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1102 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1103 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1104
1105 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1106 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1107
89ebb4a3 1108 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1109 0, uniflags);
89ebb4a3 1110 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1111 0, uniflags);
5dab1207
NIS
1112 lnc = 0;
1113 while (sm < ((U8 *) m + ln)) {
1114 lnc++;
1115 sm += UTF8SKIP(sm);
1116 }
1aa99e6b
IH
1117 }
1118 else {
1119 c1 = *(U8*)m;
1120 c2 = PL_fold[c1];
1121 }
6eb5f6b9
JH
1122 goto do_exactf;
1123 case EXACTFL:
5dab1207
NIS
1124 m = STRING(c);
1125 ln = STR_LEN(c);
1126 lnc = (I32) ln;
d8093b23 1127 c1 = *(U8*)m;
6eb5f6b9
JH
1128 c2 = PL_fold_locale[c1];
1129 do_exactf:
db12adc6 1130 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1131
3b0527fe 1132 if (!reginfo && e < s)
6eb5f6b9 1133 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1134
60a8b682
JH
1135 /* The idea in the EXACTF* cases is to first find the
1136 * first character of the EXACTF* node and then, if
1137 * necessary, case-insensitively compare the full
1138 * text of the node. The c1 and c2 are the first
1139 * characters (though in Unicode it gets a bit
1140 * more complicated because there are more cases
7f16dd3d
JH
1141 * than just upper and lower: one needs to use
1142 * the so-called folding case for case-insensitive
1143 * matching (called "loose matching" in Unicode).
1144 * ibcmp_utf8() will do just that. */
60a8b682 1145
1aa99e6b 1146 if (do_utf8) {
575cac57 1147 UV c, f;
89ebb4a3 1148 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1149 STRLEN len, foldlen;
4ad0818d 1150 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1151 if (c1 == c2) {
5dab1207
NIS
1152 /* Upper and lower of 1st char are equal -
1153 * probably not a "letter". */
1aa99e6b 1154 while (s <= e) {
89ebb4a3 1155 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1156 uniflags);
4cadc6a9 1157 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1158 }
09091399
JH
1159 }
1160 else {
1aa99e6b 1161 while (s <= e) {
89ebb4a3 1162 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1163 uniflags);
80aecb99 1164
60a8b682 1165 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1166 * Note that not all the possible combinations
1167 * are handled here: some of them are handled
1168 * by the standard folding rules, and some of
1169 * them (the character class or ANYOF cases)
1170 * are handled during compiletime in
1171 * regexec.c:S_regclass(). */
880bd946
JH
1172 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1173 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1174 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1175
4cadc6a9 1176 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1177 }
09091399 1178 }
1aa99e6b
IH
1179 }
1180 else {
1181 if (c1 == c2)
4cadc6a9 1182 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1183 else
4cadc6a9 1184 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1185 }
1186 break;
bbce6d69 1187 case BOUNDL:
3280af22 1188 PL_reg_flags |= RF_tainted;
bbce6d69 1189 /* FALL THROUGH */
a0d0e21e 1190 case BOUND:
ffc61ed2 1191 if (do_utf8) {
12d33761 1192 if (s == PL_bostr)
ffc61ed2
JH
1193 tmp = '\n';
1194 else {
6136c704 1195 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1196 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1197 }
1198 tmp = ((OP(c) == BOUND ?
9041c2e3 1199 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1200 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1201 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1202 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1203 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1204 isALNUM_LC_utf8((U8*)s)))
1205 {
1206 tmp = !tmp;
4cadc6a9 1207 REXEC_FBC_TRYIT;
a687059c 1208 }
4cadc6a9 1209 );
a0d0e21e 1210 }
667bb95a 1211 else {
12d33761 1212 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1213 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1214 REXEC_FBC_SCAN(
ffc61ed2
JH
1215 if (tmp ==
1216 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1217 tmp = !tmp;
4cadc6a9 1218 REXEC_FBC_TRYIT;
a0ed51b3 1219 }
4cadc6a9 1220 );
a0ed51b3 1221 }
3b0527fe 1222 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1223 goto got_it;
1224 break;
bbce6d69 1225 case NBOUNDL:
3280af22 1226 PL_reg_flags |= RF_tainted;
bbce6d69 1227 /* FALL THROUGH */
a0d0e21e 1228 case NBOUND:
ffc61ed2 1229 if (do_utf8) {
12d33761 1230 if (s == PL_bostr)
ffc61ed2
JH
1231 tmp = '\n';
1232 else {
6136c704 1233 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1234 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1235 }
1236 tmp = ((OP(c) == NBOUND ?
9041c2e3 1237 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1238 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1239 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1240 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1241 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1242 isALNUM_LC_utf8((U8*)s)))
1243 tmp = !tmp;
4cadc6a9
YO
1244 else REXEC_FBC_TRYIT;
1245 );
a0d0e21e 1246 }
667bb95a 1247 else {
12d33761 1248 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1249 tmp = ((OP(c) == NBOUND ?
1250 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1251 REXEC_FBC_SCAN(
ffc61ed2
JH
1252 if (tmp ==
1253 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1254 tmp = !tmp;
4cadc6a9
YO
1255 else REXEC_FBC_TRYIT;
1256 );
a0ed51b3 1257 }
3b0527fe 1258 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1259 goto got_it;
1260 break;
a0d0e21e 1261 case ALNUM:
4cadc6a9
YO
1262 REXEC_FBC_CSCAN_PRELOAD(
1263 LOAD_UTF8_CHARCLASS_ALNUM(),
1264 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1265 isALNUM(*s)
1266 );
bbce6d69 1267 case ALNUML:
4cadc6a9
YO
1268 REXEC_FBC_CSCAN_TAINT(
1269 isALNUM_LC_utf8((U8*)s),
1270 isALNUM_LC(*s)
1271 );
a0d0e21e 1272 case NALNUM:
4cadc6a9
YO
1273 REXEC_FBC_CSCAN_PRELOAD(
1274 LOAD_UTF8_CHARCLASS_ALNUM(),
1275 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1276 !isALNUM(*s)
1277 );
bbce6d69 1278 case NALNUML:
4cadc6a9
YO
1279 REXEC_FBC_CSCAN_TAINT(
1280 !isALNUM_LC_utf8((U8*)s),
1281 !isALNUM_LC(*s)
1282 );
a0d0e21e 1283 case SPACE:
4cadc6a9
YO
1284 REXEC_FBC_CSCAN_PRELOAD(
1285 LOAD_UTF8_CHARCLASS_SPACE(),
1286 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1287 isSPACE(*s)
1288 );
bbce6d69 1289 case SPACEL:
4cadc6a9
YO
1290 REXEC_FBC_CSCAN_TAINT(
1291 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1292 isSPACE_LC(*s)
1293 );
a0d0e21e 1294 case NSPACE:
4cadc6a9
YO
1295 REXEC_FBC_CSCAN_PRELOAD(
1296 LOAD_UTF8_CHARCLASS_SPACE(),
1297 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1298 !isSPACE(*s)
1299 );
bbce6d69 1300 case NSPACEL:
4cadc6a9
YO
1301 REXEC_FBC_CSCAN_TAINT(
1302 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1303 !isSPACE_LC(*s)
1304 );
a0d0e21e 1305 case DIGIT:
4cadc6a9
YO
1306 REXEC_FBC_CSCAN_PRELOAD(
1307 LOAD_UTF8_CHARCLASS_DIGIT(),
1308 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1309 isDIGIT(*s)
1310 );
b8c5462f 1311 case DIGITL:
4cadc6a9
YO
1312 REXEC_FBC_CSCAN_TAINT(
1313 isDIGIT_LC_utf8((U8*)s),
1314 isDIGIT_LC(*s)
1315 );
a0d0e21e 1316 case NDIGIT:
4cadc6a9
YO
1317 REXEC_FBC_CSCAN_PRELOAD(
1318 LOAD_UTF8_CHARCLASS_DIGIT(),
1319 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1320 !isDIGIT(*s)
1321 );
b8c5462f 1322 case NDIGITL:
4cadc6a9
YO
1323 REXEC_FBC_CSCAN_TAINT(
1324 !isDIGIT_LC_utf8((U8*)s),
1325 !isDIGIT_LC(*s)
1326 );
07be1b83
YO
1327 case TRIE:
1328 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1329 {
1330 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1331 trie_type = do_utf8 ?
1332 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1333 : trie_plain;
1334 /* what trie are we using right now */
1335 reg_ac_data *aho
1336 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1337 reg_trie_data *trie=aho->trie;
1338
1339 const char *last_start = strend - trie->minlen;
6148ee25 1340#ifdef DEBUGGING
07be1b83 1341 const char *real_start = s;
6148ee25 1342#endif
07be1b83 1343 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1344 SV *sv_points;
1345 U8 **points; /* map of where we were in the input string
1346 when reading a given string. For ASCII this
1347 is unnecessary overhead as the relationship
1348 is always 1:1, but for unicode, especially
1349 case folded unicode this is not true. */
f9e705e8 1350 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1351
1352 GET_RE_DEBUG_FLAGS_DECL;
1353
be8e71aa
YO
1354 /* We can't just allocate points here. We need to wrap it in
1355 * an SV so it gets freed properly if there is a croak while
1356 * running the match */
1357 ENTER;
1358 SAVETMPS;
1359 sv_points=newSV(maxlen * sizeof(U8 *));
1360 SvCUR_set(sv_points,
1361 maxlen * sizeof(U8 *));
1362 SvPOK_on(sv_points);
1363 sv_2mortal(sv_points);
1364 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1365
1366 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1367 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1368 s++;
1369 }
1370 }
1371
1372 while (s <= last_start) {
1373 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1374 U8 *uc = (U8*)s;
1375 U16 charid = 0;
1376 U32 base = 1;
1377 U32 state = 1;
1378 UV uvc = 0;
1379 STRLEN len = 0;
1380 STRLEN foldlen = 0;
1381 U8 *uscan = (U8*)NULL;
1382 U8 *leftmost = NULL;
1383
1384 U32 pointpos = 0;
1385
1386 while ( state && uc <= (U8*)strend ) {
1387 int failed=0;
1388 if (aho->states[ state ].wordnum) {
1389 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1390 if (!leftmost || lpos < leftmost)
1391 leftmost= lpos;
1392 if (base==0) break;
1393 }
1394 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1395 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1396 uvc, charid, foldlen, foldbuf, uniflags);
07be1b83
YO
1397 DEBUG_TRIE_EXECUTE_r(
1398 PerlIO_printf(Perl_debug_log,
1399 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1400 (int)((const char*)uc - real_start), charid, uvc)
1401 );
1402 uc += len;
1403
1404 do {
6148ee25 1405#ifdef DEBUGGING
07be1b83 1406 U32 word = aho->states[ state ].wordnum;
6148ee25 1407#endif
07be1b83
YO
1408 base = aho->states[ state ].trans.base;
1409
1410 DEBUG_TRIE_EXECUTE_r(
1411 PerlIO_printf( Perl_debug_log,
1412 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1413 failed ? "Fail transition to " : "",
e4f74956 1414 (UV)state, (UV)base, (UV)uvc, (UV)word)
07be1b83
YO
1415 );
1416 if ( base ) {
1417 U32 tmp;
1418 if (charid &&
1419 (base + charid > trie->uniquecharcount )
1420 && (base + charid - 1 - trie->uniquecharcount
1421 < trie->lasttrans)
1422 && trie->trans[base + charid - 1 -
1423 trie->uniquecharcount].check == state
1424 && (tmp=trie->trans[base + charid - 1 -
1425 trie->uniquecharcount ].next))
1426 {
1427 state = tmp;
1428 break;
1429 }
1430 else {
1431 failed++;
1432 if ( state == 1 )
1433 break;
1434 else
1435 state = aho->fail[state];
1436 }
1437 }
1438 else {
1439 /* we must be accepting here */
1440 failed++;
1441 break;
1442 }
1443 } while(state);
1444 if (failed) {
1445 if (leftmost)
1446 break;
1447 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1448 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1449 uc++;
1450 }
1451 }
1452 }
1453 }
1454 if ( aho->states[ state ].wordnum ) {
1455 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1456 if (!leftmost || lpos < leftmost)
1457 leftmost = lpos;
1458 }
1459 DEBUG_TRIE_EXECUTE_r(
1460 PerlIO_printf( Perl_debug_log,
1461 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1462 "All done: ",
e4f74956 1463 (UV)state, (UV)base, (UV)uvc)
07be1b83
YO
1464 );
1465 if (leftmost) {
1466 s = (char*)leftmost;
be8e71aa
YO
1467 if (!reginfo || regtry(reginfo, s)) {
1468 FREETMPS;
1469 LEAVE;
07be1b83 1470 goto got_it;
be8e71aa 1471 }
07be1b83
YO
1472 s = HOPc(s,1);
1473 } else {
1474 break;
1475 }
1476 }
be8e71aa
YO
1477 FREETMPS;
1478 LEAVE;
07be1b83
YO
1479 }
1480 break;
b3c9acc1 1481 default:
3c3eec57
GS
1482 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1483 break;
d6a28714 1484 }
6eb5f6b9
JH
1485 return 0;
1486 got_it:
1487 return s;
1488}
1489
1490/*
1491 - regexec_flags - match a regexp against a string
1492 */
1493I32
1494Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1495 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1496/* strend: pointer to null at end of string */
1497/* strbeg: real beginning of string */
1498/* minend: end of match must be >=minend after stringarg. */
1499/* data: May be used for some additional optimizations. */
1500/* nosave: For optimizations. */
1501{
97aff369 1502 dVAR;
6eb5f6b9
JH
1503 register char *s;
1504 register regnode *c;
1505 register char *startpos = stringarg;
6eb5f6b9
JH
1506 I32 minlen; /* must match at least this many chars */
1507 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1508 I32 end_shift = 0; /* Same for the end. */ /* CC */
1509 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1510 char *scream_olds = NULL;
3dab1dad 1511 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1512 const bool do_utf8 = DO_UTF8(sv);
2757e526 1513 I32 multiline;
0df25f3d 1514
3b0527fe 1515 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1516
1517 GET_RE_DEBUG_FLAGS_DECL;
1518
9d4ba2ae 1519 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1520
1521 /* Be paranoid... */
1522 if (prog == NULL || startpos == NULL) {
1523 Perl_croak(aTHX_ "NULL regexp parameter");
1524 return 0;
1525 }
1526
2757e526 1527 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1528 reginfo.prog = prog;
2757e526 1529
bac06658
JH
1530 RX_MATCH_UTF8_set(prog, do_utf8);
1531
6eb5f6b9 1532 minlen = prog->minlen;
61a36c01 1533 if (strend - startpos < minlen) {
a3621e74 1534 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1535 "String too short [regexec_flags]...\n"));
1536 goto phooey;
1aa99e6b 1537 }
6eb5f6b9 1538
6eb5f6b9
JH
1539 /* Check validity of program. */
1540 if (UCHARAT(prog->program) != REG_MAGIC) {
1541 Perl_croak(aTHX_ "corrupted regexp program");
1542 }
1543
1544 PL_reg_flags = 0;
1545 PL_reg_eval_set = 0;
1546 PL_reg_maxiter = 0;
1547
1548 if (prog->reganch & ROPT_UTF8)
1549 PL_reg_flags |= RF_utf8;
1550
1551 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1552 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1553 PL_bostr = strbeg;
3b0527fe 1554 reginfo.sv = sv;
6eb5f6b9
JH
1555
1556 /* Mark end of line for $ (and such) */
1557 PL_regeol = strend;
1558
1559 /* see how far we have to get to not match where we matched before */
3b0527fe 1560 reginfo.till = startpos+minend;
6eb5f6b9 1561
6eb5f6b9
JH
1562 /* If there is a "must appear" string, look for it. */
1563 s = startpos;
1564
3b0527fe 1565 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1566 MAGIC *mg;
1567
1568 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1569 reginfo.ganch = startpos;
6eb5f6b9
JH
1570 else if (sv && SvTYPE(sv) >= SVt_PVMG
1571 && SvMAGIC(sv)
14befaf4
DM
1572 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1573 && mg->mg_len >= 0) {
3b0527fe 1574 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1575 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1576 if (s > reginfo.ganch)
6eb5f6b9 1577 goto phooey;
3b0527fe 1578 s = reginfo.ganch;
6eb5f6b9
JH
1579 }
1580 }
1581 else /* pos() not defined */
3b0527fe 1582 reginfo.ganch = strbeg;
6eb5f6b9
JH
1583 }
1584
a0714e2c 1585 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1586 re_scream_pos_data d;
1587
1588 d.scream_olds = &scream_olds;
1589 d.scream_pos = &scream_pos;
1590 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1591 if (!s) {
a3621e74 1592 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1593 goto phooey; /* not present */
3fa9c3d7 1594 }
6eb5f6b9
JH
1595 }
1596
ab3bbdeb
YO
1597 DEBUG_EXECUTE_r(
1598 debug_start_match(prog, do_utf8, startpos, strend,
1599 "Matching");
2a782b5b 1600 );
6eb5f6b9
JH
1601
1602 /* Simplest case: anchored match need be tried only once. */
1603 /* [unless only anchor is BOL and multiline is set] */
1604 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1605 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1606 goto got_it;
7fba1cd6 1607 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1608 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1609 {
1610 char *end;
1611
1612 if (minlen)
1613 dontbother = minlen - 1;
1aa99e6b 1614 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1615 /* for multiline we only have to try after newlines */
33b8afdf 1616 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1617 if (s == startpos)
1618 goto after_try;
1619 while (1) {
3b0527fe 1620 if (regtry(&reginfo, s))
6eb5f6b9
JH
1621 goto got_it;
1622 after_try:
1623 if (s >= end)
1624 goto phooey;
1625 if (prog->reganch & RE_USE_INTUIT) {
1626 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1627 if (!s)
1628 goto phooey;
1629 }
1630 else
1631 s++;
1632 }
1633 } else {
1634 if (s > startpos)
1635 s--;
1636 while (s < end) {
1637 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1638 if (regtry(&reginfo, s))
6eb5f6b9
JH
1639 goto got_it;
1640 }
1641 }
1642 }
1643 }
1644 goto phooey;
1645 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1646 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1647 goto got_it;
1648 goto phooey;
1649 }
1650
1651 /* Messy cases: unanchored match. */
33b8afdf 1652 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1653 /* we have /x+whatever/ */
1654 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1655 char ch;
bf93d4cc
GS
1656#ifdef DEBUGGING
1657 int did_match = 0;
1658#endif
33b8afdf
JH
1659 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1660 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1661 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1662
1aa99e6b 1663 if (do_utf8) {
4cadc6a9 1664 REXEC_FBC_SCAN(
6eb5f6b9 1665 if (*s == ch) {
a3621e74 1666 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1667 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1668 s += UTF8SKIP(s);
1669 while (s < strend && *s == ch)
1670 s += UTF8SKIP(s);
1671 }
4cadc6a9 1672 );
6eb5f6b9
JH
1673 }
1674 else {
4cadc6a9 1675 REXEC_FBC_SCAN(
6eb5f6b9 1676 if (*s == ch) {
a3621e74 1677 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1678 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1679 s++;
1680 while (s < strend && *s == ch)
1681 s++;
1682 }
4cadc6a9 1683 );
6eb5f6b9 1684 }
a3621e74 1685 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1686 PerlIO_printf(Perl_debug_log,
b7953727
JH
1687 "Did not find anchored character...\n")
1688 );
6eb5f6b9 1689 }
a0714e2c
SS
1690 else if (prog->anchored_substr != NULL
1691 || prog->anchored_utf8 != NULL
1692 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1693 && prog->float_max_offset < strend - s)) {
1694 SV *must;
1695 I32 back_max;
1696 I32 back_min;
1697 char *last;
6eb5f6b9 1698 char *last1; /* Last position checked before */
bf93d4cc
GS
1699#ifdef DEBUGGING
1700 int did_match = 0;
1701#endif
33b8afdf
JH
1702 if (prog->anchored_substr || prog->anchored_utf8) {
1703 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1704 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1705 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1706 back_max = back_min = prog->anchored_offset;
1707 } else {
1708 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1709 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1710 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1711 back_max = prog->float_max_offset;
1712 back_min = prog->float_min_offset;
1713 }
1714 if (must == &PL_sv_undef)
1715 /* could not downgrade utf8 check substring, so must fail */
1716 goto phooey;
1717
1718 last = HOP3c(strend, /* Cannot start after this */
1719 -(I32)(CHR_SVLEN(must)
1720 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1721
1722 if (s > PL_bostr)
1723 last1 = HOPc(s, -1);
1724 else
1725 last1 = s - 1; /* bogus */
1726
a0288114 1727 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1728 check_substr==must. */
1729 scream_pos = -1;
1730 dontbother = end_shift;
1731 strend = HOPc(strend, -dontbother);
1732 while ( (s <= last) &&
9041c2e3 1733 ((flags & REXEC_SCREAM)
1aa99e6b 1734 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1735 end_shift, &scream_pos, 0))
1aa99e6b 1736 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1737 (unsigned char*)strend, must,
7fba1cd6 1738 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1739 /* we may be pointing at the wrong string */
1740 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1741 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1742 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1743 if (HOPc(s, -back_max) > last1) {
1744 last1 = HOPc(s, -back_min);
1745 s = HOPc(s, -back_max);
1746 }
1747 else {
52657f30 1748 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1749
1750 last1 = HOPc(s, -back_min);
52657f30 1751 s = t;
6eb5f6b9 1752 }
1aa99e6b 1753 if (do_utf8) {
6eb5f6b9 1754 while (s <= last1) {
3b0527fe 1755 if (regtry(&reginfo, s))
6eb5f6b9
JH
1756 goto got_it;
1757 s += UTF8SKIP(s);
1758 }
1759 }
1760 else {
1761 while (s <= last1) {
3b0527fe 1762 if (regtry(&reginfo, s))
6eb5f6b9
JH
1763 goto got_it;
1764 s++;
1765 }
1766 }
1767 }
ab3bbdeb
YO
1768 DEBUG_EXECUTE_r(if (!did_match) {
1769 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1770 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1771 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1772 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1773 ? "anchored" : "floating"),
ab3bbdeb
YO
1774 quoted, RE_SV_TAIL(must));
1775 });
6eb5f6b9
JH
1776 goto phooey;
1777 }
155aba94 1778 else if ((c = prog->regstclass)) {
f14c76ed 1779 if (minlen) {
be8e71aa 1780 const OPCODE op = OP(prog->regstclass);
66e933ab 1781 /* don't bother with what can't match */
07be1b83 1782 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
1783 strend = HOPc(strend, -(minlen - 1));
1784 }
a3621e74 1785 DEBUG_EXECUTE_r({
be8e71aa 1786 SV * const prop = sv_newmortal();
32fc9b6a 1787 regprop(prog, prop, c);
0df25f3d 1788 {
ab3bbdeb
YO
1789 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1790 s,strend-s,60);
0df25f3d 1791 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1792 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1793 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1794 quoted, (int)(strend - s));
0df25f3d 1795 }
ffc61ed2 1796 });
3b0527fe 1797 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1798 goto got_it;
07be1b83 1799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1800 }
1801 else {
1802 dontbother = 0;
a0714e2c 1803 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1804 /* Trim the end. */
d6a28714 1805 char *last;
33b8afdf
JH
1806 SV* float_real;
1807
1808 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1809 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1810 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1811
1812 if (flags & REXEC_SCREAM) {
33b8afdf 1813 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1814 end_shift, &scream_pos, 1); /* last one */
1815 if (!last)
ffc61ed2 1816 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1817 /* we may be pointing at the wrong string */
1818 else if (RX_MATCH_COPIED(prog))
3f7c398e 1819 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1820 }
d6a28714
JH
1821 else {
1822 STRLEN len;
cfd0369c 1823 const char * const little = SvPV_const(float_real, len);
d6a28714 1824
33b8afdf 1825 if (SvTAIL(float_real)) {
d6a28714
JH
1826 if (memEQ(strend - len + 1, little, len - 1))
1827 last = strend - len + 1;
7fba1cd6 1828 else if (!multiline)
9041c2e3 1829 last = memEQ(strend - len, little, len)
bd61b366 1830 ? strend - len : NULL;
b8c5462f 1831 else
d6a28714
JH
1832 goto find_last;
1833 } else {
1834 find_last:
9041c2e3 1835 if (len)
d6a28714 1836 last = rninstr(s, strend, little, little + len);
b8c5462f 1837 else
a0288114 1838 last = strend; /* matching "$" */
b8c5462f 1839 }
b8c5462f 1840 }
bf93d4cc 1841 if (last == NULL) {
a3621e74 1842 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1843 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1844 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1845 goto phooey; /* Should not happen! */
1846 }
d6a28714
JH
1847 dontbother = strend - last + prog->float_min_offset;
1848 }
1849 if (minlen && (dontbother < minlen))
1850 dontbother = minlen - 1;
1851 strend -= dontbother; /* this one's always in bytes! */
1852 /* We don't know much -- general case. */
1aa99e6b 1853 if (do_utf8) {
d6a28714 1854 for (;;) {
3b0527fe 1855 if (regtry(&reginfo, s))
d6a28714
JH
1856 goto got_it;
1857 if (s >= strend)
1858 break;
b8c5462f 1859 s += UTF8SKIP(s);
d6a28714
JH
1860 };
1861 }
1862 else {
1863 do {
3b0527fe 1864 if (regtry(&reginfo, s))
d6a28714
JH
1865 goto got_it;
1866 } while (s++ < strend);
1867 }
1868 }
1869
1870 /* Failure. */
1871 goto phooey;
1872
1873got_it:
1874 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1875
1876 if (PL_reg_eval_set) {
1877 /* Preserve the current value of $^R */
1878 if (oreplsv != GvSV(PL_replgv))
1879 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1880 restored, the value remains
1881 the same. */
4f639d21 1882 restore_pos(aTHX_ prog);
d6a28714
JH
1883 }
1884
1885 /* make sure $`, $&, $', and $digit will work later */
1886 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 1887 RX_MATCH_COPY_FREE(prog);
d6a28714 1888 if (flags & REXEC_COPY_STR) {
be8e71aa 1889 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 1890#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
1891 if ((SvIsCOW(sv)
1892 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1893 if (DEBUG_C_TEST) {
1894 PerlIO_printf(Perl_debug_log,
1895 "Copy on write: regexp capture, type %d\n",
1896 (int) SvTYPE(sv));
1897 }
1898 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 1899 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
1900 assert (SvPOKp(prog->saved_copy));
1901 } else
1902#endif
1903 {
1904 RX_MATCH_COPIED_on(prog);
1905 s = savepvn(strbeg, i);
1906 prog->subbeg = s;
1907 }
d6a28714 1908 prog->sublen = i;
d6a28714
JH
1909 }
1910 else {
1911 prog->subbeg = strbeg;
1912 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1913 }
1914 }
9041c2e3 1915
d6a28714
JH
1916 return 1;
1917
1918phooey:
a3621e74 1919 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 1920 PL_colors[4], PL_colors[5]));
d6a28714 1921 if (PL_reg_eval_set)
4f639d21 1922 restore_pos(aTHX_ prog);
d6a28714
JH
1923 return 0;
1924}
1925
1926/*
1927 - regtry - try match at specific point
1928 */
1929STATIC I32 /* 0 failure, 1 success */
3b0527fe 1930S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 1931{
97aff369 1932 dVAR;
d6a28714
JH
1933 register I32 *sp;
1934 register I32 *ep;
1935 CHECKPOINT lastcp;
3b0527fe 1936 regexp *prog = reginfo->prog;
a3621e74 1937 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 1938
02db2b7b
IZ
1939#ifdef DEBUGGING
1940 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1941#endif
d6a28714
JH
1942 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1943 MAGIC *mg;
1944
1945 PL_reg_eval_set = RS_init;
a3621e74 1946 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
1947 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1948 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 1949 ));
e8347627 1950 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
1951 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1952 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1953 SAVETMPS;
1954 /* Apparently this is not needed, judging by wantarray. */
e8347627 1955 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
1956 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1957
3b0527fe 1958 if (reginfo->sv) {
d6a28714 1959 /* Make $_ available to executed code. */
3b0527fe 1960 if (reginfo->sv != DEFSV) {
59f00321 1961 SAVE_DEFSV;
3b0527fe 1962 DEFSV = reginfo->sv;
b8c5462f 1963 }
d6a28714 1964
3b0527fe
DM
1965 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1966 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 1967 /* prepare for quick setting of pos */
d300d9fa
NC
1968#ifdef PERL_OLD_COPY_ON_WRITE
1969 if (SvIsCOW(sv))
1970 sv_force_normal_flags(sv, 0);
1971#endif
3dab1dad 1972 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 1973 &PL_vtbl_mglob, NULL, 0);
d6a28714 1974 mg->mg_len = -1;
b8c5462f 1975 }
d6a28714
JH
1976 PL_reg_magic = mg;
1977 PL_reg_oldpos = mg->mg_len;
4f639d21 1978 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 1979 }
09687e5a 1980 if (!PL_reg_curpm) {
a02a5408 1981 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
1982#ifdef USE_ITHREADS
1983 {
be8e71aa 1984 SV* const repointer = newSViv(0);
577e12cc 1985 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 1986 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
1987 av_push(PL_regex_padav,repointer);
1988 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1989 PL_regex_pad = AvARRAY(PL_regex_padav);
1990 }
1991#endif
1992 }
aaa362c4 1993 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
1994 PL_reg_oldcurpm = PL_curpm;
1995 PL_curpm = PL_reg_curpm;
1996 if (RX_MATCH_COPIED(prog)) {
1997 /* Here is a serious problem: we cannot rewrite subbeg,
1998 since it may be needed if this match fails. Thus
1999 $` inside (?{}) could fail... */
2000 PL_reg_oldsaved = prog->subbeg;
2001 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2002#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2003 PL_nrs = prog->saved_copy;
2004#endif
d6a28714
JH
2005 RX_MATCH_COPIED_off(prog);
2006 }
2007 else
bd61b366 2008 PL_reg_oldsaved = NULL;
d6a28714
JH
2009 prog->subbeg = PL_bostr;
2010 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2011 }
973dddac 2012 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2013 PL_reginput = startpos;
2014 PL_regstartp = prog->startp;
2015 PL_regendp = prog->endp;
2016 PL_reglastparen = &prog->lastparen;
a01268b5 2017 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2018 prog->lastparen = 0;
03994de8 2019 prog->lastcloseparen = 0;
d6a28714 2020 PL_regsize = 0;
a3621e74 2021 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2022 if (PL_reg_start_tmpl <= prog->nparens) {
2023 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2024 if(PL_reg_start_tmp)
2025 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2026 else
a02a5408 2027 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2028 }
2029
2030 /* XXXX What this code is doing here?!!! There should be no need
2031 to do this again and again, PL_reglastparen should take care of
3dd2943c 2032 this! --ilya*/
dafc8851
JH
2033
2034 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2035 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2036 * PL_reglastparen), is not needed at all by the test suite
2037 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2038 * enough, for building DynaLoader, or otherwise this
2039 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2040 * will happen. Meanwhile, this code *is* needed for the
2041 * above-mentioned test suite tests to succeed. The common theme
2042 * on those tests seems to be returning null fields from matches.
2043 * --jhi */
dafc8851 2044#if 1
d6a28714
JH
2045 sp = prog->startp;
2046 ep = prog->endp;
2047 if (prog->nparens) {
097eb12c 2048 register I32 i;
eb160463 2049 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2050 *++sp = -1;
2051 *++ep = -1;
2052 }
2053 }
dafc8851 2054#endif
02db2b7b 2055 REGCP_SET(lastcp);
3b0527fe 2056 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2057 prog->endp[0] = PL_reginput - PL_bostr;
2058 return 1;
2059 }
02db2b7b 2060 REGCP_UNWIND(lastcp);
d6a28714
JH
2061 return 0;
2062}
2063
02db2b7b 2064
8ba1375e
MJD
2065#define sayYES goto yes
2066#define sayNO goto no
e0f9d4a8 2067#define sayNO_ANYOF goto no_anyof
8ba1375e 2068#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2069#define sayNO_FINAL goto no_final
2070#define sayNO_SILENT goto do_no
2071#define saySAME(x) if (x) goto yes; else goto no
2072
3ab3c9b4 2073#define CACHEsayNO STMT_START { \
3298f257
DM
2074 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2075 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2076 (1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2077 sayNO; \
2078} STMT_END
2079
3298f257 2080
a3621e74
YO
2081/* this is used to determine how far from the left messages like
2082 'failed...' are printed. Currently 29 makes these messages line
2083 up with the opcode they refer to. Earlier perls used 25 which
2084 left these messages outdented making reviewing a debug output
2085 quite difficult.
2086*/
2087#define REPORT_CODE_OFF 29
2088
2089
2090/* Make sure there is a test for this +1 options in re_tests */
2091#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2092
40a82448
DM
2093#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2094#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2095
86545054
DM
2096#define SLAB_FIRST(s) (&(s)->states[0])
2097#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2098
5d9a96ca
DM
2099/* grab a new slab and return the first slot in it */
2100
2101STATIC regmatch_state *
2102S_push_slab(pTHX)
2103{
54df2634
NC
2104#if PERL_VERSION < 9
2105 dMY_CXT;
2106#endif
5d9a96ca
DM
2107 regmatch_slab *s = PL_regmatch_slab->next;
2108 if (!s) {
2109 Newx(s, 1, regmatch_slab);
2110 s->prev = PL_regmatch_slab;
2111 s->next = NULL;
2112 PL_regmatch_slab->next = s;
2113 }
2114 PL_regmatch_slab = s;
86545054 2115 return SLAB_FIRST(s);
5d9a96ca 2116}
5b47454d 2117
95b24440
DM
2118/* simulate a recursive call to regmatch */
2119
2120#define REGMATCH(ns, where) \
5d9a96ca
DM
2121 st->scan = scan; \
2122 scan = (ns); \
2123 st->resume_state = resume_##where; \
95b24440
DM
2124 goto start_recurse; \
2125 resume_point_##where:
2126
40a82448
DM
2127/* push a new state then goto it */
2128
2129#define PUSH_STATE_GOTO(state, node) \
2130 scan = node; \
2131 st->resume_state = state; \
2132 goto push_state;
2133
2134/* push a new state with success backtracking, then goto it */
2135
2136#define PUSH_YES_STATE_GOTO(state, node) \
2137 scan = node; \
2138 st->resume_state = state; \
2139 goto push_yes_state;
2140
aa283a38 2141
aa283a38 2142
d6a28714
JH
2143/*
2144 - regmatch - main matching routine
2145 *
2146 * Conceptually the strategy is simple: check to see whether the current
2147 * node matches, call self recursively to see whether the rest matches,
2148 * and then act accordingly. In practice we make some effort to avoid
2149 * recursion, in particular by going through "ordinary" nodes (that don't
2150 * need to know whether the rest of the match failed) by a loop instead of
2151 * by recursion.
2152 */
2153/* [lwall] I've hoisted the register declarations to the outer block in order to
2154 * maybe save a little bit of pushing and popping on the stack. It also takes
2155 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2156 *
2157 * This function used to be heavily recursive, but since this had the
2158 * effect of blowing the CPU stack on complex regexes, it has been
2159 * restructured to be iterative, and to save state onto the heap rather
2160 * than the stack. Essentially whereever regmatch() used to be called, it
2161 * pushes the current state, notes where to return, then jumps back into
2162 * the main loop.
2163 *
2164 * Originally the structure of this function used to look something like
2165
2166 S_regmatch() {
2167 int a = 1, b = 2;
2168 ...
2169 while (scan != NULL) {
5d9a96ca 2170 a++; // do stuff with a and b
95b24440
DM
2171 ...
2172 switch (OP(scan)) {
2173 case FOO: {
2174 int local = 3;
2175 ...
2176 if (regmatch(...)) // recurse
2177 goto yes;
2178 }
2179 ...
2180 }
2181 }
2182 yes:
2183 return 1;
2184 }
2185
2186 * Now it looks something like this:
2187
5d9a96ca 2188 typedef struct {
95b24440
DM
2189 int a, b, local;
2190 int resume_state;
5d9a96ca 2191 } regmatch_state;
95b24440
DM
2192
2193 S_regmatch() {
5d9a96ca
DM
2194 regmatch_state *st = new();
2195 int depth=0;
2196 st->a++; // do stuff with a and b
95b24440
DM
2197 ...
2198 while (scan != NULL) {
2199 ...
2200 switch (OP(scan)) {
2201 case FOO: {
5d9a96ca 2202 st->local = 3;
95b24440 2203 ...
5d9a96ca
DM
2204 st->scan = scan;
2205 scan = ...;
2206 st->resume_state = resume_FOO;
2207 goto start_recurse; // recurse
95b24440 2208
5d9a96ca
DM
2209 resume_point_FOO:
2210 if (result)
95b24440
DM
2211 goto yes;
2212 }
2213 ...
2214 }
5d9a96ca
DM
2215 start_recurse:
2216 st = new(); push a new state
2217 st->a = 1; st->b = 2;
2218 depth++;
95b24440 2219 }
5d9a96ca 2220 yes:
95b24440 2221 result = 1;
5d9a96ca
DM
2222 if (depth--) {
2223 st = pop();
95b24440
DM
2224 switch (resume_state) {
2225 case resume_FOO:
2226 goto resume_point_FOO;
2227 ...
2228 }
2229 }
2230 return result
2231 }
2232
2233 * WARNING: this means that any line in this function that contains a
2234 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2235 * regmatch() using gotos instead. Thus the values of any local variables
2236 * not saved in the regmatch_state structure will have been lost when
2237 * execution resumes on the next line .
5d9a96ca
DM
2238 *
2239 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2240 * PL_regmatch_state always points to the currently active state, and
2241 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2242 * The first time regmatch is called, the first slab is allocated, and is
2243 * never freed until interpreter desctruction. When the slab is full,
2244 * a new one is allocated chained to the end. At exit from regmatch, slabs
2245 * allocated since entry are freed.
d6a28714 2246 */
95b24440 2247
40a82448 2248/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2249#define TRIE_next (REGNODE_MAX+1)
2250#define TRIE_next_fail (REGNODE_MAX+2)
2251#define EVAL_A (REGNODE_MAX+3)
2252#define EVAL_A_fail (REGNODE_MAX+4)
2253#define resume_CURLYX (REGNODE_MAX+5)
2254#define resume_WHILEM1 (REGNODE_MAX+6)
2255#define resume_WHILEM2 (REGNODE_MAX+7)
2256#define resume_WHILEM3 (REGNODE_MAX+8)
2257#define resume_WHILEM4 (REGNODE_MAX+9)
2258#define resume_WHILEM5 (REGNODE_MAX+10)
2259#define resume_WHILEM6 (REGNODE_MAX+11)
2260#define BRANCH_next (REGNODE_MAX+12)
2261#define BRANCH_next_fail (REGNODE_MAX+13)
2262#define CURLYM_A (REGNODE_MAX+14)
2263#define CURLYM_A_fail (REGNODE_MAX+15)
2264#define CURLYM_B (REGNODE_MAX+16)
2265#define CURLYM_B_fail (REGNODE_MAX+17)
2266#define IFMATCH_A (REGNODE_MAX+18)
2267#define IFMATCH_A_fail (REGNODE_MAX+19)
2268#define CURLY_B_min_known (REGNODE_MAX+20)
2269#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2270#define CURLY_B_min (REGNODE_MAX+22)
2271#define CURLY_B_min_fail (REGNODE_MAX+23)
2272#define CURLY_B_max (REGNODE_MAX+24)
2273#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2274
2275
3dab1dad 2276#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2277
3df15adc 2278#ifdef DEBUGGING
ab3bbdeb
YO
2279STATIC void
2280S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2281 const char *start, const char *end, const char *blurb)
2282{
2283 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2284 if (!PL_colorset)
2285 reginitcolors();
2286 {
2287 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2288 prog->precomp, prog->prelen, 60);
2289
2290 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2291 start, end - start, 60);
2292
2293 PerlIO_printf(Perl_debug_log,
2294 "%s%s REx%s %s against %s\n",
2295 PL_colors[4], blurb, PL_colors[5], s0, s1);
2296
2297 if (do_utf8||utf8_pat)
2298 PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
2299 !do_utf8 ? "pattern" : !utf8_pat ? "string" :
2300 "pattern and string"
2301 );
2302 }
2303}
3df15adc
YO
2304
2305STATIC void
07be1b83
YO
2306S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2307{
2308 const int docolor = *PL_colors[0];
2309 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2310 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2311 /* The part of the string before starttry has one color
2312 (pref0_len chars), between starttry and current
2313 position another one (pref_len - pref0_len chars),
2314 after the current position the third one.
2315 We assume that pref0_len <= pref_len, otherwise we
2316 decrease pref0_len. */
2317 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2318 ? (5 + taill) - l : locinput - PL_bostr;
2319 int pref0_len;
2320
2321 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2322 pref_len++;
2323 pref0_len = pref_len - (locinput - PL_reg_starttry);
2324 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2325 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2326 ? (5 + taill) - pref_len : PL_regeol - locinput);
2327 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2328 l--;
2329 if (pref0_len < 0)
2330 pref0_len = 0;
2331 if (pref0_len > pref_len)
2332 pref0_len = pref_len;
2333 {
3df15adc 2334 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2335
ab3bbdeb
YO
2336 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2337 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2338
ab3bbdeb 2339 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2340 (locinput - pref_len + pref0_len),
ab3bbdeb 2341 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2342
ab3bbdeb
YO
2343 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2344 locinput, PL_regeol - locinput, 60, 0, 1);
0df25f3d 2345
3df15adc 2346 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2347 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
07be1b83 2348 (IV)(locinput - PL_bostr),
07be1b83 2349 len0, s0,
07be1b83 2350 len1, s1,
07be1b83 2351 (docolor ? "" : "> <"),
07be1b83 2352 len2, s2,
07be1b83
YO
2353 15 - l - pref_len + 1,
2354 "");
2355 }
2356}
3df15adc 2357
07be1b83
YO
2358#endif
2359
d6a28714 2360STATIC I32 /* 0 failure, 1 success */
3b0527fe 2361S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2362{
54df2634
NC
2363#if PERL_VERSION < 9
2364 dMY_CXT;
2365#endif
27da23d5 2366 dVAR;
95b24440 2367 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2368 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2369
3b0527fe
DM
2370 regexp *rex = reginfo->prog;
2371
5d9a96ca
DM
2372 regmatch_slab *orig_slab;
2373 regmatch_state *orig_state;
a3621e74 2374
5d9a96ca
DM
2375 /* the current state. This is a cached copy of PL_regmatch_state */
2376 register regmatch_state *st;
95b24440 2377
5d9a96ca
DM
2378 /* cache heavy used fields of st in registers */
2379 register regnode *scan;
2380 register regnode *next;
2381 register I32 n = 0; /* initialize to shut up compiler warning */
2382 register char *locinput = PL_reginput;
95b24440 2383
5d9a96ca
DM
2384 /* these variables are NOT saved during a recusive RFEGMATCH: */
2385 register I32 nextchr; /* is always set to UCHARAT(locinput) */
b69b0499 2386 bool result = 0; /* return value of S_regmatch */
5d9a96ca 2387 int depth = 0; /* depth of recursion */
77cb431f
DM
2388 regmatch_state *yes_state = NULL; /* state to pop to on success of
2389 subpattern */
40a82448 2390 U32 state_num;
95b24440 2391
10edeb5d
JH
2392 I32 parenfloor = 0;
2393
95b24440 2394#ifdef DEBUGGING
e68ec53f 2395 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2396 PL_regindent++;
2397#endif
2398
5d9a96ca
DM
2399 /* on first ever call to regmatch, allocate first slab */
2400 if (!PL_regmatch_slab) {
2401 Newx(PL_regmatch_slab, 1, regmatch_slab);
2402 PL_regmatch_slab->prev = NULL;
2403 PL_regmatch_slab->next = NULL;
86545054 2404 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2405 }
2406
2407 /* remember current high-water mark for exit */
2408 /* XXX this should be done with SAVE* instead */
2409 orig_slab = PL_regmatch_slab;
2410 orig_state = PL_regmatch_state;
2411
2412 /* grab next free state slot */
2413 st = ++PL_regmatch_state;
86545054 2414 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2415 st = PL_regmatch_state = S_push_slab(aTHX);
2416
2417 st->minmod = 0;
2418 st->sw = 0;
2419 st->logical = 0;
5d9a96ca 2420 st->cc = NULL;
d6a28714
JH
2421 /* Note that nextchr is a byte even in UTF */
2422 nextchr = UCHARAT(locinput);
2423 scan = prog;
2424 while (scan != NULL) {
8ba1375e 2425
a3621e74 2426 DEBUG_EXECUTE_r( {
6136c704 2427 SV * const prop = sv_newmortal();
07be1b83 2428 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2429 regprop(rex, prop, scan);
07be1b83
YO
2430
2431 PerlIO_printf(Perl_debug_log,
2432 "%3"IVdf":%*s%s(%"IVdf")\n",
2433 (IV)(scan - rex->program), PL_regindent*2, "",
2434 SvPVX_const(prop),
2435 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2436 });
d6a28714
JH
2437
2438 next = scan + NEXT_OFF(scan);
2439 if (next == scan)
2440 next = NULL;
40a82448 2441 state_num = OP(scan);
d6a28714 2442
40a82448
DM
2443 reenter_switch:
2444 switch (state_num) {
d6a28714 2445 case BOL:
7fba1cd6 2446 if (locinput == PL_bostr)
d6a28714 2447 {
3b0527fe 2448 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2449 break;
2450 }
d6a28714
JH
2451 sayNO;
2452 case MBOL:
12d33761
HS
2453 if (locinput == PL_bostr ||
2454 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2455 {
b8c5462f
JH
2456 break;
2457 }
d6a28714
JH
2458 sayNO;
2459 case SBOL:
c2a73568 2460 if (locinput == PL_bostr)
b8c5462f 2461 break;
d6a28714
JH
2462 sayNO;
2463 case GPOS:
3b0527fe 2464 if (locinput == reginfo->ganch)
d6a28714
JH
2465 break;
2466 sayNO;
2467 case EOL:
d6a28714
JH
2468 goto seol;
2469 case MEOL:
d6a28714 2470 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2471 sayNO;
b8c5462f 2472 break;
d6a28714
JH
2473 case SEOL:
2474 seol:
2475 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2476 sayNO;
d6a28714 2477 if (PL_regeol - locinput > 1)
b8c5462f 2478 sayNO;
b8c5462f 2479 break;
d6a28714
JH
2480 case EOS:
2481 if (PL_regeol != locinput)
b8c5462f 2482 sayNO;
d6a28714 2483 break;
ffc61ed2 2484 case SANY:
d6a28714 2485 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2486 sayNO;
f33976b4
DB
2487 if (do_utf8) {
2488 locinput += PL_utf8skip[nextchr];
2489 if (locinput > PL_regeol)
2490 sayNO;
2491 nextchr = UCHARAT(locinput);
2492 }
2493 else
2494 nextchr = UCHARAT(++locinput);
2495 break;
2496 case CANY:
2497 if (!nextchr && locinput >= PL_regeol)
2498 sayNO;
b8c5462f 2499 nextchr = UCHARAT(++locinput);
a0d0e21e 2500 break;
ffc61ed2 2501 case REG_ANY:
1aa99e6b
IH
2502 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2503 sayNO;
2504 if (do_utf8) {
b8c5462f 2505 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2506 if (locinput > PL_regeol)
2507 sayNO;
a0ed51b3 2508 nextchr = UCHARAT(locinput);
a0ed51b3 2509 }
1aa99e6b
IH
2510 else
2511 nextchr = UCHARAT(++locinput);
a0ed51b3 2512 break;
166ba7cd
DM
2513
2514#undef ST
2515#define ST st->u.trie
2516
5b47454d 2517 case TRIE:
3dab1dad 2518 {
07be1b83 2519 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2520 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2521 trie_type = do_utf8 ?
2522 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2523 : trie_plain;
2524
2525 /* what trie are we using right now */
be8e71aa 2526 reg_trie_data * const trie
3dab1dad
YO
2527 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2528 U32 state = trie->startstate;
166ba7cd
DM
2529
2530 U8 *uc = ( U8* )locinput;
2531 U16 charid = 0;
2532 U32 base = 0;
2533 UV uvc = 0;
2534 STRLEN len = 0;
2535 STRLEN foldlen = 0;
2536 U8 *uscan = (U8*)NULL;
2537 STRLEN bufflen=0;
2538 SV *sv_accept_buff = NULL;
2539 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2540
2541 ST.accepted = 0; /* how many accepting states we have seen */
2542 ST.B = next;
2543#ifdef DEBUGGING
2544 ST.me = scan;
2545#endif
3dab1dad
YO
2546
2547 if (trie->bitmap && trie_type != trie_utf8_fold &&
2548 !TRIE_BITMAP_TEST(trie,*locinput)
2549 ) {
2550 if (trie->states[ state ].wordnum) {
2551 DEBUG_EXECUTE_r(
2552 PerlIO_printf(Perl_debug_log,
2553 "%*s %smatched empty string...%s\n",
2554 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2555 );
2556 break;
2557 } else {
2558 DEBUG_EXECUTE_r(
2559 PerlIO_printf(Perl_debug_log,
2560 "%*s %sfailed to match start class...%s\n",
2561 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2562 );
2563 sayNO_SILENT;
2564 }
2565 }
166ba7cd 2566
07be1b83
YO
2567 /*
2568 traverse the TRIE keeping track of all accepting states
2569 we transition through until we get to a failing node.
2570 */
2571
a3621e74
YO
2572 while ( state && uc <= (U8*)PL_regeol ) {
2573
5b47454d 2574 if (trie->states[ state ].wordnum) {
166ba7cd 2575 if (!ST.accepted ) {
5b47454d
DM
2576 ENTER;
2577 SAVETMPS;
2578 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2579 sv_accept_buff=newSV(bufflen *
2580 sizeof(reg_trie_accepted) - 1);
2581 SvCUR_set(sv_accept_buff,
2582 sizeof(reg_trie_accepted));
2583 SvPOK_on(sv_accept_buff);
2584 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2585 SAVETMPS;
2586 ST.accept_buff =
5b47454d
DM
2587 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2588 }
2589 else {
166ba7cd 2590 if (ST.accepted >= bufflen) {
5b47454d 2591 bufflen *= 2;
166ba7cd 2592 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2593 SvGROW(sv_accept_buff,
2594 bufflen * sizeof(reg_trie_accepted));
2595 }
2596 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2597 + sizeof(reg_trie_accepted));
2598 }
166ba7cd
DM
2599 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2600 ST.accept_buff[ST.accepted].endpos = uc;
2601 ++ST.accepted;
5b47454d 2602 }
a3621e74
YO
2603
2604 base = trie->states[ state ].trans.base;
2605
07be1b83
YO
2606 DEBUG_TRIE_EXECUTE_r({
2607 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2608 PerlIO_printf( Perl_debug_log,
e4584336 2609 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2610 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2611 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2612 });
a3621e74
YO
2613
2614 if ( base ) {
4cadc6a9
YO
2615 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2616 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2617
5b47454d
DM
2618 if (charid &&
2619 (base + charid > trie->uniquecharcount )
2620 && (base + charid - 1 - trie->uniquecharcount
2621 < trie->lasttrans)
2622 && trie->trans[base + charid - 1 -
2623 trie->uniquecharcount].check == state)
2624 {
2625 state = trie->trans[base + charid - 1 -
2626 trie->uniquecharcount ].next;
2627 }
2628 else {
2629 state = 0;
2630 }
2631 uc += len;
2632
2633 }
2634 else {
a3621e74
YO
2635 state = 0;
2636 }
2637 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2638 PerlIO_printf( Perl_debug_log,
2639 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2640 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2641 );
2642 }
166ba7cd 2643 if (!ST.accepted )
a3621e74 2644 sayNO;
a3621e74 2645
166ba7cd
DM
2646 DEBUG_EXECUTE_r(
2647 PerlIO_printf( Perl_debug_log,
2648 "%*s %sgot %"IVdf" possible matches%s\n",
2649 REPORT_CODE_OFF + PL_regindent * 2, "",
2650 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2651 );
2652 }
2653
2654 /* FALL THROUGH */
2655
2656 case TRIE_next_fail: /* we failed - try next alterative */
2657
2658 if ( ST.accepted == 1 ) {
2659 /* only one choice left - just continue */
2660 DEBUG_EXECUTE_r({
2661 reg_trie_data * const trie
2662 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2663 SV ** const tmp = RX_DEBUG(reginfo->prog)
2664 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2665 : NULL;
2666 PerlIO_printf( Perl_debug_log,
2667 "%*s %sonly one match left: #%d <%s>%s\n",
2668 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2669 ST.accept_buff[ 0 ].wordnum,
2670 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2671 PL_colors[5] );
2672 });
2673 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2674 /* in this case we free tmps/leave before we call regmatch
2675 as we wont be using accept_buff again. */
2676 FREETMPS;
2677 LEAVE;
2678 locinput = PL_reginput;
2679 nextchr = UCHARAT(locinput);
2680 scan = ST.B;
2681 continue; /* execute rest of RE */
2682 }
2683
2684 if (!ST.accepted-- ) {
2685 FREETMPS;
2686 LEAVE;
2687 sayNO;
2688 }
2689
a3621e74 2690 /*
166ba7cd
DM
2691 There are at least two accepting states left. Presumably
2692 the number of accepting states is going to be low,
2693 typically two. So we simply scan through to find the one
2694 with lowest wordnum. Once we find it, we swap the last
2695 state into its place and decrement the size. We then try to
2696 match the rest of the pattern at the point where the word
2697 ends. If we succeed, control just continues along the
2698 regex; if we fail we return here to try the next accepting
2699 state
2700 */
a3621e74 2701
166ba7cd
DM
2702 {
2703 U32 best = 0;
2704 U32 cur;
2705 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2706 DEBUG_TRIE_EXECUTE_r(
f2278c82 2707 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
2708 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2709 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2710 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2711 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2712 );
2713
2714 if (ST.accept_buff[cur].wordnum <
2715 ST.accept_buff[best].wordnum)
2716 best = cur;
a3621e74 2717 }
166ba7cd
DM
2718
2719 DEBUG_EXECUTE_r({
2720 reg_trie_data * const trie
2721 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2722 SV ** const tmp = RX_DEBUG(reginfo->prog)
2723 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2724 : NULL;
2725 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2726 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2727 ST.accept_buff[best].wordnum,
2728 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2729 PL_colors[5] );
2730 });
2731
2732 if ( best<ST.accepted ) {
2733 reg_trie_accepted tmp = ST.accept_buff[ best ];
2734 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2735 ST.accept_buff[ ST.accepted ] = tmp;
2736 best = ST.accepted;
a3621e74 2737 }
166ba7cd
DM
2738 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2739 }
2740 PUSH_STATE_GOTO(TRIE_next, ST.B);
2741 /* NOTREACHED */
2742
2743#undef ST
2744
95b24440
DM
2745 case EXACT: {
2746 char *s = STRING(scan);
5d9a96ca 2747 st->ln = STR_LEN(scan);
eb160463 2748 if (do_utf8 != UTF) {
bc517b45 2749 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2750 char *l = locinput;
be8e71aa 2751 const char * const e = s + st->ln;
a72c7584 2752
5ff6fc6d
JH
2753 if (do_utf8) {
2754 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2755 while (s < e) {
a3b680e6 2756 STRLEN ulen;
1aa99e6b 2757 if (l >= PL_regeol)
5ff6fc6d
JH
2758 sayNO;
2759 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2760 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2761 uniflags))
5ff6fc6d 2762 sayNO;
bc517b45 2763 l += ulen;
5ff6fc6d 2764 s ++;
1aa99e6b 2765 }
5ff6fc6d
JH
2766 }
2767 else {
2768 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2769 while (s < e) {
a3b680e6 2770 STRLEN ulen;
1aa99e6b
IH
2771 if (l >= PL_regeol)
2772 sayNO;
5ff6fc6d 2773 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2774 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2775 uniflags))
1aa99e6b 2776 sayNO;
bc517b45 2777 s += ulen;
a72c7584 2778 l ++;
1aa99e6b 2779 }
5ff6fc6d 2780 }
1aa99e6b
IH
2781 locinput = l;
2782 nextchr = UCHARAT(locinput);
2783 break;
2784 }
bc517b45 2785 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2786 /* Inline the first character, for speed. */
2787 if (UCHARAT(s) != nextchr)
2788 sayNO;
5d9a96ca 2789 if (PL_regeol - locinput < st->ln)
d6a28714 2790 sayNO;
5d9a96ca 2791 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2792 sayNO;
5d9a96ca 2793 locinput += st->ln;
d6a28714
JH
2794 nextchr = UCHARAT(locinput);
2795 break;
95b24440 2796 }
d6a28714 2797 case EXACTFL:
b8c5462f
JH
2798 PL_reg_flags |= RF_tainted;
2799 /* FALL THROUGH */
95b24440 2800 case EXACTF: {
be8e71aa 2801 char * const s = STRING(scan);
5d9a96ca 2802 st->ln = STR_LEN(scan);
d6a28714 2803
d07ddd77
JH
2804 if (do_utf8 || UTF) {
2805 /* Either target or the pattern are utf8. */
be8e71aa 2806 const char * const l = locinput;
d07ddd77 2807 char *e = PL_regeol;
bc517b45 2808
5d9a96ca 2809 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2810 l, &e, 0, do_utf8)) {
5486206c
JH
2811 /* One more case for the sharp s:
2812 * pack("U0U*", 0xDF) =~ /ss/i,
2813 * the 0xC3 0x9F are the UTF-8
2814 * byte sequence for the U+00DF. */
2815 if (!(do_utf8 &&
2816 toLOWER(s[0]) == 's' &&
5d9a96ca 2817 st->ln >= 2 &&
5486206c
JH
2818 toLOWER(s[1]) == 's' &&
2819 (U8)l[0] == 0xC3 &&
2820 e - l >= 2 &&
2821 (U8)l[1] == 0x9F))
2822 sayNO;
2823 }
d07ddd77
JH
2824 locinput = e;
2825 nextchr = UCHARAT(locinput);
2826 break;
a0ed51b3 2827 }
d6a28714 2828
bc517b45
JH
2829 /* Neither the target and the pattern are utf8. */
2830
d6a28714
JH
2831 /* Inline the first character, for speed. */
2832 if (UCHARAT(s) != nextchr &&
2833 UCHARAT(s) != ((OP(scan) == EXACTF)
2834 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2835 sayNO;
5d9a96ca 2836 if (PL_regeol - locinput < st->ln)
b8c5462f 2837 sayNO;
5d9a96ca
DM
2838 if (st->ln > 1 && (OP(scan) == EXACTF
2839 ? ibcmp(s, locinput, st->ln)
2840 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2841 sayNO;
5d9a96ca 2842 locinput += st->ln;
d6a28714 2843 nextchr = UCHARAT(locinput);
a0d0e21e 2844 break;
95b24440 2845 }
d6a28714 2846 case ANYOF:
ffc61ed2 2847 if (do_utf8) {
9e55ce06
JH
2848 STRLEN inclasslen = PL_regeol - locinput;
2849
32fc9b6a 2850 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2851 sayNO_ANYOF;
ffc61ed2
JH
2852 if (locinput >= PL_regeol)
2853 sayNO;
0f0076b4 2854 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2855 nextchr = UCHARAT(locinput);
e0f9d4a8 2856 break;
ffc61ed2
JH
2857 }
2858 else {
2859 if (nextchr < 0)
2860 nextchr = UCHARAT(locinput);
32fc9b6a 2861 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 2862 sayNO_ANYOF;
ffc61ed2
JH
2863 if (!nextchr && locinput >= PL_regeol)
2864 sayNO;
2865 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2866 break;
2867 }
2868 no_anyof:
2869 /* If we might have the case of the German sharp s
2870 * in a casefolding Unicode character class. */
2871
ebc501f0
JH
2872 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2873 locinput += SHARP_S_SKIP;
e0f9d4a8 2874 nextchr = UCHARAT(locinput);
ffc61ed2 2875 }
e0f9d4a8
JH
2876 else
2877 sayNO;
b8c5462f 2878 break;
d6a28714 2879 case ALNUML:
b8c5462f
JH
2880 PL_reg_flags |= RF_tainted;
2881 /* FALL THROUGH */
d6a28714 2882 case ALNUM:
b8c5462f 2883 if (!nextchr)
4633a7c4 2884 sayNO;
ffc61ed2 2885 if (do_utf8) {
1a4fad37 2886 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2887 if (!(OP(scan) == ALNUM
bb7a0f54 2888 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2889 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2890 {
2891 sayNO;
a0ed51b3 2892 }
b8c5462f 2893 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2894 nextchr = UCHARAT(locinput);
2895 break;
2896 }
ffc61ed2 2897 if (!(OP(scan) == ALNUM
d6a28714 2898 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2899 sayNO;
b8c5462f 2900 nextchr = UCHARAT(++locinput);
a0d0e21e 2901 break;
d6a28714 2902 case NALNUML:
b8c5462f
JH
2903 PL_reg_flags |= RF_tainted;
2904 /* FALL THROUGH */
d6a28714
JH
2905 case NALNUM:
2906 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2907 sayNO;
ffc61ed2 2908 if (do_utf8) {
1a4fad37 2909 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2910 if (OP(scan) == NALNUM
bb7a0f54 2911 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2912 : isALNUM_LC_utf8((U8*)locinput))
2913 {
b8c5462f 2914 sayNO;
d6a28714 2915 }
b8c5462f
JH
2916 locinput += PL_utf8skip[nextchr];
2917 nextchr = UCHARAT(locinput);
2918 break;
2919 }
ffc61ed2 2920 if (OP(scan) == NALNUM
d6a28714 2921 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2922 sayNO;
76e3520e 2923 nextchr = UCHARAT(++locinput);
a0d0e21e 2924 break;
d6a28714
JH
2925 case BOUNDL:
2926 case NBOUNDL:
3280af22 2927 PL_reg_flags |= RF_tainted;
bbce6d69 2928 /* FALL THROUGH */
d6a28714
JH
2929 case BOUND:
2930 case NBOUND:
2931 /* was last char in word? */
ffc61ed2 2932 if (do_utf8) {
12d33761 2933 if (locinput == PL_bostr)
5d9a96ca 2934 st->ln = '\n';
ffc61ed2 2935 else {
a3b680e6 2936 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2937
4ad0818d 2938 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
2939 }
2940 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2941 st->ln = isALNUM_uni(st->ln);
1a4fad37 2942 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2943 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2944 }
2945 else {
5d9a96ca 2946 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
2947 n = isALNUM_LC_utf8((U8*)locinput);
2948 }
a0ed51b3 2949 }
d6a28714 2950 else {
5d9a96ca 2951 st->ln = (locinput != PL_bostr) ?
12d33761 2952 UCHARAT(locinput - 1) : '\n';
ffc61ed2 2953 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2954 st->ln = isALNUM(st->ln);
ffc61ed2
JH
2955 n = isALNUM(nextchr);
2956 }
2957 else {
5d9a96ca 2958 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
2959 n = isALNUM_LC(nextchr);
2960 }
d6a28714 2961 }
5d9a96ca 2962 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
2963 OP(scan) == BOUNDL))
2964 sayNO;
a0ed51b3 2965 break;
d6a28714 2966 case SPACEL:
3280af22 2967 PL_reg_flags |= RF_tainted;
bbce6d69 2968 /* FALL THROUGH */
d6a28714 2969 case SPACE:
9442cb0e 2970 if (!nextchr)
4633a7c4 2971 sayNO;
1aa99e6b 2972 if (do_utf8) {
fd400ab9 2973 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 2974 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 2975 if (!(OP(scan) == SPACE
bb7a0f54 2976 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2977 : isSPACE_LC_utf8((U8*)locinput)))
2978 {
2979 sayNO;
2980 }
2981 locinput += PL_utf8skip[nextchr];
2982 nextchr = UCHARAT(locinput);
2983 break;
d6a28714 2984 }
ffc61ed2
JH
2985 if (!(OP(scan) == SPACE
2986 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2987 sayNO;
2988 nextchr = UCHARAT(++locinput);
2989 }
2990 else {
2991 if (!(OP(scan) == SPACE
2992 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2993 sayNO;
2994 nextchr = UCHARAT(++locinput);
a0ed51b3 2995 }
a0ed51b3 2996 break;
d6a28714 2997 case NSPACEL:
3280af22 2998 PL_reg_flags |= RF_tainted;
bbce6d69 2999 /* FALL THROUGH */
d6a28714 3000 case NSPACE:
9442cb0e 3001 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3002 sayNO;
1aa99e6b 3003 if (do_utf8) {
1a4fad37 3004 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3005 if (OP(scan) == NSPACE
bb7a0f54 3006 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3007 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3008 {
3009 sayNO;
3010 }
3011 locinput += PL_utf8skip[nextchr];
3012 nextchr = UCHARAT(locinput);
3013 break;
a0ed51b3 3014 }
ffc61ed2 3015 if (OP(scan) == NSPACE
d6a28714 3016 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3017 sayNO;
76e3520e 3018 nextchr = UCHARAT(++locinput);
a0d0e21e 3019 break;
d6a28714 3020 case DIGITL:
a0ed51b3
LW
3021 PL_reg_flags |= RF_tainted;
3022 /* FALL THROUGH */
d6a28714 3023 case DIGIT:
9442cb0e 3024 if (!nextchr)
a0ed51b3 3025 sayNO;
1aa99e6b 3026 if (do_utf8) {
1a4fad37 3027 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3028 if (!(OP(scan) == DIGIT
bb7a0f54 3029 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3030 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3031 {
a0ed51b3 3032 sayNO;
dfe13c55 3033 }
6f06b55f 3034 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3035 nextchr = UCHARAT(locinput);
3036 break;
3037 }
ffc61ed2 3038 if (!(OP(scan) == DIGIT
9442cb0e 3039 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3040 sayNO;
3041 nextchr = UCHARAT(++locinput);
3042 break;
d6a28714 3043 case NDIGITL:
b8c5462f
JH
3044 PL_reg_flags |= RF_tainted;
3045 /* FALL THROUGH */
d6a28714 3046 case NDIGIT:
9442cb0e 3047 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3048 sayNO;
1aa99e6b 3049 if (do_utf8) {
1a4fad37 3050 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3051 if (OP(scan) == NDIGIT
bb7a0f54 3052 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3053 : isDIGIT_LC_utf8((U8*)locinput))
3054 {
a0ed51b3 3055 sayNO;
9442cb0e 3056 }
6f06b55f 3057 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3058 nextchr = UCHARAT(locinput);
3059 break;
3060 }
ffc61ed2 3061 if (OP(scan) == NDIGIT
9442cb0e 3062 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3063 sayNO;
3064 nextchr = UCHARAT(++locinput);
3065 break;
3066 case CLUMP:
b7c83a7e 3067 if (locinput >= PL_regeol)
a0ed51b3 3068 sayNO;
b7c83a7e 3069 if (do_utf8) {
1a4fad37 3070 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3071 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3072 sayNO;
3073 locinput += PL_utf8skip[nextchr];
3074 while (locinput < PL_regeol &&
3075 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3076 locinput += UTF8SKIP(locinput);
3077 if (locinput > PL_regeol)
3078 sayNO;
eb08e2da
JH
3079 }
3080 else
3081 locinput++;
a0ed51b3
LW
3082 nextchr = UCHARAT(locinput);
3083 break;
c8756f30 3084 case REFFL:
3280af22 3085 PL_reg_flags |= RF_tainted;
c8756f30 3086 /* FALL THROUGH */
c277df42 3087 case REF:
95b24440
DM
3088 case REFF: {
3089 char *s;
c277df42 3090 n = ARG(scan); /* which paren pair */
5d9a96ca 3091 st->ln = PL_regstartp[n];
2c2d71f5 3092 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3093 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3094 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3095 if (st->ln == PL_regendp[n])
a0d0e21e 3096 break;
a0ed51b3 3097
5d9a96ca 3098 s = PL_bostr + st->ln;
1aa99e6b 3099 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3100 char *l = locinput;
a3b680e6 3101 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3102 /*
3103 * Note that we can't do the "other character" lookup trick as
3104 * in the 8-bit case (no pun intended) because in Unicode we
3105 * have to map both upper and title case to lower case.
3106 */
3107 if (OP(scan) == REFF) {
3108 while (s < e) {
a3b680e6
AL
3109 STRLEN ulen1, ulen2;
3110 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3111 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3112
a0ed51b3
LW
3113 if (l >= PL_regeol)
3114 sayNO;
a2a2844f
JH
3115 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3116 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3117 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3118 sayNO;
a2a2844f
JH
3119 s += ulen1;
3120 l += ulen2;
a0ed51b3
LW
3121 }
3122 }
3123 locinput = l;
3124 nextchr = UCHARAT(locinput);
3125 break;
3126 }
3127
a0d0e21e 3128 /* Inline the first character, for speed. */
76e3520e 3129 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3130 (OP(scan) == REF ||
3131 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3132 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3133 sayNO;
5d9a96ca
DM
3134 st->ln = PL_regendp[n] - st->ln;
3135 if (locinput + st->ln > PL_regeol)
4633a7c4 3136 sayNO;
5d9a96ca
DM
3137 if (st->ln > 1 && (OP(scan) == REF
3138 ? memNE(s, locinput, st->ln)
c8756f30 3139 : (OP(scan) == REFF
5d9a96ca
DM
3140 ? ibcmp(s, locinput, st->ln)
3141 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3142 sayNO;
5d9a96ca 3143 locinput += st->ln;
76e3520e 3144 nextchr = UCHARAT(locinput);
a0d0e21e 3145 break;
95b24440 3146 }
a0d0e21e
LW
3147
3148 case NOTHING:
c277df42 3149 case TAIL:
a0d0e21e
LW
3150 break;
3151 case BACK:
3152 break;
40a82448
DM
3153
3154#undef ST
3155#define ST st->u.eval
3156
3157 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3158 {
c277df42 3159 SV *ret;
8e5e9ebe 3160 {
4aabdb9b
DM
3161 /* execute the code in the {...} */
3162 dSP;
6136c704 3163 SV ** const before = SP;
4aabdb9b
DM
3164 OP_4tree * const oop = PL_op;
3165 COP * const ocurcop = PL_curcop;
3166 PAD *old_comppad;
4aabdb9b
DM
3167
3168 n = ARG(scan);
32fc9b6a 3169 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3170 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3171 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3172 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3173
8e5e9ebe
RGS
3174 CALLRUNOPS(aTHX); /* Scalar context. */
3175 SPAGAIN;
3176 if (SP == before)
075aa684 3177 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3178 else {
3179 ret = POPs;
3180 PUTBACK;
3181 }
4aabdb9b
DM
3182
3183 PL_op = oop;
3184 PAD_RESTORE_LOCAL(old_comppad);
3185 PL_curcop = ocurcop;
3186 if (!st->logical) {
3187 /* /(?{...})/ */
3188 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3189 break;
3190 }
8e5e9ebe 3191 }
4aabdb9b
DM
3192 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3193 regexp *re;
4aabdb9b 3194 {
4f639d21
DM
3195 /* extract RE object from returned value; compiling if
3196 * necessary */
3197
6136c704 3198 MAGIC *mg = NULL;
be8e71aa 3199 const SV *sv;
faf82a0b
AE
3200 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3201 mg = mg_find(sv, PERL_MAGIC_qr);
3202 else if (SvSMAGICAL(ret)) {
3203 if (SvGMAGICAL(ret))
3204 sv_unmagic(ret, PERL_MAGIC_qr);
3205 else
3206 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3207 }
faf82a0b 3208
0f5d15d6
IZ
3209 if (mg) {
3210 re = (regexp *)mg->mg_obj;
df0003d4 3211 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3212 }
3213 else {
3214 STRLEN len;
6136c704 3215 const char * const t = SvPV_const(ret, len);
0f5d15d6 3216 PMOP pm;
a3b680e6 3217 const I32 osize = PL_regsize;
0f5d15d6 3218
5fcd1c1b 3219 Zero(&pm, 1, PMOP);
4aabdb9b 3220 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3221 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3222 if (!(SvFLAGS(ret)
faf82a0b
AE
3223 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3224 | SVs_GMG)))
14befaf4
DM
3225 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3226 PERL_MAGIC_qr,0,0);
0f5d15d6 3227 PL_regsize = osize;
0f5d15d6 3228 }
4aabdb9b 3229 }
aa283a38
DM
3230
3231 /* run the pattern returned from (??{...}) */
4aabdb9b 3232 DEBUG_EXECUTE_r(
ab3bbdeb
YO
3233 debug_start_match(re, do_utf8, locinput, PL_regeol,
3234 "Matching embedded");
4aabdb9b 3235 );
2c2d71f5 3236
40a82448
DM
3237 ST.cp = regcppush(0); /* Save *all* the positions. */
3238 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3239 *PL_reglastparen = 0;
3240 *PL_reglastcloseparen = 0;
4aabdb9b 3241 PL_reginput = locinput;
4aabdb9b
DM
3242
3243 /* XXXX This is too dramatic a measure... */
3244 PL_reg_maxiter = 0;
3245
5d9a96ca 3246 st->logical = 0;
40a82448 3247 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3248 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3249 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3250 ST.prev_rex = rex;
aa283a38 3251 rex = re;
aa283a38 3252
40a82448 3253 ST.B = next;
aa283a38 3254 /* now continue from first node in postoned RE */
40a82448 3255 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3256 /* NOTREACHED */
a0ed51b3 3257 }
4aabdb9b
DM
3258 /* /(?(?{...})X|Y)/ */
3259 st->sw = SvTRUE(ret);
3260 st->logical = 0;
c277df42
IZ
3261 break;
3262 }
40a82448
DM
3263
3264 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3265 if (ST.toggleutf)
3266 PL_reg_flags ^= RF_utf8;
3267 ReREFCNT_dec(rex);
3268 rex = ST.prev_rex;
3269 /* XXXX This is too dramatic a measure... */
3270 PL_reg_maxiter = 0;
3271 /* Restore parens of the caller without popping the
3272 * savestack */
3273 {
3274 const I32 tmp = PL_savestack_ix;
3275 PL_savestack_ix = ST.lastcp;
3276 regcppop(rex);
3277 PL_savestack_ix = tmp;
3278 }
3279 PL_reginput = locinput;
3280 /* continue at the node following the (??{...}) */
3281 scan = ST.B;
3282 continue;
3283
3284 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3285 /* Restore state to the outer re then re-throw the failure */
3286 if (ST.toggleutf)
3287 PL_reg_flags ^= RF_utf8;
3288 ReREFCNT_dec(rex);
3289 rex = ST.prev_rex;
3290
3291 /* XXXX This is too dramatic a measure... */
3292 PL_reg_maxiter = 0;
3293
3294 PL_reginput = locinput;
3295 REGCP_UNWIND(ST.lastcp);
3296 regcppop(rex);
3297 sayNO_SILENT;
3298
3299#undef ST
3300
a0d0e21e 3301 case OPEN:
c277df42 3302 n = ARG(scan); /* which paren pair */
3280af22
NIS
3303 PL_reg_start_tmp[n] = locinput;
3304 if (n > PL_regsize)
3305 PL_regsize = n;
a0d0e21e
LW
3306 break;
3307 case CLOSE:
c277df42 3308 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3309 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3310 PL_regendp[n] = locinput - PL_bostr;
eb160463 3311 if (n > (I32)*PL_reglastparen)
3280af22 3312 *PL_reglastparen = n;
a01268b5 3313 *PL_reglastcloseparen = n;
a0d0e21e 3314 break;
c277df42
IZ
3315 case GROUPP:
3316 n = ARG(scan); /* which paren pair */
5d9a96ca 3317 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3318 break;
3319 case IFTHEN:
2c2d71f5 3320 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3321 if (st->sw)
c277df42
IZ
3322 next = NEXTOPER(NEXTOPER(scan));
3323 else {
3324 next = scan + ARG(scan);
3325 if (OP(next) == IFTHEN) /* Fake one. */
3326 next = NEXTOPER(NEXTOPER(next));
3327 }
3328 break;
3329 case LOGICAL:
5d9a96ca 3330 st->logical = scan->flags;
c277df42 3331 break;
2ab05381 3332/*******************************************************************
a0374537
DM
3333 cc points to the regmatch_state associated with the most recent CURLYX.
3334 This struct contains info about the innermost (...)* loop (an
3335 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3336
3337 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3338
95b24440 3339 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3340
a0374537 3341 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3342 with the starting point at WHILEM node;
2ab05381
IZ
3343
3344 3) Each hit of WHILEM node tries to match A and Z (in the order
3345 depending on the current iteration, min/max of {min,max} and
3346 greediness). The information about where are nodes for "A"
a0374537 3347 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3348 was already matched, and greediness.
3349
3350 4) After A matches, the same WHILEM node is hit again.
3351
95b24440 3352 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3353 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3354 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3355 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3356 of the external loop.
3357
a0374537 3358 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3359 and whatever it mentions via ->next, and additional attached trees
3360 corresponding to temporarily unset infoblocks as in "5" above.
3361
95b24440 3362 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3363 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3364 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3365 infoblocks are drawn below the "reset" infoblock.
3366
3367 In fact in the picture below we do not show failed matches for Z and T
3368 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3369 more obvious *why* one needs to *temporary* unset infoblocks.]
3370
3371 Matched REx position InfoBlocks Comment
3372 (Y(A)*?Z)*?T x
3373 Y(A)*?Z)*?T x <- O
3374 Y (A)*?Z)*?T x <- O
3375 Y A)*?Z)*?T x <- O <- I
3376 YA )*?Z)*?T x <- O <- I
3377 YA A)*?Z)*?T x <- O <- I
3378 YAA )*?Z)*?T x <- O <- I
3379 YAA Z)*?T x <- O # Temporary unset I
3380 I
3381
3382 YAAZ Y(A)*?Z)*?T x <- O
3383 I
3384
3385 YAAZY (A)*?Z)*?T x <- O
3386 I
3387
3388 YAAZY A)*?Z)*?T x <- O <- I
3389 I
3390
3391 YAAZYA )*?Z)*?T x <- O <- I
3392 I
3393
3394 YAAZYA Z)*?T x <- O # Temporary unset I
3395 I,I
3396
3397 YAAZYAZ )*?T x <- O
3398 I,I
3399
3400 YAAZYAZ T x # Temporary unset O
3401 O
3402 I,I
3403
3404 YAAZYAZT x
3405 O
3406 I,I
3407 *******************************************************************/
95b24440 3408
a0d0e21e 3409 case CURLYX: {
cb434fcc 3410 /* No need to save/restore up to this paren */
10edeb5d
JH
3411 parenfloor = scan->flags;
3412
c2b7afd3
NC
3413 /* Dave says:
3414
3415 CURLYX and WHILEM are always paired: they're the moral
3416 equivalent of pp_enteriter anbd pp_iter.
3417
3418 The only time next could be null is if the node tree is
3419 corrupt. This was mentioned on p5p a few days ago.
3420
3421 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3422 So we'll assert that this is true:
3423 */
3424 assert(next);
30b2893d 3425 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3426 next += ARG(next);
cb434fcc
IZ
3427 /* XXXX Probably it is better to teach regpush to support
3428 parenfloor > PL_regsize... */
eb160463 3429 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3430 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3431
d8319b27
DM
3432 st->u.curlyx.cp = PL_savestack_ix;
3433 st->u.curlyx.outercc = st->cc;
a0374537
DM
3434 st->cc = st;
3435 /* these fields contain the state of the current curly.
3436 * they are accessed by subsequent WHILEMs;
3437 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3438 st->u.curlyx.parenfloor = parenfloor;
3439 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3440 st->u.curlyx.min = ARG1(scan);
3441 st->u.curlyx.max = ARG2(scan);
3442 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3443 st->u.curlyx.lastloc = 0;
a0374537
DM
3444 /* st->next and st->minmod are also read by WHILEM */
3445
3280af22 3446 PL_reginput = locinput;
95b24440
DM
3447 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3448 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3449 regcpblow(st->u.curlyx.cp);
3450 st->cc = st->u.curlyx.outercc;
95b24440 3451 saySAME(result);
a0d0e21e 3452 }
5f66b61c 3453 /* NOTREACHED */
a0d0e21e
LW
3454 case WHILEM: {
3455 /*
3456 * This is really hard to understand, because after we match
3457 * what we're trying to match, we must make sure the rest of
2c2d71f5 3458 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3459 * to go back UP the parse tree by recursing ever deeper. And
3460 * if it fails, we have to reset our parent's current state
3461 * that we can try again after backing off.
3462 */
3463
c2b7afd3
NC
3464 /* Dave says:
3465
3466 st->cc gets initialised by CURLYX ready for use by WHILEM.
3467 So again, unless somethings been corrupted, st->cc cannot
3468 be null at that point in WHILEM.
3469
3470 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3471 So we'll assert that this is true:
3472 */
3473 assert(st->cc);
d8319b27
DM
3474 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3475 st->u.whilem.cache_offset = 0;
3476 st->u.whilem.cache_bit = 0;
c277df42 3477
d8319b27 3478 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3479 PL_reginput = locinput;
a0d0e21e 3480
a3621e74 3481 DEBUG_EXECUTE_r(
9041c2e3 3482 PerlIO_printf(Perl_debug_log,
91f3b821 3483 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3484 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3485 (long)n, (long)st->cc->u.curlyx.min,
3486 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3487 );
4633a7c4 3488
a0d0e21e
LW
3489 /* If degenerate scan matches "", assume scan done. */
3490
d8319b27
DM
3491 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3492 st->u.whilem.savecc = st->cc;
3493 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3494 if (st->cc)
d8319b27 3495 st->ln = st->cc->u.curlyx.cur;
a3621e74 3496 DEBUG_EXECUTE_r(
c3464db5
DD
3497 PerlIO_printf(Perl_debug_log,
3498 "%*s empty match detected, try continuation...\n",
3280af22 3499 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3500 );
d8319b27 3501 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3502 /*** all unsaved local vars undefined at this point */
d8319b27 3503 st->cc = st->u.whilem.savecc;
95b24440 3504 if (result)
4633a7c4 3505 sayYES;
d8319b27
DM
3506 if (st->cc->u.curlyx.outercc)
3507 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3508 sayNO;
a0d0e21e
LW
3509 }
3510
3511 /* First just match a string of min scans. */
3512
d8319b27
DM
3513 if (n < st->cc->u.curlyx.min) {
3514 st->cc->u.curlyx.cur = n;
3515 st->cc->u.curlyx.lastloc = locinput;
3516 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3517 /*** all unsaved local vars undefined at this point */
3518 if (result)
4633a7c4 3519 sayYES;
d8319b27
DM
3520 st->cc->u.curlyx.cur = n - 1;
3521 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3522 sayNO;
a0d0e21e
LW
3523 }
3524
2c2d71f5
JH
3525 if (scan->flags) {
3526 /* Check whether we already were at this position.
3527 Postpone detection until we know the match is not
3528 *that* much linear. */
3529 if (!PL_reg_maxiter) {
3530 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3531 /* possible overflow for long strings and many CURLYX's */
3532 if (PL_reg_maxiter < 0)
3533 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3534 PL_reg_leftiter = PL_reg_maxiter;
3535 }
3536 if (PL_reg_leftiter-- == 0) {
3298f257 3537 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3538 if (PL_reg_poscache) {
eb160463 3539 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3540 Renew(PL_reg_poscache, size, char);
3541 PL_reg_poscache_size = size;
3542 }
3543 Zero(PL_reg_poscache, size, char);
3544 }
3545 else {
3546 PL_reg_poscache_size = size;
a02a5408 3547 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3548 }
a3621e74 3549 DEBUG_EXECUTE_r(
2c2d71f5
JH
3550 PerlIO_printf(Perl_debug_log,
3551 "%sDetected a super-linear match, switching on caching%s...\n",
3552 PL_colors[4], PL_colors[5])
3553 );
3554 }
3555 if (PL_reg_leftiter < 0) {
d8319b27 3556 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3557
3298f257 3558 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
d8319b27
DM
3559 + st->u.whilem.cache_offset * (scan->flags>>4);
3560 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3561 st->u.whilem.cache_offset /= 8;
3562 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3563 DEBUG_EXECUTE_r(
2c2d71f5
JH
3564 PerlIO_printf(Perl_debug_log,
3565 "%*s already tried at this position...\n",
3566 REPORT_CODE_OFF+PL_regindent*2, "")
3567 );
3298f257 3568 sayNO; /* cache records failure */
2c2d71f5 3569 }
2c2d71f5
JH
3570 }
3571 }
3572
a0d0e21e
LW
3573 /* Prefer next over scan for minimal matching. */
3574
5d9a96ca 3575 if (st->cc->minmod) {
d8319b27
DM
3576 st->u.whilem.savecc = st->cc;
3577 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3578 if (st->cc)
d8319b27
DM
3579 st->ln = st->cc->u.curlyx.cur;
3580 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3581 REGCP_SET(st->u.whilem.lastcp);
3582 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3583 /*** all unsaved local vars undefined at this point */
d8319b27 3584 st->cc = st->u.whilem.savecc;
95b24440 3585 if (result) {
d8319b27 3586 regcpblow(st->u.whilem.cp);
3298f257 3587 sayYES; /* All done. */
5f05dabc 3588 }
d8319b27 3589 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3590 regcppop(rex);
d8319b27
DM
3591 if (st->cc->u.curlyx.outercc)
3592 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3593
d8319b27 3594 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3595 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3596 && !(PL_reg_flags & RF_warned)) {
3597 PL_reg_flags |= RF_warned;
9014280d 3598 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3599 "Complex regular subexpression recursion",
3600 REG_INFTY - 1);
c277df42 3601 }
3ab3c9b4 3602 CACHEsayNO;
c277df42 3603 }
a687059c 3604
a3621e74 3605 DEBUG_EXECUTE_r(
c3464db5
DD
3606 PerlIO_printf(Perl_debug_log,
3607 "%*s trying longer...\n",
3280af22 3608 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3609 );
a0d0e21e 3610 /* Try scanning more and see if it helps. */
3280af22 3611 PL_reginput = locinput;
d8319b27
DM
3612 st->cc->u.curlyx.cur = n;
3613 st->cc->u.curlyx.lastloc = locinput;
3614 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3615 REGCP_SET(st->u.whilem.lastcp);
3616 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3617 /*** all unsaved local vars undefined at this point */
3618 if (result) {
d8319b27 3619 regcpblow(st->u.whilem.cp);
3298f257 3620 sayYES;
5f05dabc 3621 }
d8319b27 3622 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3623 regcppop(rex);
d8319b27
DM
3624 st->cc->u.curlyx.cur = n - 1;
3625 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3626 CACHEsayNO;
a0d0e21e
LW
3627 }
3628
3629 /* Prefer scan over next for maximal matching. */
3630
d8319b27
DM
3631 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3632 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3633 st->cc->u.curlyx.cur = n;
3634 st->cc->u.curlyx.lastloc = locinput;
3635 REGCP_SET(st->u.whilem.lastcp);
3636 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3637 /*** all unsaved local vars undefined at this point */
3638 if (result) {
d8319b27 3639 regcpblow(st->u.whilem.cp);
3298f257 3640 sayYES;
5f05dabc 3641 }
d8319b27 3642 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3643 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3644 PL_reginput = locinput;
a3621e74 3645 DEBUG_EXECUTE_r(
c3464db5
DD
3646 PerlIO_printf(Perl_debug_log,
3647 "%*s failed, try continuation...\n",
3280af22 3648 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3649 );
3650 }
9041c2e3 3651 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3652 && !(PL_reg_flags & RF_warned)) {
3280af22 3653 PL_reg_flags |= RF_warned;
9014280d 3654 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3655 "Complex regular subexpression recursion",
3656 REG_INFTY - 1);
a0d0e21e
LW
3657 }
3658
3659 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
3660 st->u.whilem.savecc = st->cc;
3661 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3662 if (st->cc)
d8319b27
DM
3663 st->ln = st->cc->u.curlyx.cur;
3664 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3665 /*** all unsaved local vars undefined at this point */
d8319b27 3666 st->cc = st->u.whilem.savecc;
95b24440 3667 if (result)
3298f257 3668 sayYES;
d8319b27
DM
3669 if (st->cc->u.curlyx.outercc)
3670 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3671 st->cc->u.curlyx.cur = n - 1;
3672 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3673 CACHEsayNO;
a0d0e21e 3674 }
5f66b61c 3675 /* NOTREACHED */
40a82448
DM
3676
3677#undef ST
3678#define ST st->u.branch
3679
3680 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3681 next = scan + ARG(scan);
3682 if (next == scan)
3683 next = NULL;
40a82448
DM
3684 scan = NEXTOPER(scan);
3685 /* FALL THROUGH */
c277df42 3686
40a82448
DM
3687 case BRANCH: /* /(...|A|...)/ */
3688 scan = NEXTOPER(scan); /* scan now points to inner node */
3689 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3690 /* last branch; skip state push and jump direct to node */
3691 continue;
3692 ST.lastparen = *PL_reglastparen;
3693 ST.next_branch = next;
3694 REGCP_SET(ST.cp);
3695 PL_reginput = locinput;
02db2b7b 3696
40a82448
DM
3697 /* Now go into the branch */
3698 PUSH_STATE_GOTO(BRANCH_next, scan);
3699 /* NOTREACHED */
3700
3701 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3702 REGCP_UNWIND(ST.cp);
3703 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3704 PL_regendp[n] = -1;
3705 *PL_reglastparen = n;
3706 scan = ST.next_branch;
3707 /* no more branches? */
3708 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3709 sayNO;
3710 continue; /* execute next BRANCH[J] op */
3711 /* NOTREACHED */
3712
a0d0e21e 3713 case MINMOD:
5d9a96ca 3714 st->minmod = 1;
a0d0e21e 3715 break;
40a82448
DM
3716
3717#undef ST
3718#define ST st->u.curlym
3719
3720 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3721
3722 /* This is an optimisation of CURLYX that enables us to push
3723 * only a single backtracking state, no matter now many matches
3724 * there are in {m,n}. It relies on the pattern being constant
3725 * length, with no parens to influence future backrefs
3726 */
3727
3728 ST.me = scan;
dc45a647 3729 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
3730
3731 /* if paren positive, emulate an OPEN/CLOSE around A */
3732 if (ST.me->flags) {
3733 I32 paren = ST.me->flags;
3734 if (paren > PL_regsize)
3735 PL_regsize = paren;
3736 if (paren > (I32)*PL_reglastparen)
3737 *PL_reglastparen = paren;
c277df42 3738 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 3739 }
40a82448
DM
3740 ST.A = scan;
3741 ST.B = next;
3742 ST.alen = 0;
3743 ST.count = 0;
3744 ST.minmod = st->minmod;
3745 st->minmod = 0;
3746 ST.c1 = CHRTEST_UNINIT;
3747 REGCP_SET(ST.cp);
6407bf3b 3748
40a82448
DM
3749 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3750 goto curlym_do_B;
3751
3752 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 3753 PL_reginput = locinput;
40a82448
DM
3754 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3755 /* NOTREACHED */
5f80c4cf 3756
40a82448
DM
3757 case CURLYM_A: /* we've just matched an A */
3758 locinput = st->locinput;
3759 nextchr = UCHARAT(locinput);
3760
3761 ST.count++;
3762 /* after first match, determine A's length: u.curlym.alen */
3763 if (ST.count == 1) {
3764 if (PL_reg_match_utf8) {
3765 char *s = locinput;
3766 while (s < PL_reginput) {
3767 ST.alen++;
3768 s += UTF8SKIP(s);
3769 }
3770 }
3771 else {
3772 ST.alen = PL_reginput - locinput;
3773 }
3774 if (ST.alen == 0)
3775 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3776 }
0cadcf80
DM
3777 DEBUG_EXECUTE_r(
3778 PerlIO_printf(Perl_debug_log,
40a82448 3779 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
ab3bbdeb 3780 (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
40a82448 3781 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
3782 );
3783
40a82448
DM
3784 locinput = PL_reginput;
3785 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3786 goto curlym_do_A; /* try to match another A */
3787 goto curlym_do_B; /* try to match B */
5f80c4cf 3788
40a82448
DM
3789 case CURLYM_A_fail: /* just failed to match an A */
3790 REGCP_UNWIND(ST.cp);
3791 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3792 sayNO;
0cadcf80 3793
40a82448
DM
3794 curlym_do_B: /* execute the B in /A{m,n}B/ */
3795 PL_reginput = locinput;
3796 if (ST.c1 == CHRTEST_UNINIT) {
3797 /* calculate c1 and c2 for possible match of 1st char
3798 * following curly */
3799 ST.c1 = ST.c2 = CHRTEST_VOID;
3800 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3801 regnode *text_node = ST.B;
3802 if (! HAS_TEXT(text_node))
3803 FIND_NEXT_IMPT(text_node);
3804 if (HAS_TEXT(text_node)
3805 && PL_regkind[OP(text_node)] != REF)
3806 {
3807 ST.c1 = (U8)*STRING(text_node);
3808 ST.c2 =
3809 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3810 ? PL_fold[ST.c1]
3811 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3812 ? PL_fold_locale[ST.c1]
3813 : ST.c1;
c277df42 3814 }
c277df42 3815 }
40a82448
DM
3816 }
3817
3818 DEBUG_EXECUTE_r(
3819 PerlIO_printf(Perl_debug_log,
3820 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
ab3bbdeb 3821 (int)(REPORT_CODE_OFF+(PL_regindent*2)),
40a82448
DM
3822 "", (IV)ST.count)
3823 );
3824 if (ST.c1 != CHRTEST_VOID
3825 && UCHARAT(PL_reginput) != ST.c1
3826 && UCHARAT(PL_reginput) != ST.c2)
3827 {
3828 /* simulate B failing */
3829 state_num = CURLYM_B_fail;
3830 goto reenter_switch;
3831 }
3832
3833 if (ST.me->flags) {
3834 /* mark current A as captured */
3835 I32 paren = ST.me->flags;
3836 if (ST.count) {
3837 PL_regstartp[paren]
3838 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3839 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3840 }
40a82448
DM
3841 else
3842 PL_regendp[paren] = -1;
c277df42 3843 }
40a82448 3844 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 3845 /* NOTREACHED */
40a82448
DM
3846
3847 case CURLYM_B_fail: /* just failed to match a B */
3848 REGCP_UNWIND(ST.cp);
3849 if (ST.minmod) {
3850 if (ST.count == ARG2(ST.me) /* max */)
3851 sayNO;
3852 goto curlym_do_A; /* try to match a further A */
3853 }
3854 /* backtrack one A */
3855 if (ST.count == ARG1(ST.me) /* min */)
3856 sayNO;
3857 ST.count--;
3858 locinput = HOPc(locinput, -ST.alen);
3859 goto curlym_do_B; /* try to match B */
3860
c255a977
DM
3861#undef ST
3862#define ST st->u.curly
40a82448 3863
c255a977
DM
3864#define CURLY_SETPAREN(paren, success) \
3865 if (paren) { \
3866 if (success) { \
3867 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3868 PL_regendp[paren] = locinput - PL_bostr; \
3869 } \
3870 else \
3871 PL_regendp[paren] = -1; \
3872 }
3873
3874 case STAR: /* /A*B/ where A is width 1 */
3875 ST.paren = 0;
3876 ST.min = 0;
3877 ST.max = REG_INFTY;
a0d0e21e
LW
3878 scan = NEXTOPER(scan);
3879 goto repeat;
c255a977
DM
3880 case PLUS: /* /A+B/ where A is width 1 */
3881 ST.paren = 0;
3882 ST.min = 1;
3883 ST.max = REG_INFTY;
c277df42 3884 scan = NEXTOPER(scan);
c255a977
DM
3885 goto repeat;
3886 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3887 ST.paren = scan->flags; /* Which paren to set */
3888 if (ST.paren > PL_regsize)
3889 PL_regsize = ST.paren;
3890 if (ST.paren > (I32)*PL_reglastparen)
3891 *PL_reglastparen = ST.paren;
3892 ST.min = ARG1(scan); /* min to match */
3893 ST.max = ARG2(scan); /* max to match */
3894 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3895 goto repeat;
3896 case CURLY: /* /A{m,n}B/ where A is width 1 */
3897 ST.paren = 0;
3898 ST.min = ARG1(scan); /* min to match */
3899 ST.max = ARG2(scan); /* max to match */
3900 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 3901 repeat:
a0d0e21e
LW
3902 /*
3903 * Lookahead to avoid useless match attempts
3904 * when we know what character comes next.
c255a977 3905 *
5f80c4cf
JP
3906 * Used to only do .*x and .*?x, but now it allows
3907 * for )'s, ('s and (?{ ... })'s to be in the way
3908 * of the quantifier and the EXACT-like node. -- japhy
3909 */
3910
c255a977
DM
3911 if (ST.min > ST.max) /* XXX make this a compile-time check? */
3912 sayNO;
cca55fe3 3913 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3914 U8 *s;
3915 regnode *text_node = next;
3916
3dab1dad
YO
3917 if (! HAS_TEXT(text_node))
3918 FIND_NEXT_IMPT(text_node);
5f80c4cf 3919
9e137952 3920 if (! HAS_TEXT(text_node))
c255a977 3921 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 3922 else {
3dab1dad 3923 if (PL_regkind[OP(text_node)] == REF) {
c255a977 3924 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 3925 goto assume_ok_easy;
cca55fe3 3926 }
be8e71aa
YO
3927 else
3928 s = (U8*)STRING(text_node);
5f80c4cf
JP
3929
3930 if (!UTF) {
c255a977 3931 ST.c2 = ST.c1 = *s;
f65d3ee7 3932 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 3933 ST.c2 = PL_fold[ST.c1];
f65d3ee7 3934 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 3935 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 3936 }
5f80c4cf 3937 else { /* UTF */
f65d3ee7 3938 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3939 STRLEN ulen1, ulen2;
89ebb4a3
JH
3940 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3941 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3942
3943 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3944 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
3945#ifdef EBCDIC
3946 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
3947 ckWARN(WARN_UTF8) ?
3948 0 : UTF8_ALLOW_ANY);
3949 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
3950 ckWARN(WARN_UTF8) ?
3951 0 : UTF8_ALLOW_ANY);
3952#else
c255a977 3953 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 3954 uniflags);
c255a977 3955 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
3956 uniflags);
3957#endif
5f80c4cf
JP
3958 }
3959 else {
c255a977 3960 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 3961 uniflags);
5f80c4cf 3962 }
1aa99e6b
IH
3963 }
3964 }
bbce6d69 3965 }
a0d0e21e 3966 else
c255a977 3967 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 3968 assume_ok_easy:
c255a977
DM
3969
3970 ST.A = scan;
3971 ST.B = next;
3280af22 3972 PL_reginput = locinput;
5d9a96ca
DM
3973 if (st->minmod) {
3974 st->minmod = 0;
c255a977 3975 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 3976 sayNO;
c255a977 3977 ST.count = ST.min;
a0ed51b3 3978 locinput = PL_reginput;
c255a977
DM
3979 REGCP_SET(ST.cp);
3980 if (ST.c1 == CHRTEST_VOID)
3981 goto curly_try_B_min;
3982
3983 ST.oldloc = locinput;
3984
3985 /* set ST.maxpos to the furthest point along the
3986 * string that could possibly match */
3987 if (ST.max == REG_INFTY) {
3988 ST.maxpos = PL_regeol - 1;
3989 if (do_utf8)
3990 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
3991 ST.maxpos--;
3992 }
3993 else if (do_utf8) {
3994 int m = ST.max - ST.min;
3995 for (ST.maxpos = locinput;
3996 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
3997 ST.maxpos += UTF8SKIP(ST.maxpos);
3998 }
3999 else {
4000 ST.maxpos = locinput + ST.max - ST.min;
4001 if (ST.maxpos >= PL_regeol)
4002 ST.maxpos = PL_regeol - 1;
4003 }
4004 goto curly_try_B_min_known;
4005
4006 }
4007 else {
4008 ST.count = regrepeat(rex, ST.A, ST.max);
4009 locinput = PL_reginput;
4010 if (ST.count < ST.min)
4011 sayNO;
4012 if ((ST.count > ST.min)
4013 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4014 {
4015 /* A{m,n} must come at the end of the string, there's
4016 * no point in backing off ... */
4017 ST.min = ST.count;
4018 /* ...except that $ and \Z can match before *and* after
4019 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4020 We may back off by one in this case. */
4021 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4022 ST.min--;
4023 }
4024 REGCP_SET(ST.cp);
4025 goto curly_try_B_max;
4026 }
4027 /* NOTREACHED */
4028
4029
4030 case CURLY_B_min_known_fail:
4031 /* failed to find B in a non-greedy match where c1,c2 valid */
4032 if (ST.paren && ST.count)
4033 PL_regendp[ST.paren] = -1;
4034
4035 PL_reginput = locinput; /* Could be reset... */
4036 REGCP_UNWIND(ST.cp);
4037 /* Couldn't or didn't -- move forward. */
4038 ST.oldloc = locinput;
4039 if (do_utf8)
4040 locinput += UTF8SKIP(locinput);
4041 else
4042 locinput++;
4043 ST.count++;
4044 curly_try_B_min_known:
4045 /* find the next place where 'B' could work, then call B */
4046 {
4047 int n;
4048 if (do_utf8) {
4049 n = (ST.oldloc == locinput) ? 0 : 1;
4050 if (ST.c1 == ST.c2) {
4051 STRLEN len;
4052 /* set n to utf8_distance(oldloc, locinput) */
4053 while (locinput <= ST.maxpos &&
4054 utf8n_to_uvchr((U8*)locinput,
4055 UTF8_MAXBYTES, &len,
4056 uniflags) != (UV)ST.c1) {
4057 locinput += len;
4058 n++;
4059 }
1aa99e6b
IH
4060 }
4061 else {
c255a977
DM
4062 /* set n to utf8_distance(oldloc, locinput) */
4063 while (locinput <= ST.maxpos) {
4064 STRLEN len;
4065 const UV c = utf8n_to_uvchr((U8*)locinput,
4066 UTF8_MAXBYTES, &len,
4067 uniflags);
4068 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4069 break;
4070 locinput += len;
4071 n++;
1aa99e6b 4072 }
0fe9bf95
IZ
4073 }
4074 }
c255a977
DM
4075 else {
4076 if (ST.c1 == ST.c2) {
4077 while (locinput <= ST.maxpos &&
4078 UCHARAT(locinput) != ST.c1)
4079 locinput++;
bbce6d69 4080 }
c255a977
DM
4081 else {
4082 while (locinput <= ST.maxpos
4083 && UCHARAT(locinput) != ST.c1
4084 && UCHARAT(locinput) != ST.c2)
4085 locinput++;
a0ed51b3 4086 }
c255a977
DM
4087 n = locinput - ST.oldloc;
4088 }
4089 if (locinput > ST.maxpos)
4090 sayNO;
4091 /* PL_reginput == oldloc now */
4092 if (n) {
4093 ST.count += n;
4094 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4095 sayNO;
a0d0e21e 4096 }
c255a977
DM
4097 PL_reginput = locinput;
4098 CURLY_SETPAREN(ST.paren, ST.count);
4099 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4100 }
c255a977
DM
4101 /* NOTREACHED */
4102
4103
4104 case CURLY_B_min_fail:
4105 /* failed to find B in a non-greedy match where c1,c2 invalid */
4106 if (ST.paren && ST.count)
4107 PL_regendp[ST.paren] = -1;
4108
4109 REGCP_UNWIND(ST.cp);
4110 /* failed -- move forward one */
4111 PL_reginput = locinput;
4112 if (regrepeat(rex, ST.A, 1)) {
4113 ST.count++;
a0ed51b3 4114 locinput = PL_reginput;
c255a977
DM
4115 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4116 ST.count > 0)) /* count overflow ? */
15272685 4117 {
c255a977
DM
4118 curly_try_B_min:
4119 CURLY_SETPAREN(ST.paren, ST.count);
4120 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4121 }
4122 }
4633a7c4 4123 sayNO;
c255a977
DM
4124 /* NOTREACHED */
4125
4126
4127 curly_try_B_max:
4128 /* a successful greedy match: now try to match B */
4129 {
4130 UV c = 0;
4131 if (ST.c1 != CHRTEST_VOID)
4132 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4133 UTF8_MAXBYTES, 0, uniflags)
466787eb 4134 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4135 /* If it could work, try it. */
4136 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4137 CURLY_SETPAREN(ST.paren, ST.count);
4138 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4139 /* NOTREACHED */
4140 }
4141 }
4142 /* FALL THROUGH */
4143 case CURLY_B_max_fail:
4144 /* failed to find B in a greedy match */
4145 if (ST.paren && ST.count)
4146 PL_regendp[ST.paren] = -1;
4147
4148 REGCP_UNWIND(ST.cp);
4149 /* back up. */
4150 if (--ST.count < ST.min)
4151 sayNO;
4152 PL_reginput = locinput = HOPc(locinput, -1);
4153 goto curly_try_B_max;
4154
4155#undef ST
4156
4157
a0d0e21e 4158 case END:
3b0527fe 4159 if (locinput < reginfo->till) {
a3621e74 4160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4161 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4162 PL_colors[4],
4163 (long)(locinput - PL_reg_starttry),
3b0527fe 4164 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4165 PL_colors[5]));
4166 sayNO_FINAL; /* Cannot match: too short. */
4167 }
4168 PL_reginput = locinput; /* put where regtry can find it */
4169 sayYES_FINAL; /* Success! */
dad79028
DM
4170
4171 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4172 DEBUG_EXECUTE_r(
4173 PerlIO_printf(Perl_debug_log,
4174 "%*s %ssubpattern success...%s\n",
4175 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4176 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4177 sayYES_FINAL; /* Success! */
4178
40a82448
DM
4179#undef ST
4180#define ST st->u.ifmatch
4181
4182 case SUSPEND: /* (?>A) */
4183 ST.wanted = 1;
9fe1d20c 4184 PL_reginput = locinput;
9041c2e3 4185 goto do_ifmatch;
dad79028 4186
40a82448
DM
4187 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4188 ST.wanted = 0;
dad79028
DM
4189 goto ifmatch_trivial_fail_test;
4190
40a82448
DM
4191 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4192 ST.wanted = 1;
dad79028 4193 ifmatch_trivial_fail_test:
a0ed51b3 4194 if (scan->flags) {
52657f30 4195 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4196 if (!s) {
4197 /* trivial fail */
4198 if (st->logical) {
4199 st->logical = 0;
40a82448 4200 st->sw = 1 - (bool)ST.wanted;
dad79028 4201 }
40a82448 4202 else if (ST.wanted)
dad79028
DM
4203 sayNO;
4204 next = scan + ARG(scan);
4205 if (next == scan)
4206 next = NULL;
4207 break;
4208 }
efb30f32 4209 PL_reginput = s;
a0ed51b3
LW
4210 }
4211 else
4212 PL_reginput = locinput;
4213
c277df42 4214 do_ifmatch:
40a82448
DM
4215 ST.me = scan;
4216 /* execute body of (?...A) */
4217 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4218 /* NOTREACHED */
4219
4220 case IFMATCH_A_fail: /* body of (?...A) failed */
4221 ST.wanted = !ST.wanted;
4222 /* FALL THROUGH */
4223
4224 case IFMATCH_A: /* body of (?...A) succeeded */
4225 if (st->logical) {
4226 st->logical = 0;
4227 st->sw = (bool)ST.wanted;
4228 }
4229 else if (!ST.wanted)
4230 sayNO;
4231
4232 if (OP(ST.me) == SUSPEND)
4233 locinput = PL_reginput;
4234 else {
4235 locinput = PL_reginput = st->locinput;
4236 nextchr = UCHARAT(locinput);
4237 }
4238 scan = ST.me + ARG(ST.me);
4239 if (scan == ST.me)
4240 scan = NULL;
4241 continue; /* execute B */
4242
4243#undef ST
dad79028 4244
c277df42 4245 case LONGJMP:
c277df42
IZ
4246 next = scan + ARG(scan);
4247 if (next == scan)
4248 next = NULL;
a0d0e21e
LW
4249 break;
4250 default:
b900a521 4251 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4252 PTR2UV(scan), OP(scan));
cea2e8a9 4253 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4254 }
95b24440 4255
a0d0e21e 4256 scan = next;
95b24440
DM
4257 continue;
4258 /* NOTREACHED */
4259
40a82448
DM
4260 push_yes_state:
4261 /* push a state that backtracks on success */
4262 st->u.yes.prev_yes_state = yes_state;
4263 yes_state = st;
4264 /* FALL THROUGH */
4265 push_state:
4266 /* push a new regex state, then continue at scan */
4267 {
4268 regmatch_state *newst;
4269
4270 depth++;
ab3bbdeb 4271 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log,
40a82448
DM
4272 "PUSH STATE(%d)\n", depth));
4273 st->locinput = locinput;
4274 newst = st+1;
4275 if (newst > SLAB_LAST(PL_regmatch_slab))
4276 newst = S_push_slab(aTHX);
4277 PL_regmatch_state = newst;
4278 newst->cc = st->cc;
4279 /* XXX probably don't need to initialise these */
4280 newst->minmod = 0;
4281 newst->sw = 0;
4282 newst->logical = 0;
4283
4284 locinput = PL_reginput;
4285 nextchr = UCHARAT(locinput);
4286 st = newst;
4287 continue;
4288 /* NOTREACHED */
4289 }
4290
95b24440
DM
4291 /* simulate recursively calling regmatch(), but without actually
4292 * recursing - ie save the current state on the heap rather than on
4293 * the stack, then re-enter the loop. This avoids complex regexes
4294 * blowing the processor stack */
4295
4296 start_recurse:
4297 {
5d9a96ca
DM
4298 /* push new state */
4299 regmatch_state *oldst = st;
4300
4301 depth++;
40a82448 4302 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4303
4304 /* grab the next free state slot */
4305 st++;
86545054 4306 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4307 st = S_push_slab(aTHX);
4308 PL_regmatch_state = st;
4309
4310 oldst->next = next;
4311 oldst->n = n;
4312 oldst->locinput = locinput;
5d9a96ca
DM
4313
4314 st->cc = oldst->cc;
95b24440
DM
4315 locinput = PL_reginput;
4316 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4317 st->minmod = 0;
4318 st->sw = 0;
4319 st->logical = 0;
95b24440
DM
4320#ifdef DEBUGGING
4321 PL_regindent++;
4322#endif
4323 }
a0d0e21e 4324 }
a687059c 4325
aa283a38
DM
4326
4327
a0d0e21e
LW
4328 /*
4329 * We get here only if there's trouble -- normally "case END" is
4330 * the terminating point.
4331 */
cea2e8a9 4332 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4333 /*NOTREACHED*/
4633a7c4
LW
4334 sayNO;
4335
7821416a 4336yes_final:
77cb431f
DM
4337
4338 if (yes_state) {
4339 /* we have successfully completed a subexpression, but we must now
4340 * pop to the state marked by yes_state and continue from there */
4341
4342 assert(st != yes_state);
4343 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4344 || yes_state > SLAB_LAST(PL_regmatch_slab))
4345 {
4346 /* not in this slab, pop slab */
4347 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4348 PL_regmatch_slab = PL_regmatch_slab->prev;
4349 st = SLAB_LAST(PL_regmatch_slab);
4350 }
4351 depth -= (st - yes_state);
e4f74956
YO
4352 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%"UVuf"..%"UVuf")\n",
4353 (UV)(depth+1), (UV)(depth+(st - yes_state))));
77cb431f
DM
4354 st = yes_state;
4355 yes_state = st->u.yes.prev_yes_state;
4356 PL_regmatch_state = st;
4357
4358 switch (st->resume_state) {
40a82448
DM
4359 case IFMATCH_A:
4360 case CURLYM_A:
40a82448
DM
4361 case EVAL_A:
4362 state_num = st->resume_state;
4363 goto reenter_switch;
4364
4365 case CURLYM_B:
166ba7cd
DM
4366 case BRANCH_next:
4367 case TRIE_next:
c255a977 4368 case CURLY_B_max:
77cb431f 4369 default:
40a82448 4370 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4371 }
4372 }
4373
a3621e74 4374 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4375 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4376yes:
4377#ifdef DEBUGGING
3280af22 4378 PL_regindent--;
4633a7c4 4379#endif
02db2b7b 4380
95b24440 4381 result = 1;
aa283a38 4382 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4383 * will disappear when REGFMATCH goes */
aa283a38
DM
4384 if (depth) {
4385 /* restore previous state and re-enter */
ab3bbdeb 4386 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
40a82448
DM
4387 depth--;
4388 st--;
4389 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4390 PL_regmatch_slab = PL_regmatch_slab->prev;
4391 st = SLAB_LAST(PL_regmatch_slab);
4392 }
4393 PL_regmatch_state = st;
4394 scan = st->scan;
4395 next = st->next;
4396 n = st->n;
4397 locinput= st->locinput;
4398 nextchr = UCHARAT(locinput);
aa283a38
DM
4399
4400 switch (st->resume_state) {
aa283a38
DM
4401 case resume_CURLYX:
4402 goto resume_point_CURLYX;
4403 case resume_WHILEM1:
4404 goto resume_point_WHILEM1;
4405 case resume_WHILEM2:
4406 goto resume_point_WHILEM2;
4407 case resume_WHILEM3:
4408 goto resume_point_WHILEM3;
4409 case resume_WHILEM4:
4410 goto resume_point_WHILEM4;
4411 case resume_WHILEM5:
4412 goto resume_point_WHILEM5;
4413 case resume_WHILEM6:
4414 goto resume_point_WHILEM6;
77cb431f 4415
166ba7cd 4416 case TRIE_next:
40a82448
DM
4417 case CURLYM_A:
4418 case CURLYM_B:
4419 case EVAL_A:
4420 case IFMATCH_A:
4421 case BRANCH_next:
c255a977
DM
4422 case CURLY_B_max:
4423 case CURLY_B_min:
4424 case CURLY_B_min_known:
40a82448
DM
4425 break;
4426
aa283a38
DM
4427 default:
4428 Perl_croak(aTHX_ "regexp resume memory corruption");
4429 }
4430 }
4431 goto final_exit;
4633a7c4
LW
4432
4433no:
a3621e74 4434 DEBUG_EXECUTE_r(
7821416a
IZ
4435 PerlIO_printf(Perl_debug_log,
4436 "%*s %sfailed...%s\n",
e4584336 4437 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4438 );
7821416a
IZ
4439no_final:
4440do_no:
aa283a38 4441
4633a7c4 4442#ifdef DEBUGGING
3280af22 4443 PL_regindent--;
4633a7c4 4444#endif
95b24440 4445 result = 0;
5d9a96ca 4446
aa283a38
DM
4447 if (depth) {
4448 /* there's a previous state to backtrack to */
ab3bbdeb 4449 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
40a82448
DM
4450 depth--;
4451 st--;
4452 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4453 PL_regmatch_slab = PL_regmatch_slab->prev;
4454 st = SLAB_LAST(PL_regmatch_slab);
4455 }
4456 PL_regmatch_state = st;
4457 scan = st->scan;
4458 next = st->next;
4459 n = st->n;
4460 locinput= st->locinput;
4461 nextchr = UCHARAT(locinput);
4462
5d9a96ca 4463 switch (st->resume_state) {
95b24440
DM
4464 case resume_CURLYX:
4465 goto resume_point_CURLYX;
4466 case resume_WHILEM1:
4467 goto resume_point_WHILEM1;
4468 case resume_WHILEM2:
4469 goto resume_point_WHILEM2;
4470 case resume_WHILEM3:
4471 goto resume_point_WHILEM3;
4472 case resume_WHILEM4:
4473 goto resume_point_WHILEM4;
4474 case resume_WHILEM5:
4475 goto resume_point_WHILEM5;
4476 case resume_WHILEM6:
4477 goto resume_point_WHILEM6;
dad79028 4478
166ba7cd 4479 case TRIE_next:
40a82448
DM
4480 case EVAL_A:
4481 case BRANCH_next:
4482 case CURLYM_A:
4483 case CURLYM_B:
4484 case IFMATCH_A:
c255a977
DM
4485 case CURLY_B_max:
4486 case CURLY_B_min:
4487 case CURLY_B_min_known:
40a82448
DM
4488 if (yes_state == st)
4489 yes_state = st->u.yes.prev_yes_state;
4490 state_num = st->resume_state + 1; /* failure = success + 1 */
4491 goto reenter_switch;
dad79028 4492
95b24440
DM
4493 default:
4494 Perl_croak(aTHX_ "regexp resume memory corruption");
4495 }
95b24440 4496 }
aa283a38
DM
4497
4498final_exit:
4499
5d9a96ca
DM
4500 /* restore original high-water mark */
4501 PL_regmatch_slab = orig_slab;
4502 PL_regmatch_state = orig_state;
4503
4504 /* free all slabs above current one */
4505 if (orig_slab->next) {
c4fd8992 4506 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4507 orig_slab->next = NULL;
4508 while (sl) {
c4fd8992 4509 regmatch_slab * const osl = sl;
5d9a96ca 4510 sl = sl->next;
ad65c075 4511 Safefree(osl);
5d9a96ca
DM
4512 }
4513 }
4514
95b24440
DM
4515 return result;
4516
a687059c
LW
4517}
4518
4519/*
4520 - regrepeat - repeatedly match something simple, report how many
4521 */
4522/*
4523 * [This routine now assumes that it will only match on things of length 1.
4524 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4525 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4526 */
76e3520e 4527STATIC I32
32fc9b6a 4528S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4529{
27da23d5 4530 dVAR;
a0d0e21e 4531 register char *scan;
a0d0e21e 4532 register I32 c;
3280af22 4533 register char *loceol = PL_regeol;
a0ed51b3 4534 register I32 hardcount = 0;
53c4c00c 4535 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4536
3280af22 4537 scan = PL_reginput;
faf11cac
HS
4538 if (max == REG_INFTY)
4539 max = I32_MAX;
4540 else if (max < loceol - scan)
7f596f4c 4541 loceol = scan + max;
a0d0e21e 4542 switch (OP(p)) {
22c35a8c 4543 case REG_ANY:
1aa99e6b 4544 if (do_utf8) {
ffc61ed2 4545 loceol = PL_regeol;
1aa99e6b 4546 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4547 scan += UTF8SKIP(scan);
4548 hardcount++;
4549 }
4550 } else {
4551 while (scan < loceol && *scan != '\n')
4552 scan++;
a0ed51b3
LW
4553 }
4554 break;
ffc61ed2 4555 case SANY:
def8e4ea
JH
4556 if (do_utf8) {
4557 loceol = PL_regeol;
a0804c9e 4558 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4559 scan += UTF8SKIP(scan);
4560 hardcount++;
4561 }
4562 }
4563 else
4564 scan = loceol;
a0ed51b3 4565 break;
f33976b4
DB
4566 case CANY:
4567 scan = loceol;
4568 break;
090f7165
JH
4569 case EXACT: /* length of string is 1 */
4570 c = (U8)*STRING(p);
4571 while (scan < loceol && UCHARAT(scan) == c)
4572 scan++;
bbce6d69 4573 break;
4574 case EXACTF: /* length of string is 1 */
cd439c50 4575 c = (U8)*STRING(p);
bbce6d69 4576 while (scan < loceol &&
22c35a8c 4577 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4578 scan++;
4579 break;
4580 case EXACTFL: /* length of string is 1 */
3280af22 4581 PL_reg_flags |= RF_tainted;
cd439c50 4582 c = (U8)*STRING(p);
bbce6d69 4583 while (scan < loceol &&
22c35a8c 4584 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4585 scan++;
4586 break;
4587 case ANYOF:
ffc61ed2
JH
4588 if (do_utf8) {
4589 loceol = PL_regeol;
cfc92286 4590 while (hardcount < max && scan < loceol &&
32fc9b6a 4591 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4592 scan += UTF8SKIP(scan);
4593 hardcount++;
4594 }
4595 } else {
32fc9b6a 4596 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4597 scan++;
4598 }
a0d0e21e
LW
4599 break;
4600 case ALNUM:
1aa99e6b 4601 if (do_utf8) {
ffc61ed2 4602 loceol = PL_regeol;
1a4fad37 4603 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4604 while (hardcount < max && scan < loceol &&
3568d838 4605 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4606 scan += UTF8SKIP(scan);
4607 hardcount++;
4608 }
4609 } else {
4610 while (scan < loceol && isALNUM(*scan))
4611 scan++;
a0ed51b3
LW
4612 }
4613 break;
bbce6d69 4614 case ALNUML:
3280af22 4615 PL_reg_flags |= RF_tainted;
1aa99e6b 4616 if (do_utf8) {
ffc61ed2 4617 loceol = PL_regeol;
1aa99e6b
IH
4618 while (hardcount < max && scan < loceol &&
4619 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4620 scan += UTF8SKIP(scan);
4621 hardcount++;
4622 }
4623 } else {
4624 while (scan < loceol && isALNUM_LC(*scan))
4625 scan++;
a0ed51b3
LW
4626 }
4627 break;
a0d0e21e 4628 case NALNUM:
1aa99e6b 4629 if (do_utf8) {
ffc61ed2 4630 loceol = PL_regeol;
1a4fad37 4631 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4632 while (hardcount < max && scan < loceol &&
3568d838 4633 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4634 scan += UTF8SKIP(scan);
4635 hardcount++;
4636 }
4637 } else {
4638 while (scan < loceol && !isALNUM(*scan))
4639 scan++;
a0ed51b3
LW
4640 }
4641 break;
bbce6d69 4642 case NALNUML:
3280af22 4643 PL_reg_flags |= RF_tainted;
1aa99e6b 4644 if (do_utf8) {
ffc61ed2 4645 loceol = PL_regeol;
1aa99e6b
IH
4646 while (hardcount < max && scan < loceol &&
4647 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4648 scan += UTF8SKIP(scan);
4649 hardcount++;
4650 }
4651 } else {
4652 while (scan < loceol && !isALNUM_LC(*scan))
4653 scan++;
a0ed51b3
LW
4654 }
4655 break;
a0d0e21e 4656 case SPACE:
1aa99e6b 4657 if (do_utf8) {
ffc61ed2 4658 loceol = PL_regeol;
1a4fad37 4659 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4660 while (hardcount < max && scan < loceol &&
3568d838
JH
4661 (*scan == ' ' ||
4662 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4663 scan += UTF8SKIP(scan);
4664 hardcount++;
4665 }
4666 } else {
4667 while (scan < loceol && isSPACE(*scan))
4668 scan++;
a0ed51b3
LW
4669 }
4670 break;
bbce6d69 4671 case SPACEL:
3280af22 4672 PL_reg_flags |= RF_tainted;
1aa99e6b 4673 if (do_utf8) {
ffc61ed2 4674 loceol = PL_regeol;
1aa99e6b 4675 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4676 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4677 scan += UTF8SKIP(scan);
4678 hardcount++;
4679 }
4680 } else {
4681 while (scan < loceol && isSPACE_LC(*scan))
4682 scan++;
a0ed51b3
LW
4683 }
4684 break;
a0d0e21e 4685 case NSPACE:
1aa99e6b 4686 if (do_utf8) {
ffc61ed2 4687 loceol = PL_regeol;
1a4fad37 4688 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4689 while (hardcount < max && scan < loceol &&
3568d838
JH
4690 !(*scan == ' ' ||
4691 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4692 scan += UTF8SKIP(scan);
4693 hardcount++;
4694 }
4695 } else {
4696 while (scan < loceol && !isSPACE(*scan))
4697 scan++;
4698 break;
a0ed51b3 4699 }
bbce6d69 4700 case NSPACEL:
3280af22 4701 PL_reg_flags |= RF_tainted;
1aa99e6b 4702 if (do_utf8) {
ffc61ed2 4703 loceol = PL_regeol;
1aa99e6b 4704 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4705 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4706 scan += UTF8SKIP(scan);
4707 hardcount++;
4708 }
4709 } else {
4710 while (scan < loceol && !isSPACE_LC(*scan))
4711 scan++;
a0ed51b3
LW
4712 }
4713 break;
a0d0e21e 4714 case DIGIT:
1aa99e6b 4715 if (do_utf8) {
ffc61ed2 4716 loceol = PL_regeol;
1a4fad37 4717 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4718 while (hardcount < max && scan < loceol &&
3568d838 4719 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4720 scan += UTF8SKIP(scan);
4721 hardcount++;
4722 }
4723 } else {
4724 while (scan < loceol && isDIGIT(*scan))
4725 scan++;
a0ed51b3
LW
4726 }
4727 break;
a0d0e21e 4728 case NDIGIT:
1aa99e6b 4729 if (do_utf8) {
ffc61ed2 4730 loceol = PL_regeol;
1a4fad37 4731 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4732 while (hardcount < max && scan < loceol &&
3568d838 4733 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4734 scan += UTF8SKIP(scan);
4735 hardcount++;
4736 }
4737 } else {
4738 while (scan < loceol && !isDIGIT(*scan))
4739 scan++;
a0ed51b3
LW
4740 }
4741 break;
a0d0e21e
LW
4742 default: /* Called on something of 0 width. */
4743 break; /* So match right here or not at all. */
4744 }
a687059c 4745
a0ed51b3
LW
4746 if (hardcount)
4747 c = hardcount;
4748 else
4749 c = scan - PL_reginput;
3280af22 4750 PL_reginput = scan;
a687059c 4751
a3621e74 4752 DEBUG_r({
e68ec53f 4753 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4754 DEBUG_EXECUTE_r({
e68ec53f
YO
4755 SV * const prop = sv_newmortal();
4756 regprop(prog, prop, p);
4757 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4758 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4759 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4760 });
be8e71aa 4761 });
9041c2e3 4762
a0d0e21e 4763 return(c);
a687059c
LW
4764}
4765
c277df42 4766
be8e71aa 4767#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4768/*
ffc61ed2
JH
4769- regclass_swash - prepare the utf8 swash
4770*/
4771
4772SV *
32fc9b6a 4773Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4774{
97aff369 4775 dVAR;
9e55ce06
JH
4776 SV *sw = NULL;
4777 SV *si = NULL;
4778 SV *alt = NULL;
3dab1dad 4779 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 4780
4f639d21 4781 if (data && data->count) {
a3b680e6 4782 const U32 n = ARG(node);
ffc61ed2 4783
4f639d21
DM
4784 if (data->what[n] == 's') {
4785 SV * const rv = (SV*)data->data[n];
890ce7af 4786 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4787 SV **const ary = AvARRAY(av);
9e55ce06 4788 SV **a, **b;
9041c2e3 4789
711a919c 4790 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4791 * documentation of these array elements. */
4792
b11f357e 4793 si = *ary;
8f7f7219 4794 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4795 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4796
ffc61ed2
JH
4797 if (a)
4798 sw = *a;
4799 else if (si && doinit) {
4800 sw = swash_init("utf8", "", si, 1, 0);
4801 (void)av_store(av, 1, sw);
4802 }
9e55ce06
JH
4803 if (b)
4804 alt = *b;
ffc61ed2
JH
4805 }
4806 }
4807
9e55ce06
JH
4808 if (listsvp)
4809 *listsvp = si;
4810 if (altsvp)
4811 *altsvp = alt;
ffc61ed2
JH
4812
4813 return sw;
4814}
76234dfb 4815#endif
ffc61ed2
JH
4816
4817/*
ba7b4546 4818 - reginclass - determine if a character falls into a character class
832705d4
JH
4819
4820 The n is the ANYOF regnode, the p is the target string, lenp
4821 is pointer to the maximum length of how far to go in the p
4822 (if the lenp is zero, UTF8SKIP(p) is used),
4823 do_utf8 tells whether the target string is in UTF-8.
4824
bbce6d69 4825 */
4826
76e3520e 4827STATIC bool
32fc9b6a 4828S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4829{
27da23d5 4830 dVAR;
a3b680e6 4831 const char flags = ANYOF_FLAGS(n);
bbce6d69 4832 bool match = FALSE;
cc07378b 4833 UV c = *p;
ae9ddab8 4834 STRLEN len = 0;
9e55ce06 4835 STRLEN plen;
1aa99e6b 4836
19f67299
TS
4837 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4838 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
4839 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4840 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
4841 if (len == (STRLEN)-1)
4842 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4843 }
bbce6d69 4844
0f0076b4 4845 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4846 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4847 if (lenp)
4848 *lenp = 0;
ffc61ed2 4849 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4850 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4851 match = TRUE;
bbce6d69 4852 }
3568d838 4853 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4854 match = TRUE;
ffc61ed2 4855 if (!match) {
9e55ce06 4856 AV *av;
32fc9b6a 4857 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4858
4859 if (sw) {
3568d838 4860 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4861 match = TRUE;
4862 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4863 if (!match && lenp && av) {
4864 I32 i;
9e55ce06 4865 for (i = 0; i <= av_len(av); i++) {
890ce7af 4866 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4867 STRLEN len;
890ce7af 4868 const char * const s = SvPV_const(sv, len);
9e55ce06 4869
061b10df 4870 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4871 *lenp = len;
4872 match = TRUE;
4873 break;
4874 }
4875 }
4876 }
4877 if (!match) {
89ebb4a3 4878 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4879 STRLEN tmplen;
4880
9e55ce06
JH
4881 to_utf8_fold(p, tmpbuf, &tmplen);
4882 if (swash_fetch(sw, tmpbuf, do_utf8))
4883 match = TRUE;
4884 }
ffc61ed2
JH
4885 }
4886 }
bbce6d69 4887 }
9e55ce06 4888 if (match && lenp && *lenp == 0)
0f0076b4 4889 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4890 }
1aa99e6b 4891 if (!match && c < 256) {
ffc61ed2
JH
4892 if (ANYOF_BITMAP_TEST(n, c))
4893 match = TRUE;
4894 else if (flags & ANYOF_FOLD) {
eb160463 4895 U8 f;
a0ed51b3 4896
ffc61ed2
JH
4897 if (flags & ANYOF_LOCALE) {
4898 PL_reg_flags |= RF_tainted;
4899 f = PL_fold_locale[c];
4900 }
4901 else
4902 f = PL_fold[c];
4903 if (f != c && ANYOF_BITMAP_TEST(n, f))
4904 match = TRUE;
4905 }
4906
4907 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4908 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4909 if (
4910 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4911 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4912 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4913 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4914 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4915 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4916 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4917 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4918 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4919 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4920 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4921 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4922 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4923 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4924 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4925 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4926 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4927 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4928 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4929 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4930 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4931 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4932 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4933 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4934 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4935 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4936 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4937 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4938 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4939 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4940 ) /* How's that for a conditional? */
4941 {
4942 match = TRUE;
4943 }
a0ed51b3 4944 }
a0ed51b3
LW
4945 }
4946
a0ed51b3
LW
4947 return (flags & ANYOF_INVERT) ? !match : match;
4948}
161b471a 4949
dfe13c55 4950STATIC U8 *
0ce71af7 4951S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 4952{
97aff369 4953 dVAR;
a0ed51b3 4954 if (off >= 0) {
1aa99e6b 4955 while (off-- && s < lim) {
ffc61ed2 4956 /* XXX could check well-formedness here */
a0ed51b3 4957 s += UTF8SKIP(s);
ffc61ed2 4958 }
a0ed51b3
LW
4959 }
4960 else {
4961 while (off++) {
1aa99e6b 4962 if (s > lim) {
a0ed51b3 4963 s--;
ffc61ed2 4964 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4965 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4966 s--;
ffc61ed2
JH
4967 }
4968 /* XXX could check well-formedness here */
a0ed51b3
LW
4969 }
4970 }
4971 }
4972 return s;
4973}
161b471a 4974
dfe13c55 4975STATIC U8 *
0ce71af7 4976S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 4977{
97aff369 4978 dVAR;
a0ed51b3 4979 if (off >= 0) {
1aa99e6b 4980 while (off-- && s < lim) {
ffc61ed2 4981 /* XXX could check well-formedness here */
a0ed51b3 4982 s += UTF8SKIP(s);
ffc61ed2 4983 }
a0ed51b3 4984 if (off >= 0)
3dab1dad 4985 return NULL;
a0ed51b3
LW
4986 }
4987 else {
4988 while (off++) {
1aa99e6b 4989 if (s > lim) {
a0ed51b3 4990 s--;
ffc61ed2 4991 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4992 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4993 s--;
ffc61ed2
JH
4994 }
4995 /* XXX could check well-formedness here */
a0ed51b3
LW
4996 }
4997 else
4998 break;
4999 }
5000 if (off <= 0)
3dab1dad 5001 return NULL;
a0ed51b3
LW
5002 }
5003 return s;
5004}
51371543 5005
51371543 5006static void
acfe0abc 5007restore_pos(pTHX_ void *arg)
51371543 5008{
97aff369 5009 dVAR;
097eb12c 5010 regexp * const rex = (regexp *)arg;
51371543
GS
5011 if (PL_reg_eval_set) {
5012 if (PL_reg_oldsaved) {
4f639d21
DM
5013 rex->subbeg = PL_reg_oldsaved;
5014 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5015#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5016 rex->saved_copy = PL_nrs;
ed252734 5017#endif
4f639d21 5018 RX_MATCH_COPIED_on(rex);
51371543
GS
5019 }
5020 PL_reg_magic->mg_len = PL_reg_oldpos;
5021 PL_reg_eval_set = 0;
5022 PL_curpm = PL_reg_oldcurpm;
5023 }
5024}
33b8afdf
JH
5025
5026STATIC void
5027S_to_utf8_substr(pTHX_ register regexp *prog)
5028{
33b8afdf 5029 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5030 SV* const sv = newSVsv(prog->float_substr);
5031 prog->float_utf8 = sv;
33b8afdf
JH
5032 sv_utf8_upgrade(sv);
5033 if (SvTAIL(prog->float_substr))
5034 SvTAIL_on(sv);
5035 if (prog->float_substr == prog->check_substr)
5036 prog->check_utf8 = sv;
5037 }
5038 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5039 SV* const sv = newSVsv(prog->anchored_substr);
5040 prog->anchored_utf8 = sv;
33b8afdf
JH
5041 sv_utf8_upgrade(sv);
5042 if (SvTAIL(prog->anchored_substr))
5043 SvTAIL_on(sv);
5044 if (prog->anchored_substr == prog->check_substr)
5045 prog->check_utf8 = sv;
5046 }
5047}
5048
5049STATIC void
5050S_to_byte_substr(pTHX_ register regexp *prog)
5051{
97aff369 5052 dVAR;
33b8afdf 5053 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5054 SV* sv = newSVsv(prog->float_utf8);
5055 prog->float_substr = sv;
33b8afdf
JH
5056 if (sv_utf8_downgrade(sv, TRUE)) {
5057 if (SvTAIL(prog->float_utf8))
5058 SvTAIL_on(sv);
5059 } else {
5060 SvREFCNT_dec(sv);
5061 prog->float_substr = sv = &PL_sv_undef;
5062 }
5063 if (prog->float_utf8 == prog->check_utf8)
5064 prog->check_substr = sv;
5065 }
5066 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5067 SV* sv = newSVsv(prog->anchored_utf8);
5068 prog->anchored_substr = sv;
33b8afdf
JH
5069 if (sv_utf8_downgrade(sv, TRUE)) {
5070 if (SvTAIL(prog->anchored_utf8))
5071 SvTAIL_on(sv);
5072 } else {
5073 SvREFCNT_dec(sv);
5074 prog->anchored_substr = sv = &PL_sv_undef;
5075 }
5076 if (prog->anchored_utf8 == prog->check_utf8)
5077 prog->check_substr = sv;
5078 }
5079}
66610fdd
RGS
5080
5081/*
5082 * Local variables:
5083 * c-indentation-style: bsd
5084 * c-basic-offset: 4
5085 * indent-tabs-mode: t
5086 * End:
5087 *
37442d52
RGS
5088 * ex: set ts=8 sts=4 sw=4 noet:
5089 */