This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove the enable_debugging member from the structure, and instead
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
c5be433b
GS
50
51# define PERL_NO_GET_CONTEXT
9041c2e3 52#endif
56953603 53
a687059c 54/*
e50aee73 55 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
56 *
57 * Copyright (c) 1986 by University of Toronto.
58 * Written by Henry Spencer. Not derived from licensed software.
59 *
60 * Permission is granted to anyone to use this software for any
61 * purpose on any computer system, and to redistribute it freely,
62 * subject to the following restrictions:
63 *
64 * 1. The author is not responsible for the consequences of use of
65 * this software, no matter how awful, even if they arise
66 * from defects in it.
67 *
68 * 2. The origin of this software must not be misrepresented, either
69 * by explicit claim or by omission.
70 *
71 * 3. Altered versions must be plainly marked as such, and must not
72 * be misrepresented as being the original software.
73 *
74 **** Alterations to Henry's code are...
75 ****
4bb101f2 76 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 77 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 78 ****
9ef589d8
LW
79 **** You may distribute under the terms of either the GNU General Public
80 **** License or the Artistic License, as specified in the README file.
a687059c
LW
81 *
82 * Beware that some of this code is subtly aware of the way operator
83 * precedence is structured in regular expressions. Serious changes in
84 * regular-expression syntax might require a total rethink.
85 */
86#include "EXTERN.h"
864dbfa3 87#define PERL_IN_REGEXEC_C
a687059c 88#include "perl.h"
0f5d15d6 89
a687059c
LW
90#include "regcomp.h"
91
c277df42
IZ
92#define RF_tainted 1 /* tainted information used? */
93#define RF_warned 2 /* warned about big count? */
ce862d02 94#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
95#define RF_utf8 8 /* String contains multibyte chars? */
96
eb160463 97#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
98
99#define RS_init 1 /* eval environment created */
100#define RS_set 2 /* replsv value is set */
c277df42 101
a687059c
LW
102#ifndef STATIC
103#define STATIC static
104#endif
105
32fc9b6a 106#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
33b8afdf 112#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 113#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 114
52657f30
AL
115#define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
116 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
117 : (U8*)(pos + off)))
118#define HOPBACKc(pos, off) ((char*) \
119 ((PL_reg_match_utf8) \
120 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
efb30f32
HS
121 : (pos - off >= PL_bostr) \
122 ? (U8*)(pos - off) \
52657f30 123 : (U8*)NULL) \
efb30f32 124)
efb30f32 125
1aa99e6b 126#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c 127#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 129
1a4fad37
AL
130#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
131 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
132#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
133#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
134#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
135#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 136
5f80c4cf 137/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
138#define JUMPABLE(rn) ( \
139 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
140 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
141 OP(rn) == PLUS || OP(rn) == MINMOD || \
142 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
143)
144
cca55fe3
JP
145#define HAS_TEXT(rn) ( \
146 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
147)
e2d8ce26 148
a84d97b6
HS
149/*
150 Search for mandatory following text node; for lookahead, the text must
151 follow but for lookbehind (rn->flags != 0) we skip to the next step.
152*/
cca55fe3 153#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 154 while (JUMPABLE(rn)) \
a84d97b6 155 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 156 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
157 else if (OP(rn) == PLUS) \
158 rn = NEXTOPER(rn); \
a84d97b6
HS
159 else if (OP(rn) == IFMATCH) \
160 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 161 else rn += NEXT_OFF(rn); \
5f80c4cf 162} STMT_END
74750237 163
acfe0abc 164static void restore_pos(pTHX_ void *arg);
51371543 165
76e3520e 166STATIC CHECKPOINT
cea2e8a9 167S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 168{
97aff369 169 dVAR;
a3b680e6 170 const int retval = PL_savestack_ix;
b1ce53c5 171#define REGCP_PAREN_ELEMS 4
a3b680e6 172 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
173 int p;
174
e49a9654
IH
175 if (paren_elems_to_push < 0)
176 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
177
a01268b5 178#define REGCP_OTHER_ELEMS 6
4b3c1a47 179 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 180 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 181/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
182 SSPUSHINT(PL_regendp[p]);
183 SSPUSHINT(PL_regstartp[p]);
3280af22 184 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
185 SSPUSHINT(p);
186 }
b1ce53c5 187/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
188 SSPUSHINT(PL_regsize);
189 SSPUSHINT(*PL_reglastparen);
a01268b5 190 SSPUSHINT(*PL_reglastcloseparen);
3280af22 191 SSPUSHPTR(PL_reginput);
41123dfd
JH
192#define REGCP_FRAME_ELEMS 2
193/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
194 * are needed for the regexp context stack bookkeeping. */
195 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 196 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 197
a0d0e21e
LW
198 return retval;
199}
200
c277df42 201/* These are needed since we do not localize EVAL nodes: */
a3621e74 202# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 204 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 205
a3621e74 206# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 207 PerlIO_printf(Perl_debug_log, \
faccc32b 208 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 209 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 210
76e3520e 211STATIC char *
097eb12c 212S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 213{
97aff369 214 dVAR;
b1ce53c5 215 I32 i;
a0d0e21e 216 char *input;
b1ce53c5 217
a3621e74
YO
218 GET_RE_DEBUG_FLAGS_DECL;
219
b1ce53c5 220 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 221 i = SSPOPINT;
b1ce53c5
JH
222 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
223 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 224 input = (char *) SSPOPPTR;
a01268b5 225 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
226 *PL_reglastparen = SSPOPINT;
227 PL_regsize = SSPOPINT;
b1ce53c5
JH
228
229 /* Now restore the parentheses context. */
41123dfd
JH
230 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
231 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 232 I32 tmps;
097eb12c 233 U32 paren = (U32)SSPOPINT;
3280af22 234 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
235 PL_regstartp[paren] = SSPOPINT;
236 tmps = SSPOPINT;
3280af22
NIS
237 if (paren <= *PL_reglastparen)
238 PL_regendp[paren] = tmps;
a3621e74 239 DEBUG_EXECUTE_r(
c3464db5 240 PerlIO_printf(Perl_debug_log,
b900a521 241 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 242 (UV)paren, (IV)PL_regstartp[paren],
b900a521 243 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 244 (IV)PL_regendp[paren],
3280af22 245 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 246 );
a0d0e21e 247 }
a3621e74 248 DEBUG_EXECUTE_r(
4f639d21 249 if ((I32)(*PL_reglastparen + 1) <= rex->nparens) {
c3464db5 250 PerlIO_printf(Perl_debug_log,
faccc32b 251 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 252 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
253 }
254 );
daf18116 255#if 1
dafc8851
JH
256 /* It would seem that the similar code in regtry()
257 * already takes care of this, and in fact it is in
258 * a better location to since this code can #if 0-ed out
259 * but the code in regtry() is needed or otherwise tests
260 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
261 * (as of patchlevel 7877) will fail. Then again,
262 * this code seems to be necessary or otherwise
263 * building DynaLoader will fail:
264 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
265 * --jhi */
097eb12c
AL
266 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
267 if (i > PL_regsize)
268 PL_regstartp[i] = -1;
269 PL_regendp[i] = -1;
a0d0e21e 270 }
dafc8851 271#endif
a0d0e21e
LW
272 return input;
273}
274
02db2b7b 275#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 276
95b24440 277#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
278 if (paren) { \
279 if (n) { \
280 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
281 PL_regendp[paren] = input - PL_bostr; \
282 } \
283 else \
284 PL_regendp[paren] = -1; \
285 } \
95b24440
DM
286 REGMATCH(next, where); \
287 if (result) \
29d1e993
HS
288 sayYES; \
289 if (paren && n) \
290 PL_regendp[paren] = -1; \
291}
292
293
a687059c 294/*
e50aee73 295 * pregexec and friends
a687059c
LW
296 */
297
76234dfb 298#ifndef PERL_IN_XSUB_RE
a687059c 299/*
c277df42 300 - pregexec - match a regexp against a string
a687059c 301 */
c277df42 302I32
864dbfa3 303Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 304 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
305/* strend: pointer to null at end of string */
306/* strbeg: real beginning of string */
307/* minend: end of match must be >=minend after stringarg. */
308/* nosave: For optimizations. */
309{
310 return
9041c2e3 311 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
312 nosave ? 0 : REXEC_COPY_STR);
313}
76234dfb 314#endif
22e551b9 315
9041c2e3 316/*
cad2e5aa
JH
317 * Need to implement the following flags for reg_anch:
318 *
319 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
320 * USE_INTUIT_ML
321 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
322 * INTUIT_AUTORITATIVE_ML
323 * INTUIT_ONCE_NOML - Intuit can match in one location only.
324 * INTUIT_ONCE_ML
325 *
326 * Another flag for this function: SECOND_TIME (so that float substrs
327 * with giant delta may be not rechecked).
328 */
329
330/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
331
3f7c398e 332/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
333 Otherwise, only SvCUR(sv) is used to get strbeg. */
334
335/* XXXX We assume that strpos is strbeg unless sv. */
336
6eb5f6b9
JH
337/* XXXX Some places assume that there is a fixed substring.
338 An update may be needed if optimizer marks as "INTUITable"
339 RExen without fixed substrings. Similarly, it is assumed that
340 lengths of all the strings are no more than minlen, thus they
341 cannot come from lookahead.
342 (Or minlen should take into account lookahead.) */
343
2c2d71f5
JH
344/* A failure to find a constant substring means that there is no need to make
345 an expensive call to REx engine, thus we celebrate a failure. Similarly,
346 finding a substring too deep into the string means that less calls to
30944b6d
IZ
347 regtry() should be needed.
348
349 REx compiler's optimizer found 4 possible hints:
350 a) Anchored substring;
351 b) Fixed substring;
352 c) Whether we are anchored (beginning-of-line or \G);
353 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 354 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
355 string which does not contradict any of them.
356 */
2c2d71f5 357
6eb5f6b9
JH
358/* Most of decisions we do here should have been done at compile time.
359 The nodes of the REx which we used for the search should have been
360 deleted from the finite automaton. */
361
cad2e5aa
JH
362char *
363Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
364 char *strend, U32 flags, re_scream_pos_data *data)
365{
97aff369 366 dVAR;
b7953727 367 register I32 start_shift = 0;
cad2e5aa 368 /* Should be nonnegative! */
b7953727 369 register I32 end_shift = 0;
2c2d71f5
JH
370 register char *s;
371 register SV *check;
a1933d95 372 char *strbeg;
cad2e5aa 373 char *t;
a3b680e6 374 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 375 I32 ml_anch;
bd61b366
SS
376 register char *other_last = NULL; /* other substr checked before this */
377 char *check_at = NULL; /* check substr found at this pos */
1df70142 378 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 379#ifdef DEBUGGING
890ce7af
AL
380 const char * const i_strpos = strpos;
381 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 382#endif
a3621e74
YO
383
384 GET_RE_DEBUG_FLAGS_DECL;
385
a30b2f1f 386 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 387
b8d68ded 388 if (prog->reganch & ROPT_UTF8) {
a3621e74 389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
390 "UTF-8 regex...\n"));
391 PL_reg_flags |= RF_utf8;
392 }
393
a3621e74 394 DEBUG_EXECUTE_r({
1df70142 395 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
396 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
397 strpos;
1df70142 398 const int len = PL_reg_match_utf8 ?
b8d68ded 399 strlen(s) : strend - strpos;
2a782b5b
JH
400 if (!PL_colorset)
401 reginitcolors();
b8d68ded 402 if (PL_reg_match_utf8)
a3621e74 403 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 404 "UTF-8 target...\n"));
2a782b5b 405 PerlIO_printf(Perl_debug_log,
a0288114 406 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 407 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
408 prog->precomp,
409 PL_colors[1],
410 (strlen(prog->precomp) > 60 ? "..." : ""),
411 PL_colors[0],
412 (int)(len > 60 ? 60 : len),
413 s, PL_colors[1],
414 (len > 60 ? "..." : "")
415 );
416 });
cad2e5aa 417
c344f387
JH
418 /* CHR_DIST() would be more correct here but it makes things slow. */
419 if (prog->minlen > strend - strpos) {
a3621e74 420 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 421 "String too short... [re_intuit_start]\n"));
cad2e5aa 422 goto fail;
2c2d71f5 423 }
a1933d95 424 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 425 PL_regeol = strend;
33b8afdf
JH
426 if (do_utf8) {
427 if (!prog->check_utf8 && prog->check_substr)
428 to_utf8_substr(prog);
429 check = prog->check_utf8;
430 } else {
431 if (!prog->check_substr && prog->check_utf8)
432 to_byte_substr(prog);
433 check = prog->check_substr;
434 }
435 if (check == &PL_sv_undef) {
a3621e74 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
437 "Non-utf string cannot match utf check string\n"));
438 goto fail;
439 }
2c2d71f5 440 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
441 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
442 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 443 && !multiline ) ); /* Check after \n? */
cad2e5aa 444
7e25d62c
JH
445 if (!ml_anch) {
446 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
447 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 448 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
449 && sv && !SvROK(sv)
450 && (strpos != strbeg)) {
a3621e74 451 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
452 goto fail;
453 }
454 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 455 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 456 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
457 I32 slen;
458
1aa99e6b 459 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
460 if (SvTAIL(check)) {
461 slen = SvCUR(check); /* >= 1 */
cad2e5aa 462
9041c2e3 463 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 464 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 465 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 466 goto fail_finish;
cad2e5aa
JH
467 }
468 /* Now should match s[0..slen-2] */
469 slen--;
3f7c398e 470 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 471 || (slen > 1
3f7c398e 472 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 473 report_neq:
a3621e74 474 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
475 goto fail_finish;
476 }
cad2e5aa 477 }
3f7c398e 478 else if (*SvPVX_const(check) != *s
653099ff 479 || ((slen = SvCUR(check)) > 1
3f7c398e 480 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 481 goto report_neq;
c315bfe8 482 check_at = s;
2c2d71f5 483 goto success_at_start;
7e25d62c 484 }
cad2e5aa 485 }
2c2d71f5 486 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 487 s = strpos;
2c2d71f5 488 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 489 end_shift = prog->minlen - start_shift -
653099ff 490 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 491 if (!ml_anch) {
a3b680e6 492 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 493 - (SvTAIL(check) != 0);
a3b680e6 494 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
495
496 if (end_shift < eshift)
497 end_shift = eshift;
498 }
cad2e5aa 499 }
2c2d71f5 500 else { /* Can match at random position */
cad2e5aa
JH
501 ml_anch = 0;
502 s = strpos;
2c2d71f5
JH
503 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
504 /* Should be nonnegative! */
505 end_shift = prog->minlen - start_shift -
653099ff 506 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
507 }
508
2c2d71f5 509#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 510 if (end_shift < 0)
6bbae5e6 511 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
512#endif
513
2c2d71f5
JH
514 restart:
515 /* Find a possible match in the region s..strend by looking for
516 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 517 if (flags & REXEC_SCREAM) {
cad2e5aa 518 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 519 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 520
2c2d71f5
JH
521 if (PL_screamfirst[BmRARE(check)] >= 0
522 || ( BmRARE(check) == '\n'
523 && (BmPREVIOUS(check) == SvCUR(check) - 1)
524 && SvTAIL(check) ))
9041c2e3 525 s = screaminstr(sv, check,
2c2d71f5 526 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 527 else
2c2d71f5 528 goto fail_finish;
4addbd3b
HS
529 /* we may be pointing at the wrong string */
530 if (s && RX_MATCH_COPIED(prog))
3f7c398e 531 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
532 if (data)
533 *data->scream_olds = s;
534 }
f33976b4 535 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
536 s = fbm_instr((U8*)(s + start_shift),
537 (U8*)(strend - end_shift),
7fba1cd6 538 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 539 else
1aa99e6b
IH
540 s = fbm_instr(HOP3(s, start_shift, strend),
541 HOP3(strend, -end_shift, strbeg),
7fba1cd6 542 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
543
544 /* Update the count-of-usability, remove useless subpatterns,
545 unshift s. */
2c2d71f5 546
a0288114 547 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 548 (s ? "Found" : "Did not find"),
33b8afdf 549 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 550 PL_colors[0],
7b0972df 551 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 552 SvPVX_const(check),
2c2d71f5
JH
553 PL_colors[1], (SvTAIL(check) ? "$" : ""),
554 (s ? " at offset " : "...\n") ) );
555
556 if (!s)
557 goto fail_finish;
558
6eb5f6b9
JH
559 check_at = s;
560
2c2d71f5 561 /* Finish the diagnostic message */
a3621e74 562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
563
564 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
565 Start with the other substr.
566 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 567 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
568 *always* match. Probably should be marked during compile...
569 Probably it is right to do no SCREAM here...
570 */
571
33b8afdf 572 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 573 /* Take into account the "other" substring. */
2c2d71f5
JH
574 /* XXXX May be hopelessly wrong for UTF... */
575 if (!other_last)
6eb5f6b9 576 other_last = strpos;
33b8afdf 577 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
578 do_other_anchored:
579 {
890ce7af
AL
580 char * const last = HOP3c(s, -start_shift, strbeg);
581 char *last1, *last2;
2c2d71f5 582 char *s1 = s;
33b8afdf 583 SV* must;
2c2d71f5 584
2c2d71f5
JH
585 t = s - prog->check_offset_max;
586 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 587 && (!do_utf8
1aa99e6b 588 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 589 && t > strpos)))
30944b6d 590 /* EMPTY */;
2c2d71f5
JH
591 else
592 t = strpos;
1aa99e6b 593 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
594 if (t < other_last) /* These positions already checked */
595 t = other_last;
1aa99e6b 596 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
597 if (last < last1)
598 last1 = last;
599 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
600 /* On end-of-str: see comment below. */
33b8afdf
JH
601 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
602 if (must == &PL_sv_undef) {
603 s = (char*)NULL;
a3621e74 604 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
605 }
606 else
607 s = fbm_instr(
608 (unsigned char*)t,
609 HOP3(HOP3(last1, prog->anchored_offset, strend)
610 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
611 must,
7fba1cd6 612 multiline ? FBMrf_MULTILINE : 0
33b8afdf 613 );
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 615 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
616 (s ? "Found" : "Contradicts"),
617 PL_colors[0],
33b8afdf
JH
618 (int)(SvCUR(must)
619 - (SvTAIL(must)!=0)),
3f7c398e 620 SvPVX_const(must),
33b8afdf 621 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
622 if (!s) {
623 if (last1 >= last2) {
a3621e74 624 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
625 ", giving up...\n"));
626 goto fail_finish;
627 }
a3621e74 628 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 629 ", trying floating at offset %ld...\n",
1aa99e6b
IH
630 (long)(HOP3c(s1, 1, strend) - i_strpos)));
631 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
632 s = HOP3c(last, 1, strend);
2c2d71f5
JH
633 goto restart;
634 }
635 else {
a3621e74 636 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 637 (long)(s - i_strpos)));
1aa99e6b
IH
638 t = HOP3c(s, -prog->anchored_offset, strbeg);
639 other_last = HOP3c(s, 1, strend);
30944b6d 640 s = s1;
2c2d71f5
JH
641 if (t == strpos)
642 goto try_at_start;
2c2d71f5
JH
643 goto try_at_offset;
644 }
30944b6d 645 }
2c2d71f5
JH
646 }
647 else { /* Take into account the floating substring. */
33b8afdf
JH
648 char *last, *last1;
649 char *s1 = s;
650 SV* must;
651
652 t = HOP3c(s, -start_shift, strbeg);
653 last1 = last =
654 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
655 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
656 last = HOP3c(t, prog->float_max_offset, strend);
657 s = HOP3c(t, prog->float_min_offset, strend);
658 if (s < other_last)
659 s = other_last;
2c2d71f5 660 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
661 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
662 /* fbm_instr() takes into account exact value of end-of-str
663 if the check is SvTAIL(ed). Since false positives are OK,
664 and end-of-str is not later than strend we are OK. */
665 if (must == &PL_sv_undef) {
666 s = (char*)NULL;
a3621e74 667 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
668 }
669 else
2c2d71f5 670 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
671 (unsigned char*)last + SvCUR(must)
672 - (SvTAIL(must)!=0),
7fba1cd6 673 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
675 (s ? "Found" : "Contradicts"),
676 PL_colors[0],
677 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 678 SvPVX_const(must),
33b8afdf
JH
679 PL_colors[1], (SvTAIL(must) ? "$" : "")));
680 if (!s) {
681 if (last1 == last) {
a3621e74 682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
683 ", giving up...\n"));
684 goto fail_finish;
2c2d71f5 685 }
a3621e74 686 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
687 ", trying anchored starting at offset %ld...\n",
688 (long)(s1 + 1 - i_strpos)));
689 other_last = last;
690 s = HOP3c(t, 1, strend);
691 goto restart;
692 }
693 else {
a3621e74 694 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
695 (long)(s - i_strpos)));
696 other_last = s; /* Fix this later. --Hugo */
697 s = s1;
698 if (t == strpos)
699 goto try_at_start;
700 goto try_at_offset;
701 }
2c2d71f5 702 }
cad2e5aa 703 }
2c2d71f5
JH
704
705 t = s - prog->check_offset_max;
2c2d71f5 706 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 707 && (!do_utf8
1aa99e6b
IH
708 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
709 && t > strpos))) {
2c2d71f5
JH
710 /* Fixed substring is found far enough so that the match
711 cannot start at strpos. */
712 try_at_offset:
cad2e5aa 713 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
714 /* Eventually fbm_*() should handle this, but often
715 anchored_offset is not 0, so this check will not be wasted. */
716 /* XXXX In the code below we prefer to look for "^" even in
717 presence of anchored substrings. And we search even
718 beyond the found float position. These pessimizations
719 are historical artefacts only. */
720 find_anchor:
2c2d71f5 721 while (t < strend - prog->minlen) {
cad2e5aa 722 if (*t == '\n') {
4ee3650e 723 if (t < check_at - prog->check_offset_min) {
33b8afdf 724 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
725 /* Since we moved from the found position,
726 we definitely contradict the found anchored
30944b6d
IZ
727 substr. Due to the above check we do not
728 contradict "check" substr.
729 Thus we can arrive here only if check substr
730 is float. Redo checking for "other"=="fixed".
731 */
9041c2e3 732 strpos = t + 1;
a3621e74 733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 734 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
735 goto do_other_anchored;
736 }
4ee3650e
GS
737 /* We don't contradict the found floating substring. */
738 /* XXXX Why not check for STCLASS? */
cad2e5aa 739 s = t + 1;
a3621e74 740 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 741 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
742 goto set_useful;
743 }
4ee3650e
GS
744 /* Position contradicts check-string */
745 /* XXXX probably better to look for check-string
746 than for "\n", so one should lower the limit for t? */
a3621e74 747 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 748 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 749 other_last = strpos = s = t + 1;
cad2e5aa
JH
750 goto restart;
751 }
752 t++;
753 }
a3621e74 754 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 755 PL_colors[0], PL_colors[1]));
2c2d71f5 756 goto fail_finish;
cad2e5aa 757 }
f5952150 758 else {
a3621e74 759 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 760 PL_colors[0], PL_colors[1]));
f5952150 761 }
cad2e5aa
JH
762 s = t;
763 set_useful:
33b8afdf 764 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
765 }
766 else {
f5952150 767 /* The found string does not prohibit matching at strpos,
2c2d71f5 768 - no optimization of calling REx engine can be performed,
f5952150
GS
769 unless it was an MBOL and we are not after MBOL,
770 or a future STCLASS check will fail this. */
2c2d71f5
JH
771 try_at_start:
772 /* Even in this situation we may use MBOL flag if strpos is offset
773 wrt the start of the string. */
05b4157f 774 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 775 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
776 /* May be due to an implicit anchor of m{.*foo} */
777 && !(prog->reganch & ROPT_IMPLICIT))
778 {
cad2e5aa
JH
779 t = strpos;
780 goto find_anchor;
781 }
a3621e74 782 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 783 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 784 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 785 );
2c2d71f5 786 success_at_start:
30944b6d 787 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
788 && (do_utf8 ? (
789 prog->check_utf8 /* Could be deleted already */
790 && --BmUSEFUL(prog->check_utf8) < 0
791 && (prog->check_utf8 == prog->float_utf8)
792 ) : (
793 prog->check_substr /* Could be deleted already */
794 && --BmUSEFUL(prog->check_substr) < 0
795 && (prog->check_substr == prog->float_substr)
796 )))
66e933ab 797 {
cad2e5aa 798 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
800 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
801 if (do_utf8 ? prog->check_substr : prog->check_utf8)
802 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
803 prog->check_substr = prog->check_utf8 = NULL; /* disable */
804 prog->float_substr = prog->float_utf8 = NULL; /* clear */
805 check = NULL; /* abort */
cad2e5aa 806 s = strpos;
3cf5c195
IZ
807 /* XXXX This is a remnant of the old implementation. It
808 looks wasteful, since now INTUIT can use many
6eb5f6b9 809 other heuristics. */
cad2e5aa
JH
810 prog->reganch &= ~RE_USE_INTUIT;
811 }
812 else
813 s = strpos;
814 }
815
6eb5f6b9
JH
816 /* Last resort... */
817 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
818 if (prog->regstclass) {
819 /* minlen == 0 is possible if regstclass is \b or \B,
820 and the fixed substr is ''$.
821 Since minlen is already taken into account, s+1 is before strend;
822 accidentally, minlen >= 1 guaranties no false positives at s + 1
823 even for \b or \B. But (minlen? 1 : 0) below assumes that
824 regstclass does not come from lookahead... */
825 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
826 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 827 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 828 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 829 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 830 : 1);
a3b680e6 831 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 832 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 833 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
834 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
835 cl_l, strend)
836 : strend);
6eb5f6b9
JH
837
838 t = s;
3b0527fe 839 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
840 if (!s) {
841#ifdef DEBUGGING
cbbf8932 842 const char *what = NULL;
6eb5f6b9
JH
843#endif
844 if (endpos == strend) {
a3621e74 845 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
846 "Could not match STCLASS...\n") );
847 goto fail;
848 }
a3621e74 849 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 850 "This position contradicts STCLASS...\n") );
653099ff
GS
851 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
852 goto fail;
6eb5f6b9 853 /* Contradict one of substrings */
33b8afdf
JH
854 if (prog->anchored_substr || prog->anchored_utf8) {
855 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 856 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 857 hop_and_restart:
1aa99e6b 858 s = HOP3c(t, 1, strend);
66e933ab
GS
859 if (s + start_shift + end_shift > strend) {
860 /* XXXX Should be taken into account earlier? */
a3621e74 861 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
862 "Could not match STCLASS...\n") );
863 goto fail;
864 }
5e39e1e5
HS
865 if (!check)
866 goto giveup;
a3621e74 867 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 868 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
869 what, (long)(s + start_shift - i_strpos)) );
870 goto restart;
871 }
66e933ab 872 /* Have both, check_string is floating */
6eb5f6b9
JH
873 if (t + start_shift >= check_at) /* Contradicts floating=check */
874 goto retry_floating_check;
875 /* Recheck anchored substring, but not floating... */
9041c2e3 876 s = check_at;
5e39e1e5
HS
877 if (!check)
878 goto giveup;
a3621e74 879 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 880 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
881 (long)(other_last - i_strpos)) );
882 goto do_other_anchored;
883 }
60e71179
GS
884 /* Another way we could have checked stclass at the
885 current position only: */
886 if (ml_anch) {
887 s = t = t + 1;
5e39e1e5
HS
888 if (!check)
889 goto giveup;
a3621e74 890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 891 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 892 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 893 goto try_at_offset;
66e933ab 894 }
33b8afdf 895 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 896 goto fail;
6eb5f6b9
JH
897 /* Check is floating subtring. */
898 retry_floating_check:
899 t = check_at - start_shift;
a3621e74 900 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
901 goto hop_and_restart;
902 }
b7953727 903 if (t != s) {
a3621e74 904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 905 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
906 (long)(t - i_strpos), (long)(s - i_strpos))
907 );
908 }
909 else {
a3621e74 910 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
911 "Does not contradict STCLASS...\n");
912 );
913 }
6eb5f6b9 914 }
5e39e1e5 915 giveup:
a3621e74 916 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
917 PL_colors[4], (check ? "Guessed" : "Giving up"),
918 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 919 return s;
2c2d71f5
JH
920
921 fail_finish: /* Substring not found */
33b8afdf
JH
922 if (prog->check_substr || prog->check_utf8) /* could be removed already */
923 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 924 fail:
a3621e74 925 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 926 PL_colors[4], PL_colors[5]));
bd61b366 927 return NULL;
cad2e5aa 928}
9661b544 929
6eb5f6b9 930/* We know what class REx starts with. Try to find this position... */
3b0527fe
DM
931/* if reginfo is NULL, its a dryrun */
932
3c3eec57 933STATIC char *
3b0527fe
DM
934S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char
935*strend, const regmatch_info *reginfo)
a687059c 936{
27da23d5 937 dVAR;
1df70142 938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 939 char *m;
d8093b23 940 STRLEN ln;
5dab1207 941 STRLEN lnc;
078c425b 942 register STRLEN uskip;
d8093b23
G
943 unsigned int c1;
944 unsigned int c2;
6eb5f6b9
JH
945 char *e;
946 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 947 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 948
6eb5f6b9
JH
949 /* We know what class it must start with. */
950 switch (OP(c)) {
6eb5f6b9 951 case ANYOF:
388cc4de 952 if (do_utf8) {
078c425b 953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 958 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
959 goto got_it;
960 else
961 tmp = doevery;
962 }
963 else
964 tmp = 1;
078c425b 965 s += uskip;
388cc4de
HS
966 }
967 }
968 else {
969 while (s < strend) {
970 STRLEN skip = 1;
971
32fc9b6a 972 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
3b0527fe 977 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
978 goto got_it;
979 else
980 tmp = doevery;
981 }
982 else
983 tmp = 1;
984 s += skip;
985 }
a0d0e21e 986 }
6eb5f6b9 987 break;
f33976b4
DB
988 case CANY:
989 while (s < strend) {
3b0527fe 990 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
991 goto got_it;
992 else
993 tmp = doevery;
994 s++;
995 }
996 break;
6eb5f6b9 997 case EXACTF:
5dab1207
NIS
998 m = STRING(c);
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1001 if (UTF) {
a2a2844f 1002 STRLEN ulen1, ulen2;
5dab1207 1003 U8 *sm = (U8 *) m;
89ebb4a3
JH
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1007
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1010
89ebb4a3 1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1012 0, uniflags);
89ebb4a3 1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1014 0, uniflags);
5dab1207
NIS
1015 lnc = 0;
1016 while (sm < ((U8 *) m + ln)) {
1017 lnc++;
1018 sm += UTF8SKIP(sm);
1019 }
1aa99e6b
IH
1020 }
1021 else {
1022 c1 = *(U8*)m;
1023 c2 = PL_fold[c1];
1024 }
6eb5f6b9
JH
1025 goto do_exactf;
1026 case EXACTFL:
5dab1207
NIS
1027 m = STRING(c);
1028 ln = STR_LEN(c);
1029 lnc = (I32) ln;
d8093b23 1030 c1 = *(U8*)m;
6eb5f6b9
JH
1031 c2 = PL_fold_locale[c1];
1032 do_exactf:
db12adc6 1033 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1034
3b0527fe 1035 if (!reginfo && e < s)
6eb5f6b9 1036 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1037
60a8b682
JH
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
7f16dd3d
JH
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
60a8b682 1048
1aa99e6b 1049 if (do_utf8) {
575cac57 1050 UV c, f;
89ebb4a3 1051 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1052 STRLEN len, foldlen;
4ad0818d 1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1054 if (c1 == c2) {
5dab1207
NIS
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1aa99e6b 1057 while (s <= e) {
89ebb4a3 1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1059 uniflags);
80aecb99
JH
1060 if ( c == c1
1061 && (ln == len ||
66423254 1062 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1063 m, (char **)0, ln, (bool)UTF))
3b0527fe 1064 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1065 goto got_it;
80aecb99 1066 else {
1df70142 1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1068 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1070 if ( f != c
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
66423254
JH
1073 !ibcmp_utf8((char *) foldbuf,
1074 (char **)0, foldlen, do_utf8,
d07ddd77 1075 m,
eb160463 1076 (char **)0, ln, (bool)UTF))
3b0527fe 1077 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1078 goto got_it;
1079 }
1aa99e6b
IH
1080 s += len;
1081 }
09091399
JH
1082 }
1083 else {
1aa99e6b 1084 while (s <= e) {
89ebb4a3 1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1086 uniflags);
80aecb99 1087
60a8b682 1088 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
880bd946
JH
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1098
1099 if ( (c == c1 || c == c2)
1100 && (ln == len ||
66423254 1101 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1102 m, (char **)0, ln, (bool)UTF))
3b0527fe 1103 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1104 goto got_it;
80aecb99 1105 else {
1df70142 1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1107 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1109 if ( f != c
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
a6872d42 1112 !ibcmp_utf8((char *) foldbuf,
66423254 1113 (char **)0, foldlen, do_utf8,
d07ddd77 1114 m,
eb160463 1115 (char **)0, ln, (bool)UTF))
3b0527fe 1116 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1117 goto got_it;
1118 }
1aa99e6b
IH
1119 s += len;
1120 }
09091399 1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 if (c1 == c2)
1125 while (s <= e) {
1126 if ( *(U8*)s == c1
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
3b0527fe 1130 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1131 goto got_it;
1132 s++;
1133 }
1134 else
1135 while (s <= e) {
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1138 ? ibcmp(s, m, ln)
1139 : ibcmp_locale(s, m, ln)))
3b0527fe 1140 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1141 goto got_it;
1142 s++;
1143 }
b3c9acc1
IZ
1144 }
1145 break;
bbce6d69 1146 case BOUNDL:
3280af22 1147 PL_reg_flags |= RF_tainted;
bbce6d69 1148 /* FALL THROUGH */
a0d0e21e 1149 case BOUND:
ffc61ed2 1150 if (do_utf8) {
12d33761 1151 if (s == PL_bostr)
ffc61ed2
JH
1152 tmp = '\n';
1153 else {
6136c704 1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1156 }
1157 tmp = ((OP(c) == BOUND ?
9041c2e3 1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1159 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1161 if (tmp == !(OP(c) == BOUND ?
3568d838 1162 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1163 isALNUM_LC_utf8((U8*)s)))
1164 {
1165 tmp = !tmp;
3b0527fe 1166 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1167 goto got_it;
1168 }
078c425b 1169 s += uskip;
a687059c 1170 }
a0d0e21e 1171 }
667bb95a 1172 else {
12d33761 1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1176 if (tmp ==
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1178 tmp = !tmp;
3b0527fe 1179 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1180 goto got_it;
1181 }
1182 s++;
a0ed51b3 1183 }
a0ed51b3 1184 }
3b0527fe 1185 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1186 goto got_it;
1187 break;
bbce6d69 1188 case NBOUNDL:
3280af22 1189 PL_reg_flags |= RF_tainted;
bbce6d69 1190 /* FALL THROUGH */
a0d0e21e 1191 case NBOUND:
ffc61ed2 1192 if (do_utf8) {
12d33761 1193 if (s == PL_bostr)
ffc61ed2
JH
1194 tmp = '\n';
1195 else {
6136c704 1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1198 }
1199 tmp = ((OP(c) == NBOUND ?
9041c2e3 1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1201 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1203 if (tmp == !(OP(c) == NBOUND ?
3568d838 1204 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1205 isALNUM_LC_utf8((U8*)s)))
1206 tmp = !tmp;
3b0527fe 1207 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1208 goto got_it;
078c425b 1209 s += uskip;
ffc61ed2 1210 }
a0d0e21e 1211 }
667bb95a 1212 else {
12d33761 1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1217 if (tmp ==
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1219 tmp = !tmp;
3b0527fe 1220 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1221 goto got_it;
1222 s++;
1223 }
a0ed51b3 1224 }
3b0527fe 1225 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1226 goto got_it;
1227 break;
a0d0e21e 1228 case ALNUM:
ffc61ed2 1229 if (do_utf8) {
1a4fad37 1230 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1233 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
bbce6d69 1238 else
ffc61ed2 1239 tmp = 1;
078c425b 1240 s += uskip;
bbce6d69 1241 }
bbce6d69 1242 }
ffc61ed2
JH
1243 else {
1244 while (s < strend) {
1245 if (isALNUM(*s)) {
3b0527fe 1246 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1247 goto got_it;
1248 else
1249 tmp = doevery;
1250 }
a0ed51b3 1251 else
ffc61ed2
JH
1252 tmp = 1;
1253 s++;
a0ed51b3 1254 }
a0ed51b3
LW
1255 }
1256 break;
bbce6d69 1257 case ALNUML:
3280af22 1258 PL_reg_flags |= RF_tainted;
ffc61ed2 1259 if (do_utf8) {
078c425b 1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1261 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1262 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a687059c 1267 else
ffc61ed2 1268 tmp = 1;
078c425b 1269 s += uskip;
a0d0e21e 1270 }
a0d0e21e 1271 }
ffc61ed2
JH
1272 else {
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
3b0527fe 1275 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1276 goto got_it;
1277 else
1278 tmp = doevery;
1279 }
a0ed51b3 1280 else
ffc61ed2
JH
1281 tmp = 1;
1282 s++;
a0ed51b3 1283 }
a0ed51b3
LW
1284 }
1285 break;
a0d0e21e 1286 case NALNUM:
ffc61ed2 1287 if (do_utf8) {
1a4fad37 1288 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1291 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
bbce6d69 1296 else
ffc61ed2 1297 tmp = 1;
078c425b 1298 s += uskip;
bbce6d69 1299 }
bbce6d69 1300 }
ffc61ed2
JH
1301 else {
1302 while (s < strend) {
1303 if (!isALNUM(*s)) {
3b0527fe 1304 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1305 goto got_it;
1306 else
1307 tmp = doevery;
1308 }
a0ed51b3 1309 else
ffc61ed2
JH
1310 tmp = 1;
1311 s++;
a0ed51b3 1312 }
a0ed51b3
LW
1313 }
1314 break;
bbce6d69 1315 case NALNUML:
3280af22 1316 PL_reg_flags |= RF_tainted;
ffc61ed2 1317 if (do_utf8) {
078c425b 1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1319 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1320 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a687059c 1325 else
ffc61ed2 1326 tmp = 1;
078c425b 1327 s += uskip;
a687059c 1328 }
a0d0e21e 1329 }
ffc61ed2
JH
1330 else {
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
3b0527fe 1333 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1334 goto got_it;
1335 else
1336 tmp = doevery;
1337 }
a0ed51b3 1338 else
ffc61ed2
JH
1339 tmp = 1;
1340 s++;
a0ed51b3 1341 }
a0ed51b3
LW
1342 }
1343 break;
a0d0e21e 1344 case SPACE:
ffc61ed2 1345 if (do_utf8) {
1a4fad37 1346 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1349 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0d0e21e 1354 else
ffc61ed2 1355 tmp = 1;
078c425b 1356 s += uskip;
2304df62 1357 }
a0d0e21e 1358 }
ffc61ed2
JH
1359 else {
1360 while (s < strend) {
1361 if (isSPACE(*s)) {
3b0527fe 1362 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1363 goto got_it;
1364 else
1365 tmp = doevery;
1366 }
a0ed51b3 1367 else
ffc61ed2
JH
1368 tmp = 1;
1369 s++;
a0ed51b3 1370 }
a0ed51b3
LW
1371 }
1372 break;
bbce6d69 1373 case SPACEL:
3280af22 1374 PL_reg_flags |= RF_tainted;
ffc61ed2 1375 if (do_utf8) {
078c425b 1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1378 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
bbce6d69 1383 else
ffc61ed2 1384 tmp = 1;
078c425b 1385 s += uskip;
bbce6d69 1386 }
bbce6d69 1387 }
ffc61ed2
JH
1388 else {
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
3b0527fe 1391 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1392 goto got_it;
1393 else
1394 tmp = doevery;
1395 }
a0ed51b3 1396 else
ffc61ed2
JH
1397 tmp = 1;
1398 s++;
a0ed51b3 1399 }
a0ed51b3
LW
1400 }
1401 break;
a0d0e21e 1402 case NSPACE:
ffc61ed2 1403 if (do_utf8) {
1a4fad37 1404 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1407 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0d0e21e 1412 else
ffc61ed2 1413 tmp = 1;
078c425b 1414 s += uskip;
a687059c 1415 }
a0d0e21e 1416 }
ffc61ed2
JH
1417 else {
1418 while (s < strend) {
1419 if (!isSPACE(*s)) {
3b0527fe 1420 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1421 goto got_it;
1422 else
1423 tmp = doevery;
1424 }
a0ed51b3 1425 else
ffc61ed2
JH
1426 tmp = 1;
1427 s++;
a0ed51b3 1428 }
a0ed51b3
LW
1429 }
1430 break;
bbce6d69 1431 case NSPACEL:
3280af22 1432 PL_reg_flags |= RF_tainted;
ffc61ed2 1433 if (do_utf8) {
078c425b 1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1436 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
bbce6d69 1441 else
ffc61ed2 1442 tmp = 1;
078c425b 1443 s += uskip;
bbce6d69 1444 }
bbce6d69 1445 }
ffc61ed2
JH
1446 else {
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
3b0527fe 1449 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1450 goto got_it;
1451 else
1452 tmp = doevery;
1453 }
a0ed51b3 1454 else
ffc61ed2
JH
1455 tmp = 1;
1456 s++;
a0ed51b3 1457 }
a0ed51b3
LW
1458 }
1459 break;
a0d0e21e 1460 case DIGIT:
ffc61ed2 1461 if (do_utf8) {
1a4fad37 1462 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1465 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0d0e21e 1470 else
ffc61ed2 1471 tmp = 1;
078c425b 1472 s += uskip;
2b69d0c2 1473 }
a0d0e21e 1474 }
ffc61ed2
JH
1475 else {
1476 while (s < strend) {
1477 if (isDIGIT(*s)) {
3b0527fe 1478 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1479 goto got_it;
1480 else
1481 tmp = doevery;
1482 }
a0ed51b3 1483 else
ffc61ed2
JH
1484 tmp = 1;
1485 s++;
a0ed51b3 1486 }
a0ed51b3
LW
1487 }
1488 break;
b8c5462f
JH
1489 case DIGITL:
1490 PL_reg_flags |= RF_tainted;
ffc61ed2 1491 if (do_utf8) {
078c425b 1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1493 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1494 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2 1500 tmp = 1;
078c425b 1501 s += uskip;
b8c5462f 1502 }
b8c5462f 1503 }
ffc61ed2
JH
1504 else {
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
3b0527fe 1507 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1508 goto got_it;
1509 else
1510 tmp = doevery;
1511 }
b8c5462f 1512 else
ffc61ed2
JH
1513 tmp = 1;
1514 s++;
b8c5462f 1515 }
b8c5462f
JH
1516 }
1517 break;
a0d0e21e 1518 case NDIGIT:
ffc61ed2 1519 if (do_utf8) {
1a4fad37 1520 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1523 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0d0e21e 1528 else
ffc61ed2 1529 tmp = 1;
078c425b 1530 s += uskip;
a687059c 1531 }
a0d0e21e 1532 }
ffc61ed2
JH
1533 else {
1534 while (s < strend) {
1535 if (!isDIGIT(*s)) {
3b0527fe 1536 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1537 goto got_it;
1538 else
1539 tmp = doevery;
1540 }
a0ed51b3 1541 else
ffc61ed2
JH
1542 tmp = 1;
1543 s++;
a0ed51b3 1544 }
a0ed51b3
LW
1545 }
1546 break;
b8c5462f
JH
1547 case NDIGITL:
1548 PL_reg_flags |= RF_tainted;
ffc61ed2 1549 if (do_utf8) {
078c425b 1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1551 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1552 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
b8c5462f 1557 else
ffc61ed2 1558 tmp = 1;
078c425b 1559 s += uskip;
b8c5462f 1560 }
a0ed51b3 1561 }
ffc61ed2
JH
1562 else {
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
3b0527fe 1565 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1566 goto got_it;
1567 else
1568 tmp = doevery;
1569 }
cf93c79d 1570 else
ffc61ed2
JH
1571 tmp = 1;
1572 s++;
b8c5462f 1573 }
b8c5462f
JH
1574 }
1575 break;
b3c9acc1 1576 default:
3c3eec57
GS
1577 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1578 break;
d6a28714 1579 }
6eb5f6b9
JH
1580 return 0;
1581 got_it:
1582 return s;
1583}
1584
1585/*
1586 - regexec_flags - match a regexp against a string
1587 */
1588I32
1589Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1590 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1591/* strend: pointer to null at end of string */
1592/* strbeg: real beginning of string */
1593/* minend: end of match must be >=minend after stringarg. */
1594/* data: May be used for some additional optimizations. */
1595/* nosave: For optimizations. */
1596{
97aff369 1597 dVAR;
6eb5f6b9
JH
1598 register char *s;
1599 register regnode *c;
1600 register char *startpos = stringarg;
6eb5f6b9
JH
1601 I32 minlen; /* must match at least this many chars */
1602 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1603 I32 end_shift = 0; /* Same for the end. */ /* CC */
1604 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1605 char *scream_olds = NULL;
6eb5f6b9 1606 SV* oreplsv = GvSV(PL_replgv);
1df70142 1607 const bool do_utf8 = DO_UTF8(sv);
2757e526 1608 I32 multiline;
2a782b5b 1609#ifdef DEBUGGING
2757e526
JH
1610 SV* dsv0;
1611 SV* dsv1;
2a782b5b 1612#endif
3b0527fe 1613 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1614
1615 GET_RE_DEBUG_FLAGS_DECL;
1616
9d4ba2ae 1617 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1618
1619 /* Be paranoid... */
1620 if (prog == NULL || startpos == NULL) {
1621 Perl_croak(aTHX_ "NULL regexp parameter");
1622 return 0;
1623 }
1624
2757e526 1625 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1626 reginfo.prog = prog;
2757e526
JH
1627
1628#ifdef DEBUGGING
1629 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1630 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1631#endif
1632
bac06658
JH
1633 RX_MATCH_UTF8_set(prog, do_utf8);
1634
6eb5f6b9 1635 minlen = prog->minlen;
61a36c01 1636 if (strend - startpos < minlen) {
a3621e74 1637 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1638 "String too short [regexec_flags]...\n"));
1639 goto phooey;
1aa99e6b 1640 }
6eb5f6b9 1641
6eb5f6b9
JH
1642 /* Check validity of program. */
1643 if (UCHARAT(prog->program) != REG_MAGIC) {
1644 Perl_croak(aTHX_ "corrupted regexp program");
1645 }
1646
1647 PL_reg_flags = 0;
1648 PL_reg_eval_set = 0;
1649 PL_reg_maxiter = 0;
1650
1651 if (prog->reganch & ROPT_UTF8)
1652 PL_reg_flags |= RF_utf8;
1653
1654 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1655 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1656 PL_bostr = strbeg;
3b0527fe 1657 reginfo.sv = sv;
6eb5f6b9
JH
1658
1659 /* Mark end of line for $ (and such) */
1660 PL_regeol = strend;
1661
1662 /* see how far we have to get to not match where we matched before */
3b0527fe 1663 reginfo.till = startpos+minend;
6eb5f6b9 1664
6eb5f6b9
JH
1665 /* If there is a "must appear" string, look for it. */
1666 s = startpos;
1667
3b0527fe 1668 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1669 MAGIC *mg;
1670
1671 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1672 reginfo.ganch = startpos;
6eb5f6b9
JH
1673 else if (sv && SvTYPE(sv) >= SVt_PVMG
1674 && SvMAGIC(sv)
14befaf4
DM
1675 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1676 && mg->mg_len >= 0) {
3b0527fe 1677 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1678 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1679 if (s > reginfo.ganch)
6eb5f6b9 1680 goto phooey;
3b0527fe 1681 s = reginfo.ganch;
6eb5f6b9
JH
1682 }
1683 }
1684 else /* pos() not defined */
3b0527fe 1685 reginfo.ganch = strbeg;
6eb5f6b9
JH
1686 }
1687
a0714e2c 1688 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1689 re_scream_pos_data d;
1690
1691 d.scream_olds = &scream_olds;
1692 d.scream_pos = &scream_pos;
1693 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1694 if (!s) {
a3621e74 1695 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1696 goto phooey; /* not present */
3fa9c3d7 1697 }
6eb5f6b9
JH
1698 }
1699
a3621e74 1700 DEBUG_EXECUTE_r({
1df70142
AL
1701 const char * const s0 = UTF
1702 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1703 UNI_DISPLAY_REGEX)
1704 : prog->precomp;
1705 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1706 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1707 UNI_DISPLAY_REGEX) : startpos;
1df70142 1708 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1709 if (!PL_colorset)
1710 reginitcolors();
1711 PerlIO_printf(Perl_debug_log,
a0288114 1712 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1713 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1714 len0, len0, s0,
2a782b5b 1715 PL_colors[1],
9e55ce06 1716 len0 > 60 ? "..." : "",
2a782b5b 1717 PL_colors[0],
9e55ce06
JH
1718 (int)(len1 > 60 ? 60 : len1),
1719 s1, PL_colors[1],
1720 (len1 > 60 ? "..." : "")
2a782b5b
JH
1721 );
1722 });
6eb5f6b9
JH
1723
1724 /* Simplest case: anchored match need be tried only once. */
1725 /* [unless only anchor is BOL and multiline is set] */
1726 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1727 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1728 goto got_it;
7fba1cd6 1729 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1730 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1731 {
1732 char *end;
1733
1734 if (minlen)
1735 dontbother = minlen - 1;
1aa99e6b 1736 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1737 /* for multiline we only have to try after newlines */
33b8afdf 1738 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1739 if (s == startpos)
1740 goto after_try;
1741 while (1) {
3b0527fe 1742 if (regtry(&reginfo, s))
6eb5f6b9
JH
1743 goto got_it;
1744 after_try:
1745 if (s >= end)
1746 goto phooey;
1747 if (prog->reganch & RE_USE_INTUIT) {
1748 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1749 if (!s)
1750 goto phooey;
1751 }
1752 else
1753 s++;
1754 }
1755 } else {
1756 if (s > startpos)
1757 s--;
1758 while (s < end) {
1759 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1760 if (regtry(&reginfo, s))
6eb5f6b9
JH
1761 goto got_it;
1762 }
1763 }
1764 }
1765 }
1766 goto phooey;
1767 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1768 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1769 goto got_it;
1770 goto phooey;
1771 }
1772
1773 /* Messy cases: unanchored match. */
33b8afdf 1774 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1775 /* we have /x+whatever/ */
1776 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1777 char ch;
bf93d4cc
GS
1778#ifdef DEBUGGING
1779 int did_match = 0;
1780#endif
33b8afdf
JH
1781 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1782 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1783 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1784
1aa99e6b 1785 if (do_utf8) {
6eb5f6b9
JH
1786 while (s < strend) {
1787 if (*s == ch) {
a3621e74 1788 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1789 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1790 s += UTF8SKIP(s);
1791 while (s < strend && *s == ch)
1792 s += UTF8SKIP(s);
1793 }
1794 s += UTF8SKIP(s);
1795 }
1796 }
1797 else {
1798 while (s < strend) {
1799 if (*s == ch) {
a3621e74 1800 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1801 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1802 s++;
1803 while (s < strend && *s == ch)
1804 s++;
1805 }
1806 s++;
1807 }
1808 }
a3621e74 1809 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1810 PerlIO_printf(Perl_debug_log,
b7953727
JH
1811 "Did not find anchored character...\n")
1812 );
6eb5f6b9 1813 }
a0714e2c
SS
1814 else if (prog->anchored_substr != NULL
1815 || prog->anchored_utf8 != NULL
1816 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1817 && prog->float_max_offset < strend - s)) {
1818 SV *must;
1819 I32 back_max;
1820 I32 back_min;
1821 char *last;
6eb5f6b9 1822 char *last1; /* Last position checked before */
bf93d4cc
GS
1823#ifdef DEBUGGING
1824 int did_match = 0;
1825#endif
33b8afdf
JH
1826 if (prog->anchored_substr || prog->anchored_utf8) {
1827 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1828 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1829 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1830 back_max = back_min = prog->anchored_offset;
1831 } else {
1832 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1833 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1834 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1835 back_max = prog->float_max_offset;
1836 back_min = prog->float_min_offset;
1837 }
1838 if (must == &PL_sv_undef)
1839 /* could not downgrade utf8 check substring, so must fail */
1840 goto phooey;
1841
1842 last = HOP3c(strend, /* Cannot start after this */
1843 -(I32)(CHR_SVLEN(must)
1844 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1845
1846 if (s > PL_bostr)
1847 last1 = HOPc(s, -1);
1848 else
1849 last1 = s - 1; /* bogus */
1850
a0288114 1851 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1852 check_substr==must. */
1853 scream_pos = -1;
1854 dontbother = end_shift;
1855 strend = HOPc(strend, -dontbother);
1856 while ( (s <= last) &&
9041c2e3 1857 ((flags & REXEC_SCREAM)
1aa99e6b 1858 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1859 end_shift, &scream_pos, 0))
1aa99e6b 1860 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1861 (unsigned char*)strend, must,
7fba1cd6 1862 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1863 /* we may be pointing at the wrong string */
1864 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1865 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1866 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1867 if (HOPc(s, -back_max) > last1) {
1868 last1 = HOPc(s, -back_min);
1869 s = HOPc(s, -back_max);
1870 }
1871 else {
52657f30 1872 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1873
1874 last1 = HOPc(s, -back_min);
52657f30 1875 s = t;
6eb5f6b9 1876 }
1aa99e6b 1877 if (do_utf8) {
6eb5f6b9 1878 while (s <= last1) {
3b0527fe 1879 if (regtry(&reginfo, s))
6eb5f6b9
JH
1880 goto got_it;
1881 s += UTF8SKIP(s);
1882 }
1883 }
1884 else {
1885 while (s <= last1) {
3b0527fe 1886 if (regtry(&reginfo, s))
6eb5f6b9
JH
1887 goto got_it;
1888 s++;
1889 }
1890 }
1891 }
a3621e74 1892 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1893 PerlIO_printf(Perl_debug_log,
a0288114 1894 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1895 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1896 ? "anchored" : "floating"),
1897 PL_colors[0],
1898 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1899 SvPVX_const(must),
b7953727
JH
1900 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1901 );
6eb5f6b9
JH
1902 goto phooey;
1903 }
155aba94 1904 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1905 if (minlen) {
1906 I32 op = (U8)OP(prog->regstclass);
66e933ab 1907 /* don't bother with what can't match */
f14c76ed
RGS
1908 if (PL_regkind[op] != EXACT && op != CANY)
1909 strend = HOPc(strend, -(minlen - 1));
1910 }
a3621e74 1911 DEBUG_EXECUTE_r({
ffc61ed2 1912 SV *prop = sv_newmortal();
cfd0369c
NC
1913 const char *s0;
1914 const char *s1;
9e55ce06
JH
1915 int len0;
1916 int len1;
1917
32fc9b6a 1918 regprop(prog, prop, c);
9e55ce06 1919 s0 = UTF ?
3f7c398e 1920 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1921 UNI_DISPLAY_REGEX) :
cfd0369c 1922 SvPVX_const(prop);
9e55ce06
JH
1923 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1924 s1 = UTF ?
c728cb41 1925 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1926 len1 = UTF ? SvCUR(dsv1) : strend - s;
1927 PerlIO_printf(Perl_debug_log,
a0288114 1928 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1929 len0, len0, s0,
1930 len1, len1, s1);
ffc61ed2 1931 });
3b0527fe 1932 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1933 goto got_it;
a3621e74 1934 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1935 }
1936 else {
1937 dontbother = 0;
a0714e2c 1938 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1939 /* Trim the end. */
d6a28714 1940 char *last;
33b8afdf
JH
1941 SV* float_real;
1942
1943 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1944 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1945 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1946
1947 if (flags & REXEC_SCREAM) {
33b8afdf 1948 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1949 end_shift, &scream_pos, 1); /* last one */
1950 if (!last)
ffc61ed2 1951 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1952 /* we may be pointing at the wrong string */
1953 else if (RX_MATCH_COPIED(prog))
3f7c398e 1954 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1955 }
d6a28714
JH
1956 else {
1957 STRLEN len;
cfd0369c 1958 const char * const little = SvPV_const(float_real, len);
d6a28714 1959
33b8afdf 1960 if (SvTAIL(float_real)) {
d6a28714
JH
1961 if (memEQ(strend - len + 1, little, len - 1))
1962 last = strend - len + 1;
7fba1cd6 1963 else if (!multiline)
9041c2e3 1964 last = memEQ(strend - len, little, len)
bd61b366 1965 ? strend - len : NULL;
b8c5462f 1966 else
d6a28714
JH
1967 goto find_last;
1968 } else {
1969 find_last:
9041c2e3 1970 if (len)
d6a28714 1971 last = rninstr(s, strend, little, little + len);
b8c5462f 1972 else
a0288114 1973 last = strend; /* matching "$" */
b8c5462f 1974 }
b8c5462f 1975 }
bf93d4cc 1976 if (last == NULL) {
a3621e74 1977 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1978 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1979 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1980 goto phooey; /* Should not happen! */
1981 }
d6a28714
JH
1982 dontbother = strend - last + prog->float_min_offset;
1983 }
1984 if (minlen && (dontbother < minlen))
1985 dontbother = minlen - 1;
1986 strend -= dontbother; /* this one's always in bytes! */
1987 /* We don't know much -- general case. */
1aa99e6b 1988 if (do_utf8) {
d6a28714 1989 for (;;) {
3b0527fe 1990 if (regtry(&reginfo, s))
d6a28714
JH
1991 goto got_it;
1992 if (s >= strend)
1993 break;
b8c5462f 1994 s += UTF8SKIP(s);
d6a28714
JH
1995 };
1996 }
1997 else {
1998 do {
3b0527fe 1999 if (regtry(&reginfo, s))
d6a28714
JH
2000 goto got_it;
2001 } while (s++ < strend);
2002 }
2003 }
2004
2005 /* Failure. */
2006 goto phooey;
2007
2008got_it:
2009 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2010
2011 if (PL_reg_eval_set) {
2012 /* Preserve the current value of $^R */
2013 if (oreplsv != GvSV(PL_replgv))
2014 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2015 restored, the value remains
2016 the same. */
4f639d21 2017 restore_pos(aTHX_ prog);
d6a28714
JH
2018 }
2019
2020 /* make sure $`, $&, $', and $digit will work later */
2021 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2022 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2023 if (flags & REXEC_COPY_STR) {
2024 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2025#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2026 if ((SvIsCOW(sv)
2027 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2028 if (DEBUG_C_TEST) {
2029 PerlIO_printf(Perl_debug_log,
2030 "Copy on write: regexp capture, type %d\n",
2031 (int) SvTYPE(sv));
2032 }
2033 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2034 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2035 assert (SvPOKp(prog->saved_copy));
2036 } else
2037#endif
2038 {
2039 RX_MATCH_COPIED_on(prog);
2040 s = savepvn(strbeg, i);
2041 prog->subbeg = s;
2042 }
d6a28714 2043 prog->sublen = i;
d6a28714
JH
2044 }
2045 else {
2046 prog->subbeg = strbeg;
2047 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2048 }
2049 }
9041c2e3 2050
d6a28714
JH
2051 return 1;
2052
2053phooey:
a3621e74 2054 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2055 PL_colors[4], PL_colors[5]));
d6a28714 2056 if (PL_reg_eval_set)
4f639d21 2057 restore_pos(aTHX_ prog);
d6a28714
JH
2058 return 0;
2059}
2060
2061/*
2062 - regtry - try match at specific point
2063 */
2064STATIC I32 /* 0 failure, 1 success */
3b0527fe 2065S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2066{
97aff369 2067 dVAR;
d6a28714
JH
2068 register I32 *sp;
2069 register I32 *ep;
2070 CHECKPOINT lastcp;
3b0527fe 2071 regexp *prog = reginfo->prog;
a3621e74 2072 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2073
02db2b7b
IZ
2074#ifdef DEBUGGING
2075 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2076#endif
d6a28714
JH
2077 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2078 MAGIC *mg;
2079
2080 PL_reg_eval_set = RS_init;
a3621e74 2081 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2082 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2083 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2084 ));
e8347627 2085 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2086 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2087 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2088 SAVETMPS;
2089 /* Apparently this is not needed, judging by wantarray. */
e8347627 2090 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2091 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2092
3b0527fe 2093 if (reginfo->sv) {
d6a28714 2094 /* Make $_ available to executed code. */
3b0527fe 2095 if (reginfo->sv != DEFSV) {
59f00321 2096 SAVE_DEFSV;
3b0527fe 2097 DEFSV = reginfo->sv;
b8c5462f 2098 }
d6a28714 2099
3b0527fe
DM
2100 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2101 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2102 /* prepare for quick setting of pos */
d300d9fa
NC
2103#ifdef PERL_OLD_COPY_ON_WRITE
2104 if (SvIsCOW(sv))
2105 sv_force_normal_flags(sv, 0);
2106#endif
3b0527fe 2107 mg = sv_magicext(reginfo->sv, (SV*)0, PERL_MAGIC_regex_global,
d300d9fa 2108 &PL_vtbl_mglob, NULL, 0);
d6a28714 2109 mg->mg_len = -1;
b8c5462f 2110 }
d6a28714
JH
2111 PL_reg_magic = mg;
2112 PL_reg_oldpos = mg->mg_len;
4f639d21 2113 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2114 }
09687e5a 2115 if (!PL_reg_curpm) {
a02a5408 2116 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2117#ifdef USE_ITHREADS
2118 {
2119 SV* repointer = newSViv(0);
577e12cc 2120 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2121 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2122 av_push(PL_regex_padav,repointer);
2123 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2124 PL_regex_pad = AvARRAY(PL_regex_padav);
2125 }
2126#endif
2127 }
aaa362c4 2128 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2129 PL_reg_oldcurpm = PL_curpm;
2130 PL_curpm = PL_reg_curpm;
2131 if (RX_MATCH_COPIED(prog)) {
2132 /* Here is a serious problem: we cannot rewrite subbeg,
2133 since it may be needed if this match fails. Thus
2134 $` inside (?{}) could fail... */
2135 PL_reg_oldsaved = prog->subbeg;
2136 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2137#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2138 PL_nrs = prog->saved_copy;
2139#endif
d6a28714
JH
2140 RX_MATCH_COPIED_off(prog);
2141 }
2142 else
bd61b366 2143 PL_reg_oldsaved = NULL;
d6a28714
JH
2144 prog->subbeg = PL_bostr;
2145 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2146 }
973dddac 2147 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2148 PL_reginput = startpos;
2149 PL_regstartp = prog->startp;
2150 PL_regendp = prog->endp;
2151 PL_reglastparen = &prog->lastparen;
a01268b5 2152 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2153 prog->lastparen = 0;
03994de8 2154 prog->lastcloseparen = 0;
d6a28714 2155 PL_regsize = 0;
a3621e74 2156 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2157 if (PL_reg_start_tmpl <= prog->nparens) {
2158 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2159 if(PL_reg_start_tmp)
2160 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2161 else
a02a5408 2162 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2163 }
2164
2165 /* XXXX What this code is doing here?!!! There should be no need
2166 to do this again and again, PL_reglastparen should take care of
3dd2943c 2167 this! --ilya*/
dafc8851
JH
2168
2169 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2170 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2171 * PL_reglastparen), is not needed at all by the test suite
2172 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2173 * enough, for building DynaLoader, or otherwise this
2174 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2175 * will happen. Meanwhile, this code *is* needed for the
2176 * above-mentioned test suite tests to succeed. The common theme
2177 * on those tests seems to be returning null fields from matches.
2178 * --jhi */
dafc8851 2179#if 1
d6a28714
JH
2180 sp = prog->startp;
2181 ep = prog->endp;
2182 if (prog->nparens) {
097eb12c 2183 register I32 i;
eb160463 2184 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2185 *++sp = -1;
2186 *++ep = -1;
2187 }
2188 }
dafc8851 2189#endif
02db2b7b 2190 REGCP_SET(lastcp);
3b0527fe 2191 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2192 prog->endp[0] = PL_reginput - PL_bostr;
2193 return 1;
2194 }
02db2b7b 2195 REGCP_UNWIND(lastcp);
d6a28714
JH
2196 return 0;
2197}
2198
02db2b7b
IZ
2199#define RE_UNWIND_BRANCH 1
2200#define RE_UNWIND_BRANCHJ 2
2201
2202union re_unwind_t;
2203
2204typedef struct { /* XX: makes sense to enlarge it... */
2205 I32 type;
2206 I32 prev;
2207 CHECKPOINT lastcp;
2208} re_unwind_generic_t;
2209
2210typedef struct {
2211 I32 type;
2212 I32 prev;
2213 CHECKPOINT lastcp;
2214 I32 lastparen;
2215 regnode *next;
2216 char *locinput;
2217 I32 nextchr;
3a2830be 2218 int minmod;
02db2b7b
IZ
2219#ifdef DEBUGGING
2220 int regindent;
2221#endif
2222} re_unwind_branch_t;
2223
2224typedef union re_unwind_t {
2225 I32 type;
2226 re_unwind_generic_t generic;
2227 re_unwind_branch_t branch;
2228} re_unwind_t;
2229
8ba1375e
MJD
2230#define sayYES goto yes
2231#define sayNO goto no
e0f9d4a8 2232#define sayNO_ANYOF goto no_anyof
8ba1375e 2233#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2234#define sayNO_FINAL goto no_final
2235#define sayNO_SILENT goto do_no
2236#define saySAME(x) if (x) goto yes; else goto no
2237
3ab3c9b4
HS
2238#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2239#define POSCACHE_SEEN 1 /* we know what we're caching */
2240#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2241
3ab3c9b4 2242#define CACHEsayYES STMT_START { \
d8319b27 2243 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2244 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2245 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2246 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2247 } \
2248 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2249 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2250 } \
2251 else { \
3ab3c9b4
HS
2252 /* cache records failure, but this is success */ \
2253 DEBUG_r( \
2254 PerlIO_printf(Perl_debug_log, \
2255 "%*s (remove success from failure cache)\n", \
2256 REPORT_CODE_OFF+PL_regindent*2, "") \
2257 ); \
d8319b27 2258 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2259 } \
2260 } \
2261 sayYES; \
2262} STMT_END
7409bbd3 2263
3ab3c9b4 2264#define CACHEsayNO STMT_START { \
d8319b27 2265 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2266 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2267 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2268 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2269 } \
2270 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2271 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2272 } \
2273 else { \
3ab3c9b4
HS
2274 /* cache records success, but this is failure */ \
2275 DEBUG_r( \
2276 PerlIO_printf(Perl_debug_log, \
2277 "%*s (remove failure from success cache)\n", \
2278 REPORT_CODE_OFF+PL_regindent*2, "") \
2279 ); \
d8319b27 2280 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2281 } \
2282 } \
2283 sayNO; \
2284} STMT_END
2285
a3621e74
YO
2286/* this is used to determine how far from the left messages like
2287 'failed...' are printed. Currently 29 makes these messages line
2288 up with the opcode they refer to. Earlier perls used 25 which
2289 left these messages outdented making reviewing a debug output
2290 quite difficult.
2291*/
2292#define REPORT_CODE_OFF 29
2293
2294
2295/* Make sure there is a test for this +1 options in re_tests */
2296#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2297
9e137952
DM
2298/* this value indiciates that the c1/c2 "next char" test should be skipped */
2299#define CHRTEST_VOID -1000
2300
86545054
DM
2301#define SLAB_FIRST(s) (&(s)->states[0])
2302#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2303
5d9a96ca
DM
2304/* grab a new slab and return the first slot in it */
2305
2306STATIC regmatch_state *
2307S_push_slab(pTHX)
2308{
2309 regmatch_slab *s = PL_regmatch_slab->next;
2310 if (!s) {
2311 Newx(s, 1, regmatch_slab);
2312 s->prev = PL_regmatch_slab;
2313 s->next = NULL;
2314 PL_regmatch_slab->next = s;
2315 }
2316 PL_regmatch_slab = s;
86545054 2317 return SLAB_FIRST(s);
5d9a96ca 2318}
5b47454d 2319
95b24440
DM
2320/* simulate a recursive call to regmatch */
2321
2322#define REGMATCH(ns, where) \
5d9a96ca
DM
2323 st->scan = scan; \
2324 scan = (ns); \
2325 st->resume_state = resume_##where; \
95b24440
DM
2326 goto start_recurse; \
2327 resume_point_##where:
2328
aa283a38
DM
2329
2330/* push a new regex state. Set newst to point to it */
2331
2332#define PUSH_STATE(newst, resume) \
2333 depth++; \
2334 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2335 st->scan = scan; \
2336 st->next = next; \
2337 st->n = n; \
2338 st->locinput = locinput; \
2339 st->resume_state = resume; \
2340 newst = st+1; \
86545054 2341 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2342 newst = S_push_slab(aTHX); \
2343 PL_regmatch_state = newst; \
2344 newst->cc = 0; \
2345 newst->minmod = 0; \
2346 newst->sw = 0; \
2347 newst->logical = 0; \
2348 newst->unwind = 0; \
2349 locinput = PL_reginput; \
2350 nextchr = UCHARAT(locinput);
2351
2352#define POP_STATE \
2353 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2354 depth--; \
2355 st--; \
86545054 2356 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2357 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2358 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2359 } \
2360 PL_regmatch_state = st; \
2361 scan = st->scan; \
2362 next = st->next; \
2363 n = st->n; \
2364 locinput = st->locinput; \
2365 nextchr = UCHARAT(locinput);
2366
d6a28714
JH
2367/*
2368 - regmatch - main matching routine
2369 *
2370 * Conceptually the strategy is simple: check to see whether the current
2371 * node matches, call self recursively to see whether the rest matches,
2372 * and then act accordingly. In practice we make some effort to avoid
2373 * recursion, in particular by going through "ordinary" nodes (that don't
2374 * need to know whether the rest of the match failed) by a loop instead of
2375 * by recursion.
2376 */
2377/* [lwall] I've hoisted the register declarations to the outer block in order to
2378 * maybe save a little bit of pushing and popping on the stack. It also takes
2379 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2380 *
2381 * This function used to be heavily recursive, but since this had the
2382 * effect of blowing the CPU stack on complex regexes, it has been
2383 * restructured to be iterative, and to save state onto the heap rather
2384 * than the stack. Essentially whereever regmatch() used to be called, it
2385 * pushes the current state, notes where to return, then jumps back into
2386 * the main loop.
2387 *
2388 * Originally the structure of this function used to look something like
2389
2390 S_regmatch() {
2391 int a = 1, b = 2;
2392 ...
2393 while (scan != NULL) {
5d9a96ca 2394 a++; // do stuff with a and b
95b24440
DM
2395 ...
2396 switch (OP(scan)) {
2397 case FOO: {
2398 int local = 3;
2399 ...
2400 if (regmatch(...)) // recurse
2401 goto yes;
2402 }
2403 ...
2404 }
2405 }
2406 yes:
2407 return 1;
2408 }
2409
2410 * Now it looks something like this:
2411
5d9a96ca 2412 typedef struct {
95b24440
DM
2413 int a, b, local;
2414 int resume_state;
5d9a96ca 2415 } regmatch_state;
95b24440
DM
2416
2417 S_regmatch() {
5d9a96ca
DM
2418 regmatch_state *st = new();
2419 int depth=0;
2420 st->a++; // do stuff with a and b
95b24440
DM
2421 ...
2422 while (scan != NULL) {
2423 ...
2424 switch (OP(scan)) {
2425 case FOO: {
5d9a96ca 2426 st->local = 3;
95b24440 2427 ...
5d9a96ca
DM
2428 st->scan = scan;
2429 scan = ...;
2430 st->resume_state = resume_FOO;
2431 goto start_recurse; // recurse
95b24440 2432
5d9a96ca
DM
2433 resume_point_FOO:
2434 if (result)
95b24440
DM
2435 goto yes;
2436 }
2437 ...
2438 }
5d9a96ca
DM
2439 start_recurse:
2440 st = new(); push a new state
2441 st->a = 1; st->b = 2;
2442 depth++;
95b24440 2443 }
5d9a96ca 2444 yes:
95b24440 2445 result = 1;
5d9a96ca
DM
2446 if (depth--) {
2447 st = pop();
95b24440
DM
2448 switch (resume_state) {
2449 case resume_FOO:
2450 goto resume_point_FOO;
2451 ...
2452 }
2453 }
2454 return result
2455 }
2456
2457 * WARNING: this means that any line in this function that contains a
2458 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2459 * regmatch() using gotos instead. Thus the values of any local variables
2460 * not saved in the regmatch_state structure will have been lost when
2461 * execution resumes on the next line .
5d9a96ca
DM
2462 *
2463 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2464 * PL_regmatch_state always points to the currently active state, and
2465 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2466 * The first time regmatch is called, the first slab is allocated, and is
2467 * never freed until interpreter desctruction. When the slab is full,
2468 * a new one is allocated chained to the end. At exit from regmatch, slabs
2469 * allocated since entry are freed.
d6a28714 2470 */
95b24440
DM
2471
2472
d6a28714 2473STATIC I32 /* 0 failure, 1 success */
3b0527fe 2474S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2475{
27da23d5 2476 dVAR;
95b24440 2477 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2478 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2479
3b0527fe
DM
2480 regexp *rex = reginfo->prog;
2481
5d9a96ca
DM
2482 regmatch_slab *orig_slab;
2483 regmatch_state *orig_state;
a3621e74 2484
5d9a96ca
DM
2485 /* the current state. This is a cached copy of PL_regmatch_state */
2486 register regmatch_state *st;
95b24440 2487
5d9a96ca
DM
2488 /* cache heavy used fields of st in registers */
2489 register regnode *scan;
2490 register regnode *next;
2491 register I32 n = 0; /* initialize to shut up compiler warning */
2492 register char *locinput = PL_reginput;
95b24440 2493
5d9a96ca
DM
2494 /* these variables are NOT saved during a recusive RFEGMATCH: */
2495 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2496 bool result; /* return value of S_regmatch */
2497 regnode *inner; /* Next node in internal branch. */
2498 int depth = 0; /* depth of recursion */
aa283a38 2499 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2500 regmatch_state *yes_state = NULL; /* state to pop to on success of
2501 subpattern */
95b24440
DM
2502
2503#ifdef DEBUGGING
ab74612d 2504 SV *re_debug_flags = NULL;
a3621e74 2505 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2506 PL_regindent++;
2507#endif
2508
5d9a96ca
DM
2509 /* on first ever call to regmatch, allocate first slab */
2510 if (!PL_regmatch_slab) {
2511 Newx(PL_regmatch_slab, 1, regmatch_slab);
2512 PL_regmatch_slab->prev = NULL;
2513 PL_regmatch_slab->next = NULL;
86545054 2514 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2515 }
2516
2517 /* remember current high-water mark for exit */
2518 /* XXX this should be done with SAVE* instead */
2519 orig_slab = PL_regmatch_slab;
2520 orig_state = PL_regmatch_state;
2521
2522 /* grab next free state slot */
2523 st = ++PL_regmatch_state;
86545054 2524 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2525 st = PL_regmatch_state = S_push_slab(aTHX);
2526
2527 st->minmod = 0;
2528 st->sw = 0;
2529 st->logical = 0;
2530 st->unwind = 0;
2531 st->cc = NULL;
d6a28714
JH
2532 /* Note that nextchr is a byte even in UTF */
2533 nextchr = UCHARAT(locinput);
2534 scan = prog;
2535 while (scan != NULL) {
8ba1375e 2536
a3621e74 2537 DEBUG_EXECUTE_r( {
6136c704 2538 SV * const prop = sv_newmortal();
1df70142
AL
2539 const int docolor = *PL_colors[0];
2540 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2541 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2542 /* The part of the string before starttry has one color
2543 (pref0_len chars), between starttry and current
2544 position another one (pref_len - pref0_len chars),
2545 after the current position the third one.
2546 We assume that pref0_len <= pref_len, otherwise we
2547 decrease pref0_len. */
9041c2e3 2548 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2549 ? (5 + taill) - l : locinput - PL_bostr;
2550 int pref0_len;
d6a28714 2551
df1ffd02 2552 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2553 pref_len++;
2554 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2555 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2556 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2557 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2558 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2559 l--;
d6a28714
JH
2560 if (pref0_len < 0)
2561 pref0_len = 0;
2562 if (pref0_len > pref_len)
2563 pref0_len = pref_len;
32fc9b6a 2564 regprop(rex, prop, scan);
2a782b5b 2565 {
1df70142 2566 const char * const s0 =
f14c76ed 2567 do_utf8 && OP(scan) != CANY ?
95b24440 2568 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2569 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2570 locinput - pref_len;
1df70142
AL
2571 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2572 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2573 pv_uni_display(PERL_DEBUG_PAD(1),
2574 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2575 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2576 locinput - pref_len + pref0_len;
1df70142
AL
2577 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2578 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2579 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2580 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2581 locinput;
1df70142 2582 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2583 PerlIO_printf(Perl_debug_log,
2584 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2585 (IV)(locinput - PL_bostr),
2586 PL_colors[4],
2587 len0, s0,
2588 PL_colors[5],
2589 PL_colors[2],
2590 len1, s1,
2591 PL_colors[3],
2592 (docolor ? "" : "> <"),
2593 PL_colors[0],
2594 len2, s2,
2595 PL_colors[1],
2596 15 - l - pref_len + 1,
2597 "",
4f639d21 2598 (IV)(scan - rex->program), PL_regindent*2, "",
3f7c398e 2599 SvPVX_const(prop));
2a782b5b
JH
2600 }
2601 });
d6a28714
JH
2602
2603 next = scan + NEXT_OFF(scan);
2604 if (next == scan)
2605 next = NULL;
2606
2607 switch (OP(scan)) {
2608 case BOL:
7fba1cd6 2609 if (locinput == PL_bostr)
d6a28714 2610 {
3b0527fe 2611 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2612 break;
2613 }
d6a28714
JH
2614 sayNO;
2615 case MBOL:
12d33761
HS
2616 if (locinput == PL_bostr ||
2617 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2618 {
b8c5462f
JH
2619 break;
2620 }
d6a28714
JH
2621 sayNO;
2622 case SBOL:
c2a73568 2623 if (locinput == PL_bostr)
b8c5462f 2624 break;
d6a28714
JH
2625 sayNO;
2626 case GPOS:
3b0527fe 2627 if (locinput == reginfo->ganch)
d6a28714
JH
2628 break;
2629 sayNO;
2630 case EOL:
d6a28714
JH
2631 goto seol;
2632 case MEOL:
d6a28714 2633 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2634 sayNO;
b8c5462f 2635 break;
d6a28714
JH
2636 case SEOL:
2637 seol:
2638 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2639 sayNO;
d6a28714 2640 if (PL_regeol - locinput > 1)
b8c5462f 2641 sayNO;
b8c5462f 2642 break;
d6a28714
JH
2643 case EOS:
2644 if (PL_regeol != locinput)
b8c5462f 2645 sayNO;
d6a28714 2646 break;
ffc61ed2 2647 case SANY:
d6a28714 2648 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2649 sayNO;
f33976b4
DB
2650 if (do_utf8) {
2651 locinput += PL_utf8skip[nextchr];
2652 if (locinput > PL_regeol)
2653 sayNO;
2654 nextchr = UCHARAT(locinput);
2655 }
2656 else
2657 nextchr = UCHARAT(++locinput);
2658 break;
2659 case CANY:
2660 if (!nextchr && locinput >= PL_regeol)
2661 sayNO;
b8c5462f 2662 nextchr = UCHARAT(++locinput);
a0d0e21e 2663 break;
ffc61ed2 2664 case REG_ANY:
1aa99e6b
IH
2665 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2666 sayNO;
2667 if (do_utf8) {
b8c5462f 2668 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2669 if (locinput > PL_regeol)
2670 sayNO;
a0ed51b3 2671 nextchr = UCHARAT(locinput);
a0ed51b3 2672 }
1aa99e6b
IH
2673 else
2674 nextchr = UCHARAT(++locinput);
a0ed51b3 2675 break;
a3621e74
YO
2676
2677
2678
2679 /*
2680 traverse the TRIE keeping track of all accepting states
2681 we transition through until we get to a failing node.
2682
a3621e74
YO
2683
2684 */
5b47454d 2685 case TRIE:
a3621e74
YO
2686 case TRIEF:
2687 case TRIEFL:
2688 {
a3621e74
YO
2689 U8 *uc = ( U8* )locinput;
2690 U32 state = 1;
2691 U16 charid = 0;
2692 U32 base = 0;
2693 UV uvc = 0;
2694 STRLEN len = 0;
2695 STRLEN foldlen = 0;
a3621e74
YO
2696 U8 *uscan = (U8*)NULL;
2697 STRLEN bufflen=0;
95b24440 2698 SV *sv_accept_buff = NULL;
5b47454d
DM
2699 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2700 trie_type = do_utf8 ?
2701 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2702 : trie_plain;
2703
7087a21c
NC
2704 /* what trie are we using right now */
2705 reg_trie_data *trie
32fc9b6a 2706 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
d8319b27 2707 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2708 result = 0;
a3621e74
YO
2709
2710 while ( state && uc <= (U8*)PL_regeol ) {
2711
5b47454d 2712 if (trie->states[ state ].wordnum) {
d8319b27 2713 if (!st->u.trie.accepted ) {
5b47454d
DM
2714 ENTER;
2715 SAVETMPS;
2716 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2717 sv_accept_buff=newSV(bufflen *
2718 sizeof(reg_trie_accepted) - 1);
2719 SvCUR_set(sv_accept_buff,
2720 sizeof(reg_trie_accepted));
2721 SvPOK_on(sv_accept_buff);
2722 sv_2mortal(sv_accept_buff);
d8319b27 2723 st->u.trie.accept_buff =
5b47454d
DM
2724 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2725 }
2726 else {
d8319b27 2727 if (st->u.trie.accepted >= bufflen) {
5b47454d 2728 bufflen *= 2;
d8319b27 2729 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2730 SvGROW(sv_accept_buff,
2731 bufflen * sizeof(reg_trie_accepted));
2732 }
2733 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2734 + sizeof(reg_trie_accepted));
2735 }
d8319b27
DM
2736 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2737 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2738 ++st->u.trie.accepted;
5b47454d 2739 }
a3621e74
YO
2740
2741 base = trie->states[ state ].trans.base;
2742
2743 DEBUG_TRIE_EXECUTE_r(
2744 PerlIO_printf( Perl_debug_log,
e4584336 2745 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2746 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27 2747 (UV)state, (UV)base, (UV)st->u.trie.accepted );
a3621e74
YO
2748 );
2749
2750 if ( base ) {
5b47454d
DM
2751 switch (trie_type) {
2752 case trie_uft8_fold:
a3621e74
YO
2753 if ( foldlen>0 ) {
2754 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2755 foldlen -= len;
2756 uscan += len;
2757 len=0;
2758 } else {
1df70142 2759 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2760 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2761 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2762 foldlen -= UNISKIP( uvc );
2763 uscan = foldbuf + UNISKIP( uvc );
2764 }
5b47454d
DM
2765 break;
2766 case trie_utf8:
2767 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2768 &len, uniflags );
2769 break;
2770 case trie_plain:
e4584336 2771 uvc = (UV)*uc;
a3621e74
YO
2772 len = 1;
2773 }
2774
5b47454d
DM
2775 if (uvc < 256) {
2776 charid = trie->charmap[ uvc ];
2777 }
2778 else {
2779 charid = 0;
2780 if (trie->widecharmap) {
2781 SV** svpp = (SV**)NULL;
2782 svpp = hv_fetch(trie->widecharmap,
2783 (char*)&uvc, sizeof(UV), 0);
2784 if (svpp)
2785 charid = (U16)SvIV(*svpp);
2786 }
2787 }
a3621e74 2788
5b47454d
DM
2789 if (charid &&
2790 (base + charid > trie->uniquecharcount )
2791 && (base + charid - 1 - trie->uniquecharcount
2792 < trie->lasttrans)
2793 && trie->trans[base + charid - 1 -
2794 trie->uniquecharcount].check == state)
2795 {
2796 state = trie->trans[base + charid - 1 -
2797 trie->uniquecharcount ].next;
2798 }
2799 else {
2800 state = 0;
2801 }
2802 uc += len;
2803
2804 }
2805 else {
a3621e74
YO
2806 state = 0;
2807 }
2808 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2809 PerlIO_printf( Perl_debug_log,
2810 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2811 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2812 );
2813 }
d8319b27 2814 if (!st->u.trie.accepted )
a3621e74 2815 sayNO;
a3621e74
YO
2816
2817 /*
2818 There was at least one accepting state that we
2819 transitioned through. Presumably the number of accepting
2820 states is going to be low, typically one or two. So we
2821 simply scan through to find the one with lowest wordnum.
2822 Once we find it, we swap the last state into its place
2823 and decrement the size. We then try to match the rest of
2824 the pattern at the point where the word ends, if we
2825 succeed then we end the loop, otherwise the loop
2826 eventually terminates once all of the accepting states
2827 have been tried.
2828 */
a3621e74 2829
d8319b27 2830 if ( st->u.trie.accepted == 1 ) {
a3621e74 2831 DEBUG_EXECUTE_r({
097eb12c 2832 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
a3621e74
YO
2833 PerlIO_printf( Perl_debug_log,
2834 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2835 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2836 st->u.trie.accept_buff[ 0 ].wordnum,
cfd0369c 2837 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2838 PL_colors[5] );
2839 });
d8319b27 2840 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
2841 /* in this case we free tmps/leave before we call regmatch
2842 as we wont be using accept_buff again. */
2843 FREETMPS;
2844 LEAVE;
95b24440
DM
2845 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2846 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2847 } else {
2848 DEBUG_EXECUTE_r(
e4584336 2849 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 2850 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
2851 PL_colors[5] );
2852 );
d8319b27 2853 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
2854 U32 best = 0;
2855 U32 cur;
d8319b27 2856 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
2857 DEBUG_TRIE_EXECUTE_r(
2858 PerlIO_printf( Perl_debug_log,
2859 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2860 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
2861 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2862 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 2863 );
a3621e74 2864
d8319b27
DM
2865 if (st->u.trie.accept_buff[cur].wordnum <
2866 st->u.trie.accept_buff[best].wordnum)
e822a8b4 2867 best = cur;
a3621e74
YO
2868 }
2869 DEBUG_EXECUTE_r({
87830502 2870 reg_trie_data * const trie = (reg_trie_data*)
32fc9b6a 2871 rex->data->data[ARG(scan)];
d8319b27 2872 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2873 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2874 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2875 st->u.trie.accept_buff[best].wordnum,
cfd0369c 2876 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2877 PL_colors[5] );
2878 });
d8319b27
DM
2879 if ( best<st->u.trie.accepted ) {
2880 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2881 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2882 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2883 best = st->u.trie.accepted;
a3621e74 2884 }
d8319b27 2885 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
2886
2887 /*
2888 as far as I can tell we only need the SAVETMPS/FREETMPS
2889 for re's with EVAL in them but I'm leaving them in for
2890 all until I can be sure.
2891 */
2892 SAVETMPS;
95b24440
DM
2893 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2894 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2895 FREETMPS;
2896 }
2897 FREETMPS;
2898 LEAVE;
2899 }
2900
95b24440 2901 if (result) {
a3621e74
YO
2902 sayYES;
2903 } else {
2904 sayNO;
2905 }
2906 }
2907 /* unreached codepoint */
95b24440
DM
2908 case EXACT: {
2909 char *s = STRING(scan);
5d9a96ca 2910 st->ln = STR_LEN(scan);
eb160463 2911 if (do_utf8 != UTF) {
bc517b45 2912 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2913 char *l = locinput;
5d9a96ca 2914 const char *e = s + st->ln;
a72c7584 2915
5ff6fc6d
JH
2916 if (do_utf8) {
2917 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2918 while (s < e) {
a3b680e6 2919 STRLEN ulen;
1aa99e6b 2920 if (l >= PL_regeol)
5ff6fc6d
JH
2921 sayNO;
2922 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2923 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2924 uniflags))
5ff6fc6d 2925 sayNO;
bc517b45 2926 l += ulen;
5ff6fc6d 2927 s ++;
1aa99e6b 2928 }
5ff6fc6d
JH
2929 }
2930 else {
2931 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2932 while (s < e) {
a3b680e6 2933 STRLEN ulen;
1aa99e6b
IH
2934 if (l >= PL_regeol)
2935 sayNO;
5ff6fc6d 2936 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2937 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2938 uniflags))
1aa99e6b 2939 sayNO;
bc517b45 2940 s += ulen;
a72c7584 2941 l ++;
1aa99e6b 2942 }
5ff6fc6d 2943 }
1aa99e6b
IH
2944 locinput = l;
2945 nextchr = UCHARAT(locinput);
2946 break;
2947 }
bc517b45 2948 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2949 /* Inline the first character, for speed. */
2950 if (UCHARAT(s) != nextchr)
2951 sayNO;
5d9a96ca 2952 if (PL_regeol - locinput < st->ln)
d6a28714 2953 sayNO;
5d9a96ca 2954 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2955 sayNO;
5d9a96ca 2956 locinput += st->ln;
d6a28714
JH
2957 nextchr = UCHARAT(locinput);
2958 break;
95b24440 2959 }
d6a28714 2960 case EXACTFL:
b8c5462f
JH
2961 PL_reg_flags |= RF_tainted;
2962 /* FALL THROUGH */
95b24440
DM
2963 case EXACTF: {
2964 char *s = STRING(scan);
5d9a96ca 2965 st->ln = STR_LEN(scan);
d6a28714 2966
d07ddd77
JH
2967 if (do_utf8 || UTF) {
2968 /* Either target or the pattern are utf8. */
d6a28714 2969 char *l = locinput;
d07ddd77 2970 char *e = PL_regeol;
bc517b45 2971
5d9a96ca 2972 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2973 l, &e, 0, do_utf8)) {
5486206c
JH
2974 /* One more case for the sharp s:
2975 * pack("U0U*", 0xDF) =~ /ss/i,
2976 * the 0xC3 0x9F are the UTF-8
2977 * byte sequence for the U+00DF. */
2978 if (!(do_utf8 &&
2979 toLOWER(s[0]) == 's' &&
5d9a96ca 2980 st->ln >= 2 &&
5486206c
JH
2981 toLOWER(s[1]) == 's' &&
2982 (U8)l[0] == 0xC3 &&
2983 e - l >= 2 &&
2984 (U8)l[1] == 0x9F))
2985 sayNO;
2986 }
d07ddd77
JH
2987 locinput = e;
2988 nextchr = UCHARAT(locinput);
2989 break;
a0ed51b3 2990 }
d6a28714 2991
bc517b45
JH
2992 /* Neither the target and the pattern are utf8. */
2993
d6a28714
JH
2994 /* Inline the first character, for speed. */
2995 if (UCHARAT(s) != nextchr &&
2996 UCHARAT(s) != ((OP(scan) == EXACTF)
2997 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2998 sayNO;
5d9a96ca 2999 if (PL_regeol - locinput < st->ln)
b8c5462f 3000 sayNO;
5d9a96ca
DM
3001 if (st->ln > 1 && (OP(scan) == EXACTF
3002 ? ibcmp(s, locinput, st->ln)
3003 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3004 sayNO;
5d9a96ca 3005 locinput += st->ln;
d6a28714 3006 nextchr = UCHARAT(locinput);
a0d0e21e 3007 break;
95b24440 3008 }
d6a28714 3009 case ANYOF:
ffc61ed2 3010 if (do_utf8) {
9e55ce06
JH
3011 STRLEN inclasslen = PL_regeol - locinput;
3012
32fc9b6a 3013 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3014 sayNO_ANYOF;
ffc61ed2
JH
3015 if (locinput >= PL_regeol)
3016 sayNO;
0f0076b4 3017 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3018 nextchr = UCHARAT(locinput);
e0f9d4a8 3019 break;
ffc61ed2
JH
3020 }
3021 else {
3022 if (nextchr < 0)
3023 nextchr = UCHARAT(locinput);
32fc9b6a 3024 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3025 sayNO_ANYOF;
ffc61ed2
JH
3026 if (!nextchr && locinput >= PL_regeol)
3027 sayNO;
3028 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3029 break;
3030 }
3031 no_anyof:
3032 /* If we might have the case of the German sharp s
3033 * in a casefolding Unicode character class. */
3034
ebc501f0
JH
3035 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3036 locinput += SHARP_S_SKIP;
e0f9d4a8 3037 nextchr = UCHARAT(locinput);
ffc61ed2 3038 }
e0f9d4a8
JH
3039 else
3040 sayNO;
b8c5462f 3041 break;
d6a28714 3042 case ALNUML:
b8c5462f
JH
3043 PL_reg_flags |= RF_tainted;
3044 /* FALL THROUGH */
d6a28714 3045 case ALNUM:
b8c5462f 3046 if (!nextchr)
4633a7c4 3047 sayNO;
ffc61ed2 3048 if (do_utf8) {
1a4fad37 3049 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3050 if (!(OP(scan) == ALNUM
3568d838 3051 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3052 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3053 {
3054 sayNO;
a0ed51b3 3055 }
b8c5462f 3056 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3057 nextchr = UCHARAT(locinput);
3058 break;
3059 }
ffc61ed2 3060 if (!(OP(scan) == ALNUM
d6a28714 3061 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3062 sayNO;
b8c5462f 3063 nextchr = UCHARAT(++locinput);
a0d0e21e 3064 break;
d6a28714 3065 case NALNUML:
b8c5462f
JH
3066 PL_reg_flags |= RF_tainted;
3067 /* FALL THROUGH */
d6a28714
JH
3068 case NALNUM:
3069 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3070 sayNO;
ffc61ed2 3071 if (do_utf8) {
1a4fad37 3072 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3073 if (OP(scan) == NALNUM
3568d838 3074 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3075 : isALNUM_LC_utf8((U8*)locinput))
3076 {
b8c5462f 3077 sayNO;
d6a28714 3078 }
b8c5462f
JH
3079 locinput += PL_utf8skip[nextchr];
3080 nextchr = UCHARAT(locinput);
3081 break;
3082 }
ffc61ed2 3083 if (OP(scan) == NALNUM
d6a28714 3084 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3085 sayNO;
76e3520e 3086 nextchr = UCHARAT(++locinput);
a0d0e21e 3087 break;
d6a28714
JH
3088 case BOUNDL:
3089 case NBOUNDL:
3280af22 3090 PL_reg_flags |= RF_tainted;
bbce6d69 3091 /* FALL THROUGH */
d6a28714
JH
3092 case BOUND:
3093 case NBOUND:
3094 /* was last char in word? */
ffc61ed2 3095 if (do_utf8) {
12d33761 3096 if (locinput == PL_bostr)
5d9a96ca 3097 st->ln = '\n';
ffc61ed2 3098 else {
a3b680e6 3099 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3100
4ad0818d 3101 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3102 }
3103 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3104 st->ln = isALNUM_uni(st->ln);
1a4fad37 3105 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3106 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3107 }
3108 else {
5d9a96ca 3109 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3110 n = isALNUM_LC_utf8((U8*)locinput);
3111 }
a0ed51b3 3112 }
d6a28714 3113 else {
5d9a96ca 3114 st->ln = (locinput != PL_bostr) ?
12d33761 3115 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3116 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3117 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3118 n = isALNUM(nextchr);
3119 }
3120 else {
5d9a96ca 3121 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3122 n = isALNUM_LC(nextchr);
3123 }
d6a28714 3124 }
5d9a96ca 3125 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3126 OP(scan) == BOUNDL))
3127 sayNO;
a0ed51b3 3128 break;
d6a28714 3129 case SPACEL:
3280af22 3130 PL_reg_flags |= RF_tainted;
bbce6d69 3131 /* FALL THROUGH */
d6a28714 3132 case SPACE:
9442cb0e 3133 if (!nextchr)
4633a7c4 3134 sayNO;
1aa99e6b 3135 if (do_utf8) {
fd400ab9 3136 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3137 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3138 if (!(OP(scan) == SPACE
3568d838 3139 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3140 : isSPACE_LC_utf8((U8*)locinput)))
3141 {
3142 sayNO;
3143 }
3144 locinput += PL_utf8skip[nextchr];
3145 nextchr = UCHARAT(locinput);
3146 break;
d6a28714 3147 }
ffc61ed2
JH
3148 if (!(OP(scan) == SPACE
3149 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3150 sayNO;
3151 nextchr = UCHARAT(++locinput);
3152 }
3153 else {
3154 if (!(OP(scan) == SPACE
3155 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3156 sayNO;
3157 nextchr = UCHARAT(++locinput);
a0ed51b3 3158 }
a0ed51b3 3159 break;
d6a28714 3160 case NSPACEL:
3280af22 3161 PL_reg_flags |= RF_tainted;
bbce6d69 3162 /* FALL THROUGH */
d6a28714 3163 case NSPACE:
9442cb0e 3164 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3165 sayNO;
1aa99e6b 3166 if (do_utf8) {
1a4fad37 3167 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3168 if (OP(scan) == NSPACE
3568d838 3169 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3170 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3171 {
3172 sayNO;
3173 }
3174 locinput += PL_utf8skip[nextchr];
3175 nextchr = UCHARAT(locinput);
3176 break;
a0ed51b3 3177 }
ffc61ed2 3178 if (OP(scan) == NSPACE
d6a28714 3179 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3180 sayNO;
76e3520e 3181 nextchr = UCHARAT(++locinput);
a0d0e21e 3182 break;
d6a28714 3183 case DIGITL:
a0ed51b3
LW
3184 PL_reg_flags |= RF_tainted;
3185 /* FALL THROUGH */
d6a28714 3186 case DIGIT:
9442cb0e 3187 if (!nextchr)
a0ed51b3 3188 sayNO;
1aa99e6b 3189 if (do_utf8) {
1a4fad37 3190 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3191 if (!(OP(scan) == DIGIT
3568d838 3192 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3193 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3194 {
a0ed51b3 3195 sayNO;
dfe13c55 3196 }
6f06b55f 3197 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3198 nextchr = UCHARAT(locinput);
3199 break;
3200 }
ffc61ed2 3201 if (!(OP(scan) == DIGIT
9442cb0e 3202 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3203 sayNO;
3204 nextchr = UCHARAT(++locinput);
3205 break;
d6a28714 3206 case NDIGITL:
b8c5462f
JH
3207 PL_reg_flags |= RF_tainted;
3208 /* FALL THROUGH */
d6a28714 3209 case NDIGIT:
9442cb0e 3210 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3211 sayNO;
1aa99e6b 3212 if (do_utf8) {
1a4fad37 3213 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3214 if (OP(scan) == NDIGIT
3568d838 3215 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3216 : isDIGIT_LC_utf8((U8*)locinput))
3217 {
a0ed51b3 3218 sayNO;
9442cb0e 3219 }
6f06b55f 3220 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3221 nextchr = UCHARAT(locinput);
3222 break;
3223 }
ffc61ed2 3224 if (OP(scan) == NDIGIT
9442cb0e 3225 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3226 sayNO;
3227 nextchr = UCHARAT(++locinput);
3228 break;
3229 case CLUMP:
b7c83a7e 3230 if (locinput >= PL_regeol)
a0ed51b3 3231 sayNO;
b7c83a7e 3232 if (do_utf8) {
1a4fad37 3233 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3234 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3235 sayNO;
3236 locinput += PL_utf8skip[nextchr];
3237 while (locinput < PL_regeol &&
3238 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3239 locinput += UTF8SKIP(locinput);
3240 if (locinput > PL_regeol)
3241 sayNO;
eb08e2da
JH
3242 }
3243 else
3244 locinput++;
a0ed51b3
LW
3245 nextchr = UCHARAT(locinput);
3246 break;
c8756f30 3247 case REFFL:
3280af22 3248 PL_reg_flags |= RF_tainted;
c8756f30 3249 /* FALL THROUGH */
c277df42 3250 case REF:
95b24440
DM
3251 case REFF: {
3252 char *s;
c277df42 3253 n = ARG(scan); /* which paren pair */
5d9a96ca 3254 st->ln = PL_regstartp[n];
2c2d71f5 3255 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3256 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3257 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3258 if (st->ln == PL_regendp[n])
a0d0e21e 3259 break;
a0ed51b3 3260
5d9a96ca 3261 s = PL_bostr + st->ln;
1aa99e6b 3262 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3263 char *l = locinput;
a3b680e6 3264 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3265 /*
3266 * Note that we can't do the "other character" lookup trick as
3267 * in the 8-bit case (no pun intended) because in Unicode we
3268 * have to map both upper and title case to lower case.
3269 */
3270 if (OP(scan) == REFF) {
3271 while (s < e) {
a3b680e6
AL
3272 STRLEN ulen1, ulen2;
3273 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3274 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3275
a0ed51b3
LW
3276 if (l >= PL_regeol)
3277 sayNO;
a2a2844f
JH
3278 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3279 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3280 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3281 sayNO;
a2a2844f
JH
3282 s += ulen1;
3283 l += ulen2;
a0ed51b3
LW
3284 }
3285 }
3286 locinput = l;
3287 nextchr = UCHARAT(locinput);
3288 break;
3289 }
3290
a0d0e21e 3291 /* Inline the first character, for speed. */
76e3520e 3292 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3293 (OP(scan) == REF ||
3294 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3295 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3296 sayNO;
5d9a96ca
DM
3297 st->ln = PL_regendp[n] - st->ln;
3298 if (locinput + st->ln > PL_regeol)
4633a7c4 3299 sayNO;
5d9a96ca
DM
3300 if (st->ln > 1 && (OP(scan) == REF
3301 ? memNE(s, locinput, st->ln)
c8756f30 3302 : (OP(scan) == REFF
5d9a96ca
DM
3303 ? ibcmp(s, locinput, st->ln)
3304 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3305 sayNO;
5d9a96ca 3306 locinput += st->ln;
76e3520e 3307 nextchr = UCHARAT(locinput);
a0d0e21e 3308 break;
95b24440 3309 }
a0d0e21e
LW
3310
3311 case NOTHING:
c277df42 3312 case TAIL:
a0d0e21e
LW
3313 break;
3314 case BACK:
3315 break;
c277df42
IZ
3316 case EVAL:
3317 {
c277df42 3318 SV *ret;
8e5e9ebe 3319 {
4aabdb9b
DM
3320 /* execute the code in the {...} */
3321 dSP;
6136c704 3322 SV ** const before = SP;
4aabdb9b
DM
3323 OP_4tree * const oop = PL_op;
3324 COP * const ocurcop = PL_curcop;
3325 PAD *old_comppad;
4aabdb9b
DM
3326
3327 n = ARG(scan);
32fc9b6a 3328 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3329 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3330 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3331 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3332
8e5e9ebe
RGS
3333 CALLRUNOPS(aTHX); /* Scalar context. */
3334 SPAGAIN;
3335 if (SP == before)
075aa684 3336 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3337 else {
3338 ret = POPs;
3339 PUTBACK;
3340 }
4aabdb9b
DM
3341
3342 PL_op = oop;
3343 PAD_RESTORE_LOCAL(old_comppad);
3344 PL_curcop = ocurcop;
3345 if (!st->logical) {
3346 /* /(?{...})/ */
3347 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3348 break;
3349 }
8e5e9ebe 3350 }
4aabdb9b
DM
3351 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3352 regexp *re;
4aabdb9b 3353 {
4f639d21
DM
3354 /* extract RE object from returned value; compiling if
3355 * necessary */
3356
6136c704 3357 MAGIC *mg = NULL;
4aabdb9b 3358 SV *sv;
faf82a0b
AE
3359 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3360 mg = mg_find(sv, PERL_MAGIC_qr);
3361 else if (SvSMAGICAL(ret)) {
3362 if (SvGMAGICAL(ret))
3363 sv_unmagic(ret, PERL_MAGIC_qr);
3364 else
3365 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3366 }
faf82a0b 3367
0f5d15d6
IZ
3368 if (mg) {
3369 re = (regexp *)mg->mg_obj;
df0003d4 3370 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3371 }
3372 else {
3373 STRLEN len;
6136c704 3374 const char * const t = SvPV_const(ret, len);
0f5d15d6 3375 PMOP pm;
a3b680e6 3376 const I32 osize = PL_regsize;
0f5d15d6 3377
5fcd1c1b 3378 Zero(&pm, 1, PMOP);
4aabdb9b 3379 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3380 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3381 if (!(SvFLAGS(ret)
faf82a0b
AE
3382 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3383 | SVs_GMG)))
14befaf4
DM
3384 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3385 PERL_MAGIC_qr,0,0);
0f5d15d6 3386 PL_regsize = osize;
0f5d15d6 3387 }
4aabdb9b 3388 }
aa283a38
DM
3389
3390 /* run the pattern returned from (??{...}) */
3391
4aabdb9b
DM
3392 DEBUG_EXECUTE_r(
3393 PerlIO_printf(Perl_debug_log,
3394 "Entering embedded \"%s%.60s%s%s\"\n",
3395 PL_colors[0],
3396 re->precomp,
3397 PL_colors[1],
3398 (strlen(re->precomp) > 60 ? "..." : ""))
3399 );
2c2d71f5 3400
4aabdb9b
DM
3401 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3402 REGCP_SET(st->u.eval.lastcp);
4aabdb9b
DM
3403 *PL_reglastparen = 0;
3404 *PL_reglastcloseparen = 0;
4aabdb9b 3405 PL_reginput = locinput;
4aabdb9b
DM
3406
3407 /* XXXX This is too dramatic a measure... */
3408 PL_reg_maxiter = 0;
3409
5d9a96ca 3410 st->logical = 0;
aa283a38
DM
3411 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3412 ((re->reganch & ROPT_UTF8) != 0);
3413 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3414 st->u.eval.prev_rex = rex;
aa283a38 3415 rex = re;
aa283a38 3416
77cb431f
DM
3417 /* resume to current state on success */
3418 st->u.yes.prev_yes_state = yes_state;
3419 yes_state = st;
aa283a38
DM
3420 PUSH_STATE(newst, resume_EVAL);
3421 st = newst;
3422
3423 /* now continue from first node in postoned RE */
3424 next = re->program + 1;
3425 break;
4aabdb9b 3426 /* NOTREACHED */
a0ed51b3 3427 }
4aabdb9b
DM
3428 /* /(?(?{...})X|Y)/ */
3429 st->sw = SvTRUE(ret);
3430 st->logical = 0;
c277df42
IZ
3431 break;
3432 }
a0d0e21e 3433 case OPEN:
c277df42 3434 n = ARG(scan); /* which paren pair */
3280af22
NIS
3435 PL_reg_start_tmp[n] = locinput;
3436 if (n > PL_regsize)
3437 PL_regsize = n;
a0d0e21e
LW
3438 break;
3439 case CLOSE:
c277df42 3440 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3441 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3442 PL_regendp[n] = locinput - PL_bostr;
eb160463 3443 if (n > (I32)*PL_reglastparen)
3280af22 3444 *PL_reglastparen = n;
a01268b5 3445 *PL_reglastcloseparen = n;
a0d0e21e 3446 break;
c277df42
IZ
3447 case GROUPP:
3448 n = ARG(scan); /* which paren pair */
5d9a96ca 3449 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3450 break;
3451 case IFTHEN:
2c2d71f5 3452 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3453 if (st->sw)
c277df42
IZ
3454 next = NEXTOPER(NEXTOPER(scan));
3455 else {
3456 next = scan + ARG(scan);
3457 if (OP(next) == IFTHEN) /* Fake one. */
3458 next = NEXTOPER(NEXTOPER(next));
3459 }
3460 break;
3461 case LOGICAL:
5d9a96ca 3462 st->logical = scan->flags;
c277df42 3463 break;
2ab05381 3464/*******************************************************************
a0374537
DM
3465 cc points to the regmatch_state associated with the most recent CURLYX.
3466 This struct contains info about the innermost (...)* loop (an
3467 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3468
3469 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3470
95b24440 3471 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3472
a0374537 3473 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3474 with the starting point at WHILEM node;
2ab05381
IZ
3475
3476 3) Each hit of WHILEM node tries to match A and Z (in the order
3477 depending on the current iteration, min/max of {min,max} and
3478 greediness). The information about where are nodes for "A"
a0374537 3479 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3480 was already matched, and greediness.
3481
3482 4) After A matches, the same WHILEM node is hit again.
3483
95b24440 3484 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3485 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3486 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3487 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3488 of the external loop.
3489
a0374537 3490 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3491 and whatever it mentions via ->next, and additional attached trees
3492 corresponding to temporarily unset infoblocks as in "5" above.
3493
95b24440 3494 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3495 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3496 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3497 infoblocks are drawn below the "reset" infoblock.
3498
3499 In fact in the picture below we do not show failed matches for Z and T
3500 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3501 more obvious *why* one needs to *temporary* unset infoblocks.]
3502
3503 Matched REx position InfoBlocks Comment
3504 (Y(A)*?Z)*?T x
3505 Y(A)*?Z)*?T x <- O
3506 Y (A)*?Z)*?T x <- O
3507 Y A)*?Z)*?T x <- O <- I
3508 YA )*?Z)*?T x <- O <- I
3509 YA A)*?Z)*?T x <- O <- I
3510 YAA )*?Z)*?T x <- O <- I
3511 YAA Z)*?T x <- O # Temporary unset I
3512 I
3513
3514 YAAZ Y(A)*?Z)*?T x <- O
3515 I
3516
3517 YAAZY (A)*?Z)*?T x <- O
3518 I
3519
3520 YAAZY A)*?Z)*?T x <- O <- I
3521 I
3522
3523 YAAZYA )*?Z)*?T x <- O <- I
3524 I
3525
3526 YAAZYA Z)*?T x <- O # Temporary unset I
3527 I,I
3528
3529 YAAZYAZ )*?T x <- O
3530 I,I
3531
3532 YAAZYAZ T x # Temporary unset O
3533 O
3534 I,I
3535
3536 YAAZYAZT x
3537 O
3538 I,I
3539 *******************************************************************/
95b24440 3540
a0d0e21e 3541 case CURLYX: {
cb434fcc
IZ
3542 /* No need to save/restore up to this paren */
3543 I32 parenfloor = scan->flags;
c277df42 3544
c2b7afd3
NC
3545 /* Dave says:
3546
3547 CURLYX and WHILEM are always paired: they're the moral
3548 equivalent of pp_enteriter anbd pp_iter.
3549
3550 The only time next could be null is if the node tree is
3551 corrupt. This was mentioned on p5p a few days ago.
3552
3553 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3554 So we'll assert that this is true:
3555 */
3556 assert(next);
30b2893d 3557 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3558 next += ARG(next);
cb434fcc
IZ
3559 /* XXXX Probably it is better to teach regpush to support
3560 parenfloor > PL_regsize... */
eb160463 3561 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3562 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3563
d8319b27
DM
3564 st->u.curlyx.cp = PL_savestack_ix;
3565 st->u.curlyx.outercc = st->cc;
a0374537
DM
3566 st->cc = st;
3567 /* these fields contain the state of the current curly.
3568 * they are accessed by subsequent WHILEMs;
3569 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3570 st->u.curlyx.parenfloor = parenfloor;
3571 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3572 st->u.curlyx.min = ARG1(scan);
3573 st->u.curlyx.max = ARG2(scan);
3574 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3575 st->u.curlyx.lastloc = 0;
a0374537
DM
3576 /* st->next and st->minmod are also read by WHILEM */
3577
3280af22 3578 PL_reginput = locinput;
95b24440
DM
3579 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3580 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3581 regcpblow(st->u.curlyx.cp);
3582 st->cc = st->u.curlyx.outercc;
95b24440 3583 saySAME(result);
a0d0e21e 3584 }
5f66b61c 3585 /* NOTREACHED */
a0d0e21e
LW
3586 case WHILEM: {
3587 /*
3588 * This is really hard to understand, because after we match
3589 * what we're trying to match, we must make sure the rest of
2c2d71f5 3590 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3591 * to go back UP the parse tree by recursing ever deeper. And
3592 * if it fails, we have to reset our parent's current state
3593 * that we can try again after backing off.
3594 */
3595
c2b7afd3
NC
3596 /* Dave says:
3597
3598 st->cc gets initialised by CURLYX ready for use by WHILEM.
3599 So again, unless somethings been corrupted, st->cc cannot
3600 be null at that point in WHILEM.
3601
3602 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3603 So we'll assert that this is true:
3604 */
3605 assert(st->cc);
d8319b27
DM
3606 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3607 st->u.whilem.cache_offset = 0;
3608 st->u.whilem.cache_bit = 0;
c277df42 3609
d8319b27 3610 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3611 PL_reginput = locinput;
a0d0e21e 3612
a3621e74 3613 DEBUG_EXECUTE_r(
9041c2e3 3614 PerlIO_printf(Perl_debug_log,
91f3b821 3615 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3616 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3617 (long)n, (long)st->cc->u.curlyx.min,
3618 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3619 );
4633a7c4 3620
a0d0e21e
LW
3621 /* If degenerate scan matches "", assume scan done. */
3622
d8319b27
DM
3623 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3624 st->u.whilem.savecc = st->cc;
3625 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3626 if (st->cc)
d8319b27 3627 st->ln = st->cc->u.curlyx.cur;
a3621e74 3628 DEBUG_EXECUTE_r(
c3464db5
DD
3629 PerlIO_printf(Perl_debug_log,
3630 "%*s empty match detected, try continuation...\n",
3280af22 3631 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3632 );
d8319b27 3633 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3634 /*** all unsaved local vars undefined at this point */
d8319b27 3635 st->cc = st->u.whilem.savecc;
95b24440 3636 if (result)
4633a7c4 3637 sayYES;
d8319b27
DM
3638 if (st->cc->u.curlyx.outercc)
3639 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3640 sayNO;
a0d0e21e
LW
3641 }
3642
3643 /* First just match a string of min scans. */
3644
d8319b27
DM
3645 if (n < st->cc->u.curlyx.min) {
3646 st->cc->u.curlyx.cur = n;
3647 st->cc->u.curlyx.lastloc = locinput;
3648 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3649 /*** all unsaved local vars undefined at this point */
3650 if (result)
4633a7c4 3651 sayYES;
d8319b27
DM
3652 st->cc->u.curlyx.cur = n - 1;
3653 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3654 sayNO;
a0d0e21e
LW
3655 }
3656
2c2d71f5
JH
3657 if (scan->flags) {
3658 /* Check whether we already were at this position.
3659 Postpone detection until we know the match is not
3660 *that* much linear. */
3661 if (!PL_reg_maxiter) {
3662 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3663 PL_reg_leftiter = PL_reg_maxiter;
3664 }
3665 if (PL_reg_leftiter-- == 0) {
a3b680e6 3666 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3667 if (PL_reg_poscache) {
eb160463 3668 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3669 Renew(PL_reg_poscache, size, char);
3670 PL_reg_poscache_size = size;
3671 }
3672 Zero(PL_reg_poscache, size, char);
3673 }
3674 else {
3675 PL_reg_poscache_size = size;
a02a5408 3676 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3677 }
a3621e74 3678 DEBUG_EXECUTE_r(
2c2d71f5
JH
3679 PerlIO_printf(Perl_debug_log,
3680 "%sDetected a super-linear match, switching on caching%s...\n",
3681 PL_colors[4], PL_colors[5])
3682 );
3683 }
3684 if (PL_reg_leftiter < 0) {
d8319b27 3685 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3686
d8319b27
DM
3687 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3688 + st->u.whilem.cache_offset * (scan->flags>>4);
3689 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3690 st->u.whilem.cache_offset /= 8;
3691 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3692 DEBUG_EXECUTE_r(
2c2d71f5
JH
3693 PerlIO_printf(Perl_debug_log,
3694 "%*s already tried at this position...\n",
3695 REPORT_CODE_OFF+PL_regindent*2, "")
3696 );
3ab3c9b4
HS
3697 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3698 /* cache records success */
c2b0868c
HS
3699 sayYES;
3700 else
3ab3c9b4 3701 /* cache records failure */
c2b0868c 3702 sayNO_SILENT;
2c2d71f5 3703 }
2c2d71f5
JH
3704 }
3705 }
3706
a0d0e21e
LW
3707 /* Prefer next over scan for minimal matching. */
3708
5d9a96ca 3709 if (st->cc->minmod) {
d8319b27
DM
3710 st->u.whilem.savecc = st->cc;
3711 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3712 if (st->cc)
d8319b27
DM
3713 st->ln = st->cc->u.curlyx.cur;
3714 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3715 REGCP_SET(st->u.whilem.lastcp);
3716 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3717 /*** all unsaved local vars undefined at this point */
d8319b27 3718 st->cc = st->u.whilem.savecc;
95b24440 3719 if (result) {
d8319b27 3720 regcpblow(st->u.whilem.cp);
3ab3c9b4 3721 CACHEsayYES; /* All done. */
5f05dabc 3722 }
d8319b27 3723 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3724 regcppop(rex);
d8319b27
DM
3725 if (st->cc->u.curlyx.outercc)
3726 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3727
d8319b27 3728 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3729 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3730 && !(PL_reg_flags & RF_warned)) {
3731 PL_reg_flags |= RF_warned;
9014280d 3732 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3733 "Complex regular subexpression recursion",
3734 REG_INFTY - 1);
c277df42 3735 }
3ab3c9b4 3736 CACHEsayNO;
c277df42 3737 }
a687059c 3738
a3621e74 3739 DEBUG_EXECUTE_r(
c3464db5
DD
3740 PerlIO_printf(Perl_debug_log,
3741 "%*s trying longer...\n",
3280af22 3742 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3743 );
a0d0e21e 3744 /* Try scanning more and see if it helps. */
3280af22 3745 PL_reginput = locinput;
d8319b27
DM
3746 st->cc->u.curlyx.cur = n;
3747 st->cc->u.curlyx.lastloc = locinput;
3748 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3749 REGCP_SET(st->u.whilem.lastcp);
3750 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3751 /*** all unsaved local vars undefined at this point */
3752 if (result) {
d8319b27 3753 regcpblow(st->u.whilem.cp);
3ab3c9b4 3754 CACHEsayYES;
5f05dabc 3755 }
d8319b27 3756 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3757 regcppop(rex);
d8319b27
DM
3758 st->cc->u.curlyx.cur = n - 1;
3759 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3760 CACHEsayNO;
a0d0e21e
LW
3761 }
3762
3763 /* Prefer scan over next for maximal matching. */
3764
d8319b27
DM
3765 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3766 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3767 st->cc->u.curlyx.cur = n;
3768 st->cc->u.curlyx.lastloc = locinput;
3769 REGCP_SET(st->u.whilem.lastcp);
3770 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3771 /*** all unsaved local vars undefined at this point */
3772 if (result) {
d8319b27 3773 regcpblow(st->u.whilem.cp);
3ab3c9b4 3774 CACHEsayYES;
5f05dabc 3775 }
d8319b27 3776 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3777 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3778 PL_reginput = locinput;
a3621e74 3779 DEBUG_EXECUTE_r(
c3464db5
DD
3780 PerlIO_printf(Perl_debug_log,
3781 "%*s failed, try continuation...\n",
3280af22 3782 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3783 );
3784 }
9041c2e3 3785 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3786 && !(PL_reg_flags & RF_warned)) {
3280af22 3787 PL_reg_flags |= RF_warned;
9014280d 3788 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3789 "Complex regular subexpression recursion",
3790 REG_INFTY - 1);
a0d0e21e
LW
3791 }
3792
3793 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
3794 st->u.whilem.savecc = st->cc;
3795 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3796 if (st->cc)
d8319b27
DM
3797 st->ln = st->cc->u.curlyx.cur;
3798 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3799 /*** all unsaved local vars undefined at this point */
d8319b27 3800 st->cc = st->u.whilem.savecc;
95b24440 3801 if (result)
3ab3c9b4 3802 CACHEsayYES;
d8319b27
DM
3803 if (st->cc->u.curlyx.outercc)
3804 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3805 st->cc->u.curlyx.cur = n - 1;
3806 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3807 CACHEsayNO;
a0d0e21e 3808 }
5f66b61c 3809 /* NOTREACHED */
9041c2e3 3810 case BRANCHJ:
c277df42
IZ
3811 next = scan + ARG(scan);
3812 if (next == scan)
3813 next = NULL;
3814 inner = NEXTOPER(NEXTOPER(scan));
3815 goto do_branch;
9041c2e3 3816 case BRANCH:
c277df42
IZ
3817 inner = NEXTOPER(scan);
3818 do_branch:
3819 {
e822a8b4
DM
3820 I32 type;
3821 type = OP(scan);
ae5031b3 3822 if (!next || OP(next) != type) /* No choice. */
c277df42 3823 next = inner; /* Avoid recursion. */
a0d0e21e 3824 else {
a3b680e6 3825 const I32 lastparen = *PL_reglastparen;
02db2b7b 3826 /* Put unwinding data on stack */
6136c704
AL
3827 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3828 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3829
5d9a96ca
DM
3830 uw->prev = st->unwind;
3831 st->unwind = unwind1;
e822a8b4 3832 uw->type = ((type == BRANCH)
02db2b7b
IZ
3833 ? RE_UNWIND_BRANCH
3834 : RE_UNWIND_BRANCHJ);
3835 uw->lastparen = lastparen;
3836 uw->next = next;
3837 uw->locinput = locinput;
3838 uw->nextchr = nextchr;
3a2830be 3839 uw->minmod = st->minmod;
02db2b7b
IZ
3840#ifdef DEBUGGING
3841 uw->regindent = ++PL_regindent;
3842#endif
c277df42 3843
02db2b7b
IZ
3844 REGCP_SET(uw->lastcp);
3845
3846 /* Now go into the first branch */
3847 next = inner;
a687059c 3848 }
a0d0e21e
LW
3849 }
3850 break;
3851 case MINMOD:
5d9a96ca 3852 st->minmod = 1;
a0d0e21e 3853 break;
c277df42
IZ
3854 case CURLYM:
3855 {
d8319b27 3856 st->u.curlym.l = st->u.curlym.matches = 0;
9041c2e3 3857
c277df42 3858 /* We suppose that the next guy does not need
0e788c72 3859 backtracking: in particular, it is of constant non-zero length,
c277df42 3860 and has no parenths to influence future backrefs. */
5d9a96ca 3861 st->ln = ARG1(scan); /* min to match */
c277df42 3862 n = ARG2(scan); /* max to match */
d8319b27
DM
3863 st->u.curlym.paren = scan->flags;
3864 if (st->u.curlym.paren) {
3865 if (st->u.curlym.paren > PL_regsize)
3866 PL_regsize = st->u.curlym.paren;
3867 if (st->u.curlym.paren > (I32)*PL_reglastparen)
3868 *PL_reglastparen = st->u.curlym.paren;
c277df42 3869 }
dc45a647 3870 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
d8319b27 3871 if (st->u.curlym.paren)
c277df42 3872 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3873 PL_reginput = locinput;
d8319b27 3874 st->u.curlym.maxwanted = st->minmod ? st->ln : n;
0cadcf80
DM
3875 while (PL_reginput < PL_regeol && st->u.curlym.matches < st->u.curlym.maxwanted) {
3876 /* resume to current state on success */
3877 st->u.yes.prev_yes_state = yes_state;
3878 yes_state = st;
3879 REGMATCH(scan, CURLYM1);
3880 yes_state = st->u.yes.prev_yes_state;
3881 /*** all unsaved local vars undefined at this point */
3882 if (!result)
3883 break;
3884 /* on first match, determine length, u.curlym.l */
3885 if (!st->u.curlym.matches++) {
3886 if (PL_reg_match_utf8) {
3887 char *s = locinput;
3888 while (s < PL_reginput) {
3889 st->u.curlym.l++;
3890 s += UTF8SKIP(s);
6407bf3b
DM
3891 }
3892 }
0cadcf80
DM
3893 else {
3894 st->u.curlym.l = PL_reginput - locinput;
3895 }
3896 if (st->u.curlym.l == 0) {
3897 st->u.curlym.matches = st->u.curlym.maxwanted;
3898 break;
3899 }
6407bf3b 3900 }
0cadcf80 3901 locinput = PL_reginput;
6407bf3b
DM
3902 }
3903
3904 PL_reginput = locinput;
0cadcf80 3905 if (st->u.curlym.matches < st->ln) {
5d9a96ca 3906 st->minmod = 0;
0cadcf80
DM
3907 sayNO;
3908 }
5f80c4cf 3909
0cadcf80
DM
3910 DEBUG_EXECUTE_r(
3911 PerlIO_printf(Perl_debug_log,
3912 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3913 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3914 (IV) st->u.curlym.matches, (IV)st->u.curlym.l)
3915 );
3916
3917 /* calculate c1 and c1 for possible match of 1st char
3918 * following curly */
9e137952 3919 st->u.curlym.c1 = st->u.curlym.c2 = CHRTEST_VOID;
0cadcf80
DM
3920 if (HAS_TEXT(next) || JUMPABLE(next)) {
3921 regnode *text_node = next;
3922 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3923 if (HAS_TEXT(text_node)
3924 && PL_regkind[(U8)OP(text_node)] != REF)
3925 {
3926 st->u.curlym.c1 = (U8)*STRING(text_node);
3927 st->u.curlym.c2 =
3928 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3929 ? PL_fold[st->u.curlym.c1]
3930 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3931 ? PL_fold_locale[st->u.curlym.c1]
3932 : st->u.curlym.c1;
3933 }
3934 }
5f80c4cf 3935
0cadcf80
DM
3936 REGCP_SET(st->u.curlym.lastcp);
3937
3938 st->u.curlym.minmod = st->minmod;
3939 st->minmod = 0;
3940 while (st->u.curlym.matches >= st->ln
3941 && (st->u.curlym.matches <= n
3942 /* for REG_INFTY, ln could overflow to negative */
3943 || (n == REG_INFTY && st->u.curlym.matches >= 0)))
3944 {
3945 /* If it could work, try it. */
9e137952 3946 if (st->u.curlym.c1 == CHRTEST_VOID ||
0cadcf80
DM
3947 UCHARAT(PL_reginput) == st->u.curlym.c1 ||
3948 UCHARAT(PL_reginput) == st->u.curlym.c2)
3949 {
3950 DEBUG_EXECUTE_r(
3951 PerlIO_printf(Perl_debug_log,
3952 "%*s trying tail with matches=%"IVdf"...\n",
3953 (int)(REPORT_CODE_OFF+PL_regindent*2),
3954 "", (IV)st->u.curlym.matches)
3955 );
3956 if (st->u.curlym.paren) {
3957 if (st->u.curlym.matches) {
3958 PL_regstartp[st->u.curlym.paren]
3959 = HOPc(PL_reginput, -st->u.curlym.l) - PL_bostr;
3960 PL_regendp[st->u.curlym.paren] = PL_reginput - PL_bostr;
cca55fe3 3961 }
5f80c4cf 3962 else
0cadcf80 3963 PL_regendp[st->u.curlym.paren] = -1;
5f80c4cf 3964 }
0cadcf80
DM
3965 /* resume to current state on success */
3966 st->u.yes.prev_yes_state = yes_state;
3967 yes_state = st;
3968 REGMATCH(next, CURLYM2);
3969 yes_state = st->u.yes.prev_yes_state;
3970 /*** all unsaved local vars undefined at this point */
3971 if (result)
3972 /* XXX tmp sayYES; */
3973 sayYES_FINAL;
3974 REGCP_UNWIND(st->u.curlym.lastcp);
a0ed51b3 3975 }
0cadcf80
DM
3976 /* Couldn't or didn't -- move forward/backward. */
3977 if (st->u.curlym.minmod) {
3280af22 3978 PL_reginput = locinput;
dad79028
DM
3979 /* resume to current state on success */
3980 st->u.yes.prev_yes_state = yes_state;
3981 yes_state = st;
95b24440 3982 REGMATCH(scan, CURLYM3);
dad79028 3983 yes_state = st->u.yes.prev_yes_state;
95b24440
DM
3984 /*** all unsaved local vars undefined at this point */
3985 if (result) {
0cadcf80 3986 st->u.curlym.matches++;
3280af22 3987 locinput = PL_reginput;
c277df42
IZ
3988 }
3989 else
3990 sayNO;
3991 }
0cadcf80 3992 else {
d8319b27
DM
3993 st->u.curlym.matches--;
3994 locinput = HOPc(locinput, -st->u.curlym.l);
3280af22 3995 PL_reginput = locinput;
c277df42
IZ
3996 }
3997 }
3998 sayNO;
5f66b61c 3999 /* NOTREACHED */
c277df42
IZ
4000 break;
4001 }
4002 case CURLYN:
d8319b27
DM
4003 st->u.plus.paren = scan->flags; /* Which paren to set */
4004 if (st->u.plus.paren > PL_regsize)
4005 PL_regsize = st->u.plus.paren;
4006 if (st->u.plus.paren > (I32)*PL_reglastparen)
4007 *PL_reglastparen = st->u.plus.paren;
5d9a96ca 4008 st->ln = ARG1(scan); /* min to match */
c277df42 4009 n = ARG2(scan); /* max to match */
dc45a647 4010 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 4011 goto repeat;
a0d0e21e 4012 case CURLY:
d8319b27 4013 st->u.plus.paren = 0;
5d9a96ca 4014 st->ln = ARG1(scan); /* min to match */
a0d0e21e 4015 n = ARG2(scan); /* max to match */
dc45a647 4016 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
4017 goto repeat;
4018 case STAR:
5d9a96ca 4019 st->ln = 0;
c277df42 4020 n = REG_INFTY;
a0d0e21e 4021 scan = NEXTOPER(scan);
d8319b27 4022 st->u.plus.paren = 0;
a0d0e21e
LW
4023 goto repeat;
4024 case PLUS:
5d9a96ca 4025 st->ln = 1;
c277df42
IZ
4026 n = REG_INFTY;
4027 scan = NEXTOPER(scan);
d8319b27 4028 st->u.plus.paren = 0;
c277df42 4029 repeat:
a0d0e21e
LW
4030 /*
4031 * Lookahead to avoid useless match attempts
4032 * when we know what character comes next.
4033 */
5f80c4cf
JP
4034
4035 /*
4036 * Used to only do .*x and .*?x, but now it allows
4037 * for )'s, ('s and (?{ ... })'s to be in the way
4038 * of the quantifier and the EXACT-like node. -- japhy
4039 */
4040
cca55fe3 4041 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4042 U8 *s;
4043 regnode *text_node = next;
4044
cca55fe3 4045 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 4046
9e137952
DM
4047 if (! HAS_TEXT(text_node))
4048 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
5f80c4cf 4049 else {
cca55fe3 4050 if (PL_regkind[(U8)OP(text_node)] == REF) {
9e137952 4051 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
44a68960 4052 goto assume_ok_easy;
cca55fe3
JP
4053 }
4054 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
4055
4056 if (!UTF) {
d8319b27 4057 st->u.plus.c2 = st->u.plus.c1 = *s;
f65d3ee7 4058 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 4059 st->u.plus.c2 = PL_fold[st->u.plus.c1];
f65d3ee7 4060 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 4061 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
1aa99e6b 4062 }
5f80c4cf 4063 else { /* UTF */
f65d3ee7 4064 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4065 STRLEN ulen1, ulen2;
89ebb4a3
JH
4066 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4067 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4068
4069 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4070 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4071
d8319b27 4072 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4073 uniflags);
d8319b27 4074 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4075 uniflags);
5f80c4cf
JP
4076 }
4077 else {
d8319b27 4078 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4079 uniflags);
5f80c4cf 4080 }
1aa99e6b
IH
4081 }
4082 }
bbce6d69 4083 }
a0d0e21e 4084 else
9e137952 4085 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
cca55fe3 4086 assume_ok_easy:
3280af22 4087 PL_reginput = locinput;
5d9a96ca
DM
4088 if (st->minmod) {
4089 st->minmod = 0;
32fc9b6a 4090 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4633a7c4 4091 sayNO;
a0ed51b3 4092 locinput = PL_reginput;
d8319b27 4093 REGCP_SET(st->u.plus.lastcp);
9e137952 4094 if (st->u.plus.c1 != CHRTEST_VOID) {
d8319b27
DM
4095 st->u.plus.old = locinput;
4096 st->u.plus.count = 0;
0fe9bf95 4097
1aa99e6b 4098 if (n == REG_INFTY) {
d8319b27 4099 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4100 if (do_utf8)
d8319b27
DM
4101 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4102 st->u.plus.e--;
1aa99e6b
IH
4103 }
4104 else if (do_utf8) {
5d9a96ca 4105 int m = n - st->ln;
d8319b27
DM
4106 for (st->u.plus.e = locinput;
4107 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4108 st->u.plus.e += UTF8SKIP(st->u.plus.e);
1aa99e6b
IH
4109 }
4110 else {
d8319b27
DM
4111 st->u.plus.e = locinput + n - st->ln;
4112 if (st->u.plus.e >= PL_regeol)
4113 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4114 }
0fe9bf95
IZ
4115 while (1) {
4116 /* Find place 'next' could work */
1aa99e6b 4117 if (!do_utf8) {
d8319b27
DM
4118 if (st->u.plus.c1 == st->u.plus.c2) {
4119 while (locinput <= st->u.plus.e &&
4120 UCHARAT(locinput) != st->u.plus.c1)
1aa99e6b
IH
4121 locinput++;
4122 } else {
d8319b27
DM
4123 while (locinput <= st->u.plus.e
4124 && UCHARAT(locinput) != st->u.plus.c1
4125 && UCHARAT(locinput) != st->u.plus.c2)
1aa99e6b
IH
4126 locinput++;
4127 }
d8319b27 4128 st->u.plus.count = locinput - st->u.plus.old;
1aa99e6b
IH
4129 }
4130 else {
d8319b27 4131 if (st->u.plus.c1 == st->u.plus.c2) {
a3b680e6 4132 STRLEN len;
872c91ae
JH
4133 /* count initialised to
4134 * utf8_distance(old, locinput) */
d8319b27 4135 while (locinput <= st->u.plus.e &&
872c91ae 4136 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4137 UTF8_MAXBYTES, &len,
d8319b27 4138 uniflags) != (UV)st->u.plus.c1) {
1aa99e6b 4139 locinput += len;
d8319b27 4140 st->u.plus.count++;
b2f2f093 4141 }
1aa99e6b 4142 } else {
872c91ae
JH
4143 /* count initialised to
4144 * utf8_distance(old, locinput) */
d8319b27 4145 while (locinput <= st->u.plus.e) {
c4fd8992
AL
4146 STRLEN len;
4147 const UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4148 UTF8_MAXBYTES, &len,
041457d9 4149 uniflags);
d8319b27 4150 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
1aa99e6b 4151 break;
b2f2f093 4152 locinput += len;
d8319b27 4153 st->u.plus.count++;
1aa99e6b
IH
4154 }
4155 }
0fe9bf95 4156 }
d8319b27 4157 if (locinput > st->u.plus.e)
0fe9bf95
IZ
4158 sayNO;
4159 /* PL_reginput == old now */
d8319b27 4160 if (locinput != st->u.plus.old) {
5d9a96ca 4161 st->ln = 1; /* Did some */
32fc9b6a 4162 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
0fe9bf95
IZ
4163 sayNO;
4164 }
4165 /* PL_reginput == locinput now */
d8319b27 4166 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
95b24440 4167 /*** all unsaved local vars undefined at this point */
0fe9bf95 4168 PL_reginput = locinput; /* Could be reset... */
d8319b27 4169 REGCP_UNWIND(st->u.plus.lastcp);
0fe9bf95 4170 /* Couldn't or didn't -- move forward. */
d8319b27 4171 st->u.plus.old = locinput;
1aa99e6b
IH
4172 if (do_utf8)
4173 locinput += UTF8SKIP(locinput);
4174 else
4175 locinput++;
d8319b27 4176 st->u.plus.count = 1;
0fe9bf95
IZ
4177 }
4178 }
4179 else
5d9a96ca 4180 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
1aa99e6b 4181 UV c;
9e137952 4182 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4183 if (do_utf8)
872c91ae 4184 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4185 UTF8_MAXBYTES, 0,
041457d9 4186 uniflags);
1aa99e6b 4187 else
9041c2e3 4188 c = UCHARAT(PL_reginput);
2390ecbc 4189 /* If it could work, try it. */
d8319b27 4190 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
2390ecbc 4191 {
d8319b27 4192 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
95b24440 4193 /*** all unsaved local vars undefined at this point */
d8319b27 4194 REGCP_UNWIND(st->u.plus.lastcp);
2390ecbc 4195 }
1aa99e6b 4196 }
a0d0e21e 4197 /* If it could work, try it. */
9e137952 4198 else if (st->u.plus.c1 == CHRTEST_VOID)
bbce6d69 4199 {
d8319b27 4200 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
95b24440 4201 /*** all unsaved local vars undefined at this point */
d8319b27 4202 REGCP_UNWIND(st->u.plus.lastcp);
bbce6d69 4203 }
c277df42 4204 /* Couldn't or didn't -- move forward. */
a0ed51b3 4205 PL_reginput = locinput;
32fc9b6a 4206 if (regrepeat(rex, scan, 1)) {
5d9a96ca 4207 st->ln++;
a0ed51b3
LW
4208 locinput = PL_reginput;
4209 }
4210 else
4633a7c4 4211 sayNO;
a0d0e21e
LW
4212 }
4213 }
4214 else {
32fc9b6a 4215 n = regrepeat(rex, scan, n);
a0ed51b3 4216 locinput = PL_reginput;
5d9a96ca 4217 if (st->ln < n && PL_regkind[(U8)OP(next)] == EOL &&
7fba1cd6 4218 (OP(next) != MEOL ||
15272685
HS
4219 OP(next) == SEOL || OP(next) == EOS))
4220 {
5d9a96ca 4221 st->ln = n; /* why back off? */
1aeab75a
GS
4222 /* ...because $ and \Z can match before *and* after
4223 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4224 We should back off by one in this case. */
4225 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
5d9a96ca 4226 st->ln--;
1aeab75a 4227 }
d8319b27 4228 REGCP_SET(st->u.plus.lastcp);
1d5c262f 4229 {
8fa7f367 4230 UV c = 0;
5d9a96ca 4231 while (n >= st->ln) {
9e137952 4232 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4233 if (do_utf8)
872c91ae 4234 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4235 UTF8_MAXBYTES, 0,
041457d9 4236 uniflags);
1aa99e6b 4237 else
9041c2e3 4238 c = UCHARAT(PL_reginput);
1aa99e6b 4239 }
c277df42 4240 /* If it could work, try it. */
9e137952 4241 if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
c277df42 4242 {
d8319b27 4243 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
95b24440 4244 /*** all unsaved local vars undefined at this point */
d8319b27 4245 REGCP_UNWIND(st->u.plus.lastcp);
c277df42
IZ
4246 }
4247 /* Couldn't or didn't -- back up. */
4248 n--;
dfe13c55 4249 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4250 }
a0d0e21e
LW
4251 }
4252 }
4633a7c4 4253 sayNO;
c277df42 4254 break;
a0d0e21e 4255 case END:
3b0527fe 4256 if (locinput < reginfo->till) {
a3621e74 4257 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4258 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4259 PL_colors[4],
4260 (long)(locinput - PL_reg_starttry),
3b0527fe 4261 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4262 PL_colors[5]));
4263 sayNO_FINAL; /* Cannot match: too short. */
4264 }
4265 PL_reginput = locinput; /* put where regtry can find it */
4266 sayYES_FINAL; /* Success! */
dad79028
DM
4267
4268 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4269 DEBUG_EXECUTE_r(
4270 PerlIO_printf(Perl_debug_log,
4271 "%*s %ssubpattern success...%s\n",
4272 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4273 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4274 sayYES_FINAL; /* Success! */
4275
4276 case SUSPEND: /* (?>FOO) */
4277 st->u.ifmatch.wanted = 1;
9fe1d20c 4278 PL_reginput = locinput;
9041c2e3 4279 goto do_ifmatch;
dad79028
DM
4280
4281 case UNLESSM: /* -ve lookaround: (?!FOO), or with flags, (?<!foo) */
4282 st->u.ifmatch.wanted = 0;
4283 goto ifmatch_trivial_fail_test;
4284
4285 case IFMATCH: /* +ve lookaround: (?=FOO), or with flags, (?<=foo) */
4286 st->u.ifmatch.wanted = 1;
4287 ifmatch_trivial_fail_test:
a0ed51b3 4288 if (scan->flags) {
52657f30 4289 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4290 if (!s) {
4291 /* trivial fail */
4292 if (st->logical) {
4293 st->logical = 0;
4294 st->sw = 1 - st->u.ifmatch.wanted;
4295 }
4296 else if (st->u.ifmatch.wanted)
4297 sayNO;
4298 next = scan + ARG(scan);
4299 if (next == scan)
4300 next = NULL;
4301 break;
4302 }
efb30f32 4303 PL_reginput = s;
a0ed51b3
LW
4304 }
4305 else
4306 PL_reginput = locinput;
4307
c277df42 4308 do_ifmatch:
dad79028
DM
4309 /* resume to current state on success */
4310 st->u.yes.prev_yes_state = yes_state;
4311 yes_state = st;
4312 PUSH_STATE(newst, resume_IFMATCH);
4313 st = newst;
4314 next = NEXTOPER(NEXTOPER(scan));
4315 break;
4316
c277df42 4317 case LONGJMP:
c277df42
IZ
4318 next = scan + ARG(scan);
4319 if (next == scan)
4320 next = NULL;
a0d0e21e
LW
4321 break;
4322 default:
b900a521 4323 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4324 PTR2UV(scan), OP(scan));
cea2e8a9 4325 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4326 }
95b24440 4327
02db2b7b 4328 reenter:
a0d0e21e 4329 scan = next;
95b24440
DM
4330 continue;
4331 /* NOTREACHED */
4332
4333 /* simulate recursively calling regmatch(), but without actually
4334 * recursing - ie save the current state on the heap rather than on
4335 * the stack, then re-enter the loop. This avoids complex regexes
4336 * blowing the processor stack */
4337
4338 start_recurse:
4339 {
5d9a96ca
DM
4340 /* push new state */
4341 regmatch_state *oldst = st;
4342
4343 depth++;
4344
4345 /* grab the next free state slot */
4346 st++;
86545054 4347 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4348 st = S_push_slab(aTHX);
4349 PL_regmatch_state = st;
4350
4351 oldst->next = next;
4352 oldst->n = n;
4353 oldst->locinput = locinput;
5d9a96ca
DM
4354
4355 st->cc = oldst->cc;
95b24440
DM
4356 locinput = PL_reginput;
4357 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4358 st->minmod = 0;
4359 st->sw = 0;
4360 st->logical = 0;
4361 st->unwind = 0;
95b24440
DM
4362#ifdef DEBUGGING
4363 PL_regindent++;
4364#endif
4365 }
a0d0e21e 4366 }
a687059c 4367
aa283a38
DM
4368
4369
a0d0e21e
LW
4370 /*
4371 * We get here only if there's trouble -- normally "case END" is
4372 * the terminating point.
4373 */
cea2e8a9 4374 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4375 /*NOTREACHED*/
4633a7c4
LW
4376 sayNO;
4377
7821416a 4378yes_final:
77cb431f
DM
4379
4380 if (yes_state) {
4381 /* we have successfully completed a subexpression, but we must now
4382 * pop to the state marked by yes_state and continue from there */
4383
dad79028 4384 /*XXX tmp for CURLYM*/
c4fd8992
AL
4385 regmatch_slab * const oslab = PL_regmatch_slab;
4386 regmatch_state * const ost = st;
4387 regmatch_state * const oys = yes_state;
dad79028
DM
4388 int odepth = depth;
4389
77cb431f
DM
4390 assert(st != yes_state);
4391 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4392 || yes_state > SLAB_LAST(PL_regmatch_slab))
4393 {
4394 /* not in this slab, pop slab */
4395 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4396 PL_regmatch_slab = PL_regmatch_slab->prev;
4397 st = SLAB_LAST(PL_regmatch_slab);
4398 }
4399 depth -= (st - yes_state);
dad79028 4400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE TO (%d)\n", depth));
77cb431f
DM
4401 st = yes_state;
4402 yes_state = st->u.yes.prev_yes_state;
4403 PL_regmatch_state = st;
4404
4405 switch (st->resume_state) {
4406 case resume_EVAL:
4407 if (st->u.eval.toggleutf)
4408 PL_reg_flags ^= RF_utf8;
4409 ReREFCNT_dec(rex);
4410 rex = st->u.eval.prev_rex;
4411 /* XXXX This is too dramatic a measure... */
4412 PL_reg_maxiter = 0;
4413 /* Restore parens of the caller without popping the
4414 * savestack */
4415 {
c4fd8992 4416 const I32 tmp = PL_savestack_ix;
77cb431f
DM
4417 PL_savestack_ix = st->u.eval.lastcp;
4418 regcppop(rex);
4419 PL_savestack_ix = tmp;
4420 }
4421 PL_reginput = locinput;
4422 /* continue at the node following the (??{...}) */
4423 next = st->next;
4424 goto reenter;
4425
dad79028
DM
4426 case resume_IFMATCH:
4427 if (st->logical) {
4428 st->logical = 0;
4429 st->sw = st->u.ifmatch.wanted;
4430 }
4431 else if (!st->u.ifmatch.wanted)
4432 sayNO;
4433
4434 if (OP(st->scan) == SUSPEND)
4435 locinput = PL_reginput;
4436 else {
4437 locinput = PL_reginput = st->locinput;
4438 nextchr = UCHARAT(locinput);
4439 }
4440 next = st->scan + ARG(st->scan);
4441 if (next == st->scan)
4442 next = NULL;
4443 goto reenter;
4444
4445 /* XXX tmp don't handle yes_state yet */
4446 case resume_CURLYM1:
4447 case resume_CURLYM2:
4448 case resume_CURLYM3:
dad79028
DM
4449 PL_regmatch_slab =oslab;
4450 st = ost;
4451 PL_regmatch_state = st;
4452 depth = odepth;
4453 yes_state = oys;
4454 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "XXX revering a CURLYM\n"));
4455 goto yes;
4456
77cb431f
DM
4457 default:
4458 Perl_croak(aTHX_ "unexpected yes reume state");
4459 }
4460 }
4461
a3621e74 4462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4463 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4464yes:
4465#ifdef DEBUGGING
3280af22 4466 PL_regindent--;
4633a7c4 4467#endif
02db2b7b 4468
95b24440 4469 result = 1;
aa283a38 4470 /* XXX this is duplicate(ish) code to that in the do_no section.
77cb431f
DM
4471 * eventually a yes should just pop the stack back to the current
4472 * yes_state */
aa283a38
DM
4473 if (depth) {
4474 /* restore previous state and re-enter */
4475 POP_STATE;
4476
4477 switch (st->resume_state) {
4478 case resume_TRIE1:
4479 goto resume_point_TRIE1;
4480 case resume_TRIE2:
4481 goto resume_point_TRIE2;
aa283a38
DM
4482 case resume_CURLYX:
4483 goto resume_point_CURLYX;
4484 case resume_WHILEM1:
4485 goto resume_point_WHILEM1;
4486 case resume_WHILEM2:
4487 goto resume_point_WHILEM2;
4488 case resume_WHILEM3:
4489 goto resume_point_WHILEM3;
4490 case resume_WHILEM4:
4491 goto resume_point_WHILEM4;
4492 case resume_WHILEM5:
4493 goto resume_point_WHILEM5;
4494 case resume_WHILEM6:
4495 goto resume_point_WHILEM6;
4496 case resume_CURLYM1:
4497 goto resume_point_CURLYM1;
4498 case resume_CURLYM2:
4499 goto resume_point_CURLYM2;
4500 case resume_CURLYM3:
4501 goto resume_point_CURLYM3;
aa283a38
DM
4502 case resume_PLUS1:
4503 goto resume_point_PLUS1;
4504 case resume_PLUS2:
4505 goto resume_point_PLUS2;
4506 case resume_PLUS3:
4507 goto resume_point_PLUS3;
4508 case resume_PLUS4:
4509 goto resume_point_PLUS4;
77cb431f 4510
dad79028 4511 case resume_IFMATCH:
77cb431f 4512 case resume_EVAL:
aa283a38
DM
4513 default:
4514 Perl_croak(aTHX_ "regexp resume memory corruption");
4515 }
4516 }
4517 goto final_exit;
4633a7c4
LW
4518
4519no:
a3621e74 4520 DEBUG_EXECUTE_r(
7821416a
IZ
4521 PerlIO_printf(Perl_debug_log,
4522 "%*s %sfailed...%s\n",
e4584336 4523 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4524 );
4525 goto do_no;
4526no_final:
4527do_no:
5d9a96ca
DM
4528 if (st->unwind) {
4529 re_unwind_t * const uw = SSPTRt(st->unwind,re_unwind_t);
02db2b7b
IZ
4530
4531 switch (uw->type) {
4532 case RE_UNWIND_BRANCH:
4533 case RE_UNWIND_BRANCHJ:
4534 {
6136c704 4535 re_unwind_branch_t * const uwb = &(uw->branch);
a3b680e6 4536 const I32 lastparen = uwb->lastparen;
9041c2e3 4537
02db2b7b
IZ
4538 REGCP_UNWIND(uwb->lastcp);
4539 for (n = *PL_reglastparen; n > lastparen; n--)
4540 PL_regendp[n] = -1;
4541 *PL_reglastparen = n;
4542 scan = next = uwb->next;
3a2830be 4543 st->minmod = uwb->minmod;
9041c2e3
NIS
4544 if ( !scan ||
4545 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b 4546 ? BRANCH : BRANCHJ) ) { /* Failure */
5d9a96ca 4547 st->unwind = uwb->prev;
02db2b7b
IZ
4548#ifdef DEBUGGING
4549 PL_regindent--;
4550#endif
4551 goto do_no;
4552 }
4553 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4554 if ((n = (uwb->type == RE_UNWIND_BRANCH
4555 ? NEXT_OFF(next) : ARG(next))))
4556 next += n;
4557 else
4558 next = NULL; /* XXXX Needn't unwinding in this case... */
4559 uwb->next = next;
4560 next = NEXTOPER(scan);
4561 if (uwb->type == RE_UNWIND_BRANCHJ)
4562 next = NEXTOPER(next);
4563 locinput = uwb->locinput;
4564 nextchr = uwb->nextchr;
4565#ifdef DEBUGGING
4566 PL_regindent = uwb->regindent;
4567#endif
4568
4569 goto reenter;
4570 }
5f66b61c 4571 /* NOTREACHED */
02db2b7b
IZ
4572 default:
4573 Perl_croak(aTHX_ "regexp unwind memory corruption");
4574 }
5f66b61c 4575 /* NOTREACHED */
02db2b7b 4576 }
aa283a38 4577
4633a7c4 4578#ifdef DEBUGGING
3280af22 4579 PL_regindent--;
4633a7c4 4580#endif
95b24440 4581 result = 0;
5d9a96ca 4582
aa283a38
DM
4583 if (depth) {
4584 /* there's a previous state to backtrack to */
4585 POP_STATE;
5d9a96ca 4586 switch (st->resume_state) {
95b24440
DM
4587 case resume_TRIE1:
4588 goto resume_point_TRIE1;
4589 case resume_TRIE2:
4590 goto resume_point_TRIE2;
aa283a38
DM
4591 case resume_EVAL:
4592 /* we have failed an (??{...}). Restore state to the outer re
4593 * then re-throw the failure */
4594 if (st->u.eval.toggleutf)
4595 PL_reg_flags ^= RF_utf8;
4596 ReREFCNT_dec(rex);
4597 rex = st->u.eval.prev_rex;
77cb431f 4598 yes_state = st->u.yes.prev_yes_state;
aa283a38
DM
4599
4600 /* XXXX This is too dramatic a measure... */
4601 PL_reg_maxiter = 0;
4602
4603 PL_reginput = locinput;
4604 REGCP_UNWIND(st->u.eval.lastcp);
4605 regcppop(rex);
4606 goto do_no;
4607
95b24440
DM
4608 case resume_CURLYX:
4609 goto resume_point_CURLYX;
4610 case resume_WHILEM1:
4611 goto resume_point_WHILEM1;
4612 case resume_WHILEM2:
4613 goto resume_point_WHILEM2;
4614 case resume_WHILEM3:
4615 goto resume_point_WHILEM3;
4616 case resume_WHILEM4:
4617 goto resume_point_WHILEM4;
4618 case resume_WHILEM5:
4619 goto resume_point_WHILEM5;
4620 case resume_WHILEM6:
4621 goto resume_point_WHILEM6;
4622 case resume_CURLYM1:
4623 goto resume_point_CURLYM1;
4624 case resume_CURLYM2:
4625 goto resume_point_CURLYM2;
4626 case resume_CURLYM3:
4627 goto resume_point_CURLYM3;
95b24440 4628 case resume_IFMATCH:
dad79028
DM
4629 yes_state = st->u.yes.prev_yes_state;
4630 if (st->logical) {
4631 st->logical = 0;
4632 st->sw = !st->u.ifmatch.wanted;
4633 }
4634 else if (st->u.ifmatch.wanted)
4635 sayNO;
4636
4637 assert(OP(scan) != SUSPEND); /* XXX DAPM tmp */
4638 locinput = PL_reginput = st->locinput;
4639 nextchr = UCHARAT(locinput);
4640 next = scan + ARG(scan);
4641 if (next == scan)
4642 next = NULL;
4643 goto reenter;
4644
95b24440
DM
4645 case resume_PLUS1:
4646 goto resume_point_PLUS1;
4647 case resume_PLUS2:
4648 goto resume_point_PLUS2;
4649 case resume_PLUS3:
4650 goto resume_point_PLUS3;
4651 case resume_PLUS4:
4652 goto resume_point_PLUS4;
95b24440
DM
4653 default:
4654 Perl_croak(aTHX_ "regexp resume memory corruption");
4655 }
95b24440 4656 }
aa283a38
DM
4657
4658final_exit:
4659
5d9a96ca
DM
4660 /* restore original high-water mark */
4661 PL_regmatch_slab = orig_slab;
4662 PL_regmatch_state = orig_state;
4663
4664 /* free all slabs above current one */
4665 if (orig_slab->next) {
c4fd8992 4666 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4667 orig_slab->next = NULL;
4668 while (sl) {
c4fd8992 4669 regmatch_slab * const osl = sl;
5d9a96ca 4670 sl = sl->next;
ad65c075 4671 Safefree(osl);
5d9a96ca
DM
4672 }
4673 }
4674
95b24440
DM
4675 return result;
4676
a687059c
LW
4677}
4678
4679/*
4680 - regrepeat - repeatedly match something simple, report how many
4681 */
4682/*
4683 * [This routine now assumes that it will only match on things of length 1.
4684 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4685 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4686 */
76e3520e 4687STATIC I32
32fc9b6a 4688S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4689{
27da23d5 4690 dVAR;
a0d0e21e 4691 register char *scan;
a0d0e21e 4692 register I32 c;
3280af22 4693 register char *loceol = PL_regeol;
a0ed51b3 4694 register I32 hardcount = 0;
53c4c00c 4695 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4696
3280af22 4697 scan = PL_reginput;
faf11cac
HS
4698 if (max == REG_INFTY)
4699 max = I32_MAX;
4700 else if (max < loceol - scan)
7f596f4c 4701 loceol = scan + max;
a0d0e21e 4702 switch (OP(p)) {
22c35a8c 4703 case REG_ANY:
1aa99e6b 4704 if (do_utf8) {
ffc61ed2 4705 loceol = PL_regeol;
1aa99e6b 4706 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4707 scan += UTF8SKIP(scan);
4708 hardcount++;
4709 }
4710 } else {
4711 while (scan < loceol && *scan != '\n')
4712 scan++;
a0ed51b3
LW
4713 }
4714 break;
ffc61ed2 4715 case SANY:
def8e4ea
JH
4716 if (do_utf8) {
4717 loceol = PL_regeol;
a0804c9e 4718 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4719 scan += UTF8SKIP(scan);
4720 hardcount++;
4721 }
4722 }
4723 else
4724 scan = loceol;
a0ed51b3 4725 break;
f33976b4
DB
4726 case CANY:
4727 scan = loceol;
4728 break;
090f7165
JH
4729 case EXACT: /* length of string is 1 */
4730 c = (U8)*STRING(p);
4731 while (scan < loceol && UCHARAT(scan) == c)
4732 scan++;
bbce6d69 4733 break;
4734 case EXACTF: /* length of string is 1 */
cd439c50 4735 c = (U8)*STRING(p);
bbce6d69 4736 while (scan < loceol &&
22c35a8c 4737 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4738 scan++;
4739 break;
4740 case EXACTFL: /* length of string is 1 */
3280af22 4741 PL_reg_flags |= RF_tainted;
cd439c50 4742 c = (U8)*STRING(p);
bbce6d69 4743 while (scan < loceol &&
22c35a8c 4744 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4745 scan++;
4746 break;
4747 case ANYOF:
ffc61ed2
JH
4748 if (do_utf8) {
4749 loceol = PL_regeol;
cfc92286 4750 while (hardcount < max && scan < loceol &&
32fc9b6a 4751 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4752 scan += UTF8SKIP(scan);
4753 hardcount++;
4754 }
4755 } else {
32fc9b6a 4756 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4757 scan++;
4758 }
a0d0e21e
LW
4759 break;
4760 case ALNUM:
1aa99e6b 4761 if (do_utf8) {
ffc61ed2 4762 loceol = PL_regeol;
1a4fad37 4763 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4764 while (hardcount < max && scan < loceol &&
3568d838 4765 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4766 scan += UTF8SKIP(scan);
4767 hardcount++;
4768 }
4769 } else {
4770 while (scan < loceol && isALNUM(*scan))
4771 scan++;
a0ed51b3
LW
4772 }
4773 break;
bbce6d69 4774 case ALNUML:
3280af22 4775 PL_reg_flags |= RF_tainted;
1aa99e6b 4776 if (do_utf8) {
ffc61ed2 4777 loceol = PL_regeol;
1aa99e6b
IH
4778 while (hardcount < max && scan < loceol &&
4779 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4780 scan += UTF8SKIP(scan);
4781 hardcount++;
4782 }
4783 } else {
4784 while (scan < loceol && isALNUM_LC(*scan))
4785 scan++;
a0ed51b3
LW
4786 }
4787 break;
a0d0e21e 4788 case NALNUM:
1aa99e6b 4789 if (do_utf8) {
ffc61ed2 4790 loceol = PL_regeol;
1a4fad37 4791 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4792 while (hardcount < max && scan < loceol &&
3568d838 4793 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4794 scan += UTF8SKIP(scan);
4795 hardcount++;
4796 }
4797 } else {
4798 while (scan < loceol && !isALNUM(*scan))
4799 scan++;
a0ed51b3
LW
4800 }
4801 break;
bbce6d69 4802 case NALNUML:
3280af22 4803 PL_reg_flags |= RF_tainted;
1aa99e6b 4804 if (do_utf8) {
ffc61ed2 4805 loceol = PL_regeol;
1aa99e6b
IH
4806 while (hardcount < max && scan < loceol &&
4807 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4808 scan += UTF8SKIP(scan);
4809 hardcount++;
4810 }
4811 } else {
4812 while (scan < loceol && !isALNUM_LC(*scan))
4813 scan++;
a0ed51b3
LW
4814 }
4815 break;
a0d0e21e 4816 case SPACE:
1aa99e6b 4817 if (do_utf8) {
ffc61ed2 4818 loceol = PL_regeol;
1a4fad37 4819 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4820 while (hardcount < max && scan < loceol &&
3568d838
JH
4821 (*scan == ' ' ||
4822 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4823 scan += UTF8SKIP(scan);
4824 hardcount++;
4825 }
4826 } else {
4827 while (scan < loceol && isSPACE(*scan))
4828 scan++;
a0ed51b3
LW
4829 }
4830 break;
bbce6d69 4831 case SPACEL:
3280af22 4832 PL_reg_flags |= RF_tainted;
1aa99e6b 4833 if (do_utf8) {
ffc61ed2 4834 loceol = PL_regeol;
1aa99e6b 4835 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4836 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4837 scan += UTF8SKIP(scan);
4838 hardcount++;
4839 }
4840 } else {
4841 while (scan < loceol && isSPACE_LC(*scan))
4842 scan++;
a0ed51b3
LW
4843 }
4844 break;
a0d0e21e 4845 case NSPACE:
1aa99e6b 4846 if (do_utf8) {
ffc61ed2 4847 loceol = PL_regeol;
1a4fad37 4848 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4849 while (hardcount < max && scan < loceol &&
3568d838
JH
4850 !(*scan == ' ' ||
4851 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4852 scan += UTF8SKIP(scan);
4853 hardcount++;
4854 }
4855 } else {
4856 while (scan < loceol && !isSPACE(*scan))
4857 scan++;
4858 break;
a0ed51b3 4859 }
bbce6d69 4860 case NSPACEL:
3280af22 4861 PL_reg_flags |= RF_tainted;
1aa99e6b 4862 if (do_utf8) {
ffc61ed2 4863 loceol = PL_regeol;
1aa99e6b 4864 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4865 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4866 scan += UTF8SKIP(scan);
4867 hardcount++;
4868 }
4869 } else {
4870 while (scan < loceol && !isSPACE_LC(*scan))
4871 scan++;
a0ed51b3
LW
4872 }
4873 break;
a0d0e21e 4874 case DIGIT:
1aa99e6b 4875 if (do_utf8) {
ffc61ed2 4876 loceol = PL_regeol;
1a4fad37 4877 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4878 while (hardcount < max && scan < loceol &&
3568d838 4879 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4880 scan += UTF8SKIP(scan);
4881 hardcount++;
4882 }
4883 } else {
4884 while (scan < loceol && isDIGIT(*scan))
4885 scan++;
a0ed51b3
LW
4886 }
4887 break;
a0d0e21e 4888 case NDIGIT:
1aa99e6b 4889 if (do_utf8) {
ffc61ed2 4890 loceol = PL_regeol;
1a4fad37 4891 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4892 while (hardcount < max && scan < loceol &&
3568d838 4893 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4894 scan += UTF8SKIP(scan);
4895 hardcount++;
4896 }
4897 } else {
4898 while (scan < loceol && !isDIGIT(*scan))
4899 scan++;
a0ed51b3
LW
4900 }
4901 break;
a0d0e21e
LW
4902 default: /* Called on something of 0 width. */
4903 break; /* So match right here or not at all. */
4904 }
a687059c 4905
a0ed51b3
LW
4906 if (hardcount)
4907 c = hardcount;
4908 else
4909 c = scan - PL_reginput;
3280af22 4910 PL_reginput = scan;
a687059c 4911
a3621e74 4912 DEBUG_r({
ab74612d 4913 SV *re_debug_flags = NULL;
6136c704 4914 SV * const prop = sv_newmortal();
a3621e74
YO
4915 GET_RE_DEBUG_FLAGS;
4916 DEBUG_EXECUTE_r({
32fc9b6a 4917 regprop(prog, prop, p);
9041c2e3
NIS
4918 PerlIO_printf(Perl_debug_log,
4919 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3f7c398e 4920 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
c277df42 4921 });
a3621e74 4922 });
9041c2e3 4923
a0d0e21e 4924 return(c);
a687059c
LW
4925}
4926
c277df42 4927
76234dfb 4928#ifndef PERL_IN_XSUB_RE
c277df42 4929/*
ffc61ed2
JH
4930- regclass_swash - prepare the utf8 swash
4931*/
4932
4933SV *
32fc9b6a 4934Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4935{
97aff369 4936 dVAR;
9e55ce06
JH
4937 SV *sw = NULL;
4938 SV *si = NULL;
4939 SV *alt = NULL;
32fc9b6a 4940 const struct reg_data *data = prog ? prog->data : NULL;
ffc61ed2 4941
4f639d21 4942 if (data && data->count) {
a3b680e6 4943 const U32 n = ARG(node);
ffc61ed2 4944
4f639d21
DM
4945 if (data->what[n] == 's') {
4946 SV * const rv = (SV*)data->data[n];
890ce7af 4947 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4948 SV **const ary = AvARRAY(av);
9e55ce06 4949 SV **a, **b;
9041c2e3 4950
711a919c 4951 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4952 * documentation of these array elements. */
4953
b11f357e 4954 si = *ary;
8f7f7219 4955 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4956 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4957
ffc61ed2
JH
4958 if (a)
4959 sw = *a;
4960 else if (si && doinit) {
4961 sw = swash_init("utf8", "", si, 1, 0);
4962 (void)av_store(av, 1, sw);
4963 }
9e55ce06
JH
4964 if (b)
4965 alt = *b;
ffc61ed2
JH
4966 }
4967 }
4968
9e55ce06
JH
4969 if (listsvp)
4970 *listsvp = si;
4971 if (altsvp)
4972 *altsvp = alt;
ffc61ed2
JH
4973
4974 return sw;
4975}
76234dfb 4976#endif
ffc61ed2
JH
4977
4978/*
ba7b4546 4979 - reginclass - determine if a character falls into a character class
832705d4
JH
4980
4981 The n is the ANYOF regnode, the p is the target string, lenp
4982 is pointer to the maximum length of how far to go in the p
4983 (if the lenp is zero, UTF8SKIP(p) is used),
4984 do_utf8 tells whether the target string is in UTF-8.
4985
bbce6d69 4986 */
4987
76e3520e 4988STATIC bool
32fc9b6a 4989S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4990{
27da23d5 4991 dVAR;
a3b680e6 4992 const char flags = ANYOF_FLAGS(n);
bbce6d69 4993 bool match = FALSE;
cc07378b 4994 UV c = *p;
ae9ddab8 4995 STRLEN len = 0;
9e55ce06 4996 STRLEN plen;
1aa99e6b 4997
19f67299
TS
4998 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4999 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5000 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5001 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5002 if (len == (STRLEN)-1)
5003 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5004 }
bbce6d69 5005
0f0076b4 5006 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5007 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5008 if (lenp)
5009 *lenp = 0;
ffc61ed2 5010 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5011 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5012 match = TRUE;
bbce6d69 5013 }
3568d838 5014 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5015 match = TRUE;
ffc61ed2 5016 if (!match) {
9e55ce06 5017 AV *av;
32fc9b6a 5018 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5019
5020 if (sw) {
3568d838 5021 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5022 match = TRUE;
5023 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5024 if (!match && lenp && av) {
5025 I32 i;
9e55ce06 5026 for (i = 0; i <= av_len(av); i++) {
890ce7af 5027 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5028 STRLEN len;
890ce7af 5029 const char * const s = SvPV_const(sv, len);
9e55ce06 5030
061b10df 5031 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5032 *lenp = len;
5033 match = TRUE;
5034 break;
5035 }
5036 }
5037 }
5038 if (!match) {
89ebb4a3 5039 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5040 STRLEN tmplen;
5041
9e55ce06
JH
5042 to_utf8_fold(p, tmpbuf, &tmplen);
5043 if (swash_fetch(sw, tmpbuf, do_utf8))
5044 match = TRUE;
5045 }
ffc61ed2
JH
5046 }
5047 }
bbce6d69 5048 }
9e55ce06 5049 if (match && lenp && *lenp == 0)
0f0076b4 5050 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5051 }
1aa99e6b 5052 if (!match && c < 256) {
ffc61ed2
JH
5053 if (ANYOF_BITMAP_TEST(n, c))
5054 match = TRUE;
5055 else if (flags & ANYOF_FOLD) {
eb160463 5056 U8 f;
a0ed51b3 5057
ffc61ed2
JH
5058 if (flags & ANYOF_LOCALE) {
5059 PL_reg_flags |= RF_tainted;
5060 f = PL_fold_locale[c];
5061 }
5062 else
5063 f = PL_fold[c];
5064 if (f != c && ANYOF_BITMAP_TEST(n, f))
5065 match = TRUE;
5066 }
5067
5068 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5069 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5070 if (
5071 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5072 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5073 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5074 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5075 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5076 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5077 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5078 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5079 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5080 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5081 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5082 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5083 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5084 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5085 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5086 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5087 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5088 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5089 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5090 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5091 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5092 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5093 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5094 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5095 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5096 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5097 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5098 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5099 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5100 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5101 ) /* How's that for a conditional? */
5102 {
5103 match = TRUE;
5104 }
a0ed51b3 5105 }
a0ed51b3
LW
5106 }
5107
a0ed51b3
LW
5108 return (flags & ANYOF_INVERT) ? !match : match;
5109}
161b471a 5110
dfe13c55 5111STATIC U8 *
5f66b61c 5112S_reghop3(U8 *s, I32 off, U8* lim)
9041c2e3 5113{
97aff369 5114 dVAR;
a0ed51b3 5115 if (off >= 0) {
1aa99e6b 5116 while (off-- && s < lim) {
ffc61ed2 5117 /* XXX could check well-formedness here */
a0ed51b3 5118 s += UTF8SKIP(s);
ffc61ed2 5119 }
a0ed51b3
LW
5120 }
5121 else {
5122 while (off++) {
1aa99e6b 5123 if (s > lim) {
a0ed51b3 5124 s--;
ffc61ed2 5125 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5126 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5127 s--;
ffc61ed2
JH
5128 }
5129 /* XXX could check well-formedness here */
a0ed51b3
LW
5130 }
5131 }
5132 }
5133 return s;
5134}
161b471a 5135
dfe13c55 5136STATIC U8 *
5f66b61c 5137S_reghopmaybe3(U8* s, I32 off, U8* lim)
a0ed51b3 5138{
97aff369 5139 dVAR;
a0ed51b3 5140 if (off >= 0) {
1aa99e6b 5141 while (off-- && s < lim) {
ffc61ed2 5142 /* XXX could check well-formedness here */
a0ed51b3 5143 s += UTF8SKIP(s);
ffc61ed2 5144 }
a0ed51b3
LW
5145 if (off >= 0)
5146 return 0;
5147 }
5148 else {
5149 while (off++) {
1aa99e6b 5150 if (s > lim) {
a0ed51b3 5151 s--;
ffc61ed2 5152 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5153 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5154 s--;
ffc61ed2
JH
5155 }
5156 /* XXX could check well-formedness here */
a0ed51b3
LW
5157 }
5158 else
5159 break;
5160 }
5161 if (off <= 0)
5162 return 0;
5163 }
5164 return s;
5165}
51371543 5166
51371543 5167static void
acfe0abc 5168restore_pos(pTHX_ void *arg)
51371543 5169{
97aff369 5170 dVAR;
097eb12c 5171 regexp * const rex = (regexp *)arg;
51371543
GS
5172 if (PL_reg_eval_set) {
5173 if (PL_reg_oldsaved) {
4f639d21
DM
5174 rex->subbeg = PL_reg_oldsaved;
5175 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5176#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5177 rex->saved_copy = PL_nrs;
ed252734 5178#endif
4f639d21 5179 RX_MATCH_COPIED_on(rex);
51371543
GS
5180 }
5181 PL_reg_magic->mg_len = PL_reg_oldpos;
5182 PL_reg_eval_set = 0;
5183 PL_curpm = PL_reg_oldcurpm;
5184 }
5185}
33b8afdf
JH
5186
5187STATIC void
5188S_to_utf8_substr(pTHX_ register regexp *prog)
5189{
33b8afdf 5190 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5191 SV* const sv = newSVsv(prog->float_substr);
5192 prog->float_utf8 = sv;
33b8afdf
JH
5193 sv_utf8_upgrade(sv);
5194 if (SvTAIL(prog->float_substr))
5195 SvTAIL_on(sv);
5196 if (prog->float_substr == prog->check_substr)
5197 prog->check_utf8 = sv;
5198 }
5199 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5200 SV* const sv = newSVsv(prog->anchored_substr);
5201 prog->anchored_utf8 = sv;
33b8afdf
JH
5202 sv_utf8_upgrade(sv);
5203 if (SvTAIL(prog->anchored_substr))
5204 SvTAIL_on(sv);
5205 if (prog->anchored_substr == prog->check_substr)
5206 prog->check_utf8 = sv;
5207 }
5208}
5209
5210STATIC void
5211S_to_byte_substr(pTHX_ register regexp *prog)
5212{
97aff369 5213 dVAR;
33b8afdf 5214 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5215 SV* sv = newSVsv(prog->float_utf8);
5216 prog->float_substr = sv;
33b8afdf
JH
5217 if (sv_utf8_downgrade(sv, TRUE)) {
5218 if (SvTAIL(prog->float_utf8))
5219 SvTAIL_on(sv);
5220 } else {
5221 SvREFCNT_dec(sv);
5222 prog->float_substr = sv = &PL_sv_undef;
5223 }
5224 if (prog->float_utf8 == prog->check_utf8)
5225 prog->check_substr = sv;
5226 }
5227 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5228 SV* sv = newSVsv(prog->anchored_utf8);
5229 prog->anchored_substr = sv;
33b8afdf
JH
5230 if (sv_utf8_downgrade(sv, TRUE)) {
5231 if (SvTAIL(prog->anchored_utf8))
5232 SvTAIL_on(sv);
5233 } else {
5234 SvREFCNT_dec(sv);
5235 prog->anchored_substr = sv = &PL_sv_undef;
5236 }
5237 if (prog->anchored_utf8 == prog->check_utf8)
5238 prog->check_substr = sv;
5239 }
5240}
66610fdd
RGS
5241
5242/*
5243 * Local variables:
5244 * c-indentation-style: bsd
5245 * c-basic-offset: 4
5246 * indent-tabs-mode: t
5247 * End:
5248 *
37442d52
RGS
5249 * ex: set ts=8 sts=4 sw=4 noet:
5250 */