This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Why hide functions (and duplicate them) when you can just avoid
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
c5be433b
GS
50
51# define PERL_NO_GET_CONTEXT
9041c2e3 52#endif
56953603 53
a687059c 54/*
e50aee73 55 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
56 *
57 * Copyright (c) 1986 by University of Toronto.
58 * Written by Henry Spencer. Not derived from licensed software.
59 *
60 * Permission is granted to anyone to use this software for any
61 * purpose on any computer system, and to redistribute it freely,
62 * subject to the following restrictions:
63 *
64 * 1. The author is not responsible for the consequences of use of
65 * this software, no matter how awful, even if they arise
66 * from defects in it.
67 *
68 * 2. The origin of this software must not be misrepresented, either
69 * by explicit claim or by omission.
70 *
71 * 3. Altered versions must be plainly marked as such, and must not
72 * be misrepresented as being the original software.
73 *
74 **** Alterations to Henry's code are...
75 ****
4bb101f2 76 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 77 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 78 ****
9ef589d8
LW
79 **** You may distribute under the terms of either the GNU General Public
80 **** License or the Artistic License, as specified in the README file.
a687059c
LW
81 *
82 * Beware that some of this code is subtly aware of the way operator
83 * precedence is structured in regular expressions. Serious changes in
84 * regular-expression syntax might require a total rethink.
85 */
86#include "EXTERN.h"
864dbfa3 87#define PERL_IN_REGEXEC_C
a687059c 88#include "perl.h"
0f5d15d6 89
a687059c
LW
90#include "regcomp.h"
91
c277df42
IZ
92#define RF_tainted 1 /* tainted information used? */
93#define RF_warned 2 /* warned about big count? */
ce862d02 94#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
95#define RF_utf8 8 /* String contains multibyte chars? */
96
eb160463 97#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
98
99#define RS_init 1 /* eval environment created */
100#define RS_set 2 /* replsv value is set */
c277df42 101
a687059c
LW
102#ifndef STATIC
103#define STATIC static
104#endif
105
32fc9b6a 106#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
33b8afdf 112#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 113#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 114
52657f30
AL
115#define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
116 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
117 : (U8*)(pos + off)))
118#define HOPBACKc(pos, off) ((char*) \
119 ((PL_reg_match_utf8) \
120 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
efb30f32
HS
121 : (pos - off >= PL_bostr) \
122 ? (U8*)(pos - off) \
52657f30 123 : (U8*)NULL) \
efb30f32 124)
efb30f32 125
1aa99e6b 126#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c 127#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 129
1a4fad37
AL
130#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
131 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
132#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
133#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
134#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
135#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 136
5f80c4cf 137/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
138#define JUMPABLE(rn) ( \
139 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
140 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
141 OP(rn) == PLUS || OP(rn) == MINMOD || \
142 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
143)
144
cca55fe3
JP
145#define HAS_TEXT(rn) ( \
146 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
147)
e2d8ce26 148
a84d97b6
HS
149/*
150 Search for mandatory following text node; for lookahead, the text must
151 follow but for lookbehind (rn->flags != 0) we skip to the next step.
152*/
cca55fe3 153#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 154 while (JUMPABLE(rn)) \
a84d97b6 155 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 156 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
157 else if (OP(rn) == PLUS) \
158 rn = NEXTOPER(rn); \
a84d97b6
HS
159 else if (OP(rn) == IFMATCH) \
160 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 161 else rn += NEXT_OFF(rn); \
5f80c4cf 162} STMT_END
74750237 163
acfe0abc 164static void restore_pos(pTHX_ void *arg);
51371543 165
76e3520e 166STATIC CHECKPOINT
cea2e8a9 167S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 168{
97aff369 169 dVAR;
a3b680e6 170 const int retval = PL_savestack_ix;
b1ce53c5 171#define REGCP_PAREN_ELEMS 4
a3b680e6 172 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
173 int p;
174
e49a9654
IH
175 if (paren_elems_to_push < 0)
176 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
177
a01268b5 178#define REGCP_OTHER_ELEMS 6
4b3c1a47 179 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 180 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 181/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
182 SSPUSHINT(PL_regendp[p]);
183 SSPUSHINT(PL_regstartp[p]);
3280af22 184 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
185 SSPUSHINT(p);
186 }
b1ce53c5 187/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
188 SSPUSHINT(PL_regsize);
189 SSPUSHINT(*PL_reglastparen);
a01268b5 190 SSPUSHINT(*PL_reglastcloseparen);
3280af22 191 SSPUSHPTR(PL_reginput);
41123dfd
JH
192#define REGCP_FRAME_ELEMS 2
193/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
194 * are needed for the regexp context stack bookkeeping. */
195 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 196 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 197
a0d0e21e
LW
198 return retval;
199}
200
c277df42 201/* These are needed since we do not localize EVAL nodes: */
a3621e74 202# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 204 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 205
a3621e74 206# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 207 PerlIO_printf(Perl_debug_log, \
faccc32b 208 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 209 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 210
76e3520e 211STATIC char *
097eb12c 212S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 213{
97aff369 214 dVAR;
b1ce53c5 215 I32 i;
a0d0e21e 216 char *input;
b1ce53c5 217
a3621e74
YO
218 GET_RE_DEBUG_FLAGS_DECL;
219
b1ce53c5 220 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 221 i = SSPOPINT;
b1ce53c5
JH
222 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
223 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 224 input = (char *) SSPOPPTR;
a01268b5 225 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
226 *PL_reglastparen = SSPOPINT;
227 PL_regsize = SSPOPINT;
b1ce53c5
JH
228
229 /* Now restore the parentheses context. */
41123dfd
JH
230 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
231 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 232 I32 tmps;
097eb12c 233 U32 paren = (U32)SSPOPINT;
3280af22 234 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
235 PL_regstartp[paren] = SSPOPINT;
236 tmps = SSPOPINT;
3280af22
NIS
237 if (paren <= *PL_reglastparen)
238 PL_regendp[paren] = tmps;
a3621e74 239 DEBUG_EXECUTE_r(
c3464db5 240 PerlIO_printf(Perl_debug_log,
b900a521 241 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 242 (UV)paren, (IV)PL_regstartp[paren],
b900a521 243 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 244 (IV)PL_regendp[paren],
3280af22 245 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 246 );
a0d0e21e 247 }
a3621e74 248 DEBUG_EXECUTE_r(
4f639d21 249 if ((I32)(*PL_reglastparen + 1) <= rex->nparens) {
c3464db5 250 PerlIO_printf(Perl_debug_log,
faccc32b 251 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 252 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
253 }
254 );
daf18116 255#if 1
dafc8851
JH
256 /* It would seem that the similar code in regtry()
257 * already takes care of this, and in fact it is in
258 * a better location to since this code can #if 0-ed out
259 * but the code in regtry() is needed or otherwise tests
260 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
261 * (as of patchlevel 7877) will fail. Then again,
262 * this code seems to be necessary or otherwise
263 * building DynaLoader will fail:
264 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
265 * --jhi */
097eb12c
AL
266 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
267 if (i > PL_regsize)
268 PL_regstartp[i] = -1;
269 PL_regendp[i] = -1;
a0d0e21e 270 }
dafc8851 271#endif
a0d0e21e
LW
272 return input;
273}
274
02db2b7b 275#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 276
95b24440 277#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
278 if (paren) { \
279 if (n) { \
280 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
281 PL_regendp[paren] = input - PL_bostr; \
282 } \
283 else \
284 PL_regendp[paren] = -1; \
285 } \
95b24440
DM
286 REGMATCH(next, where); \
287 if (result) \
29d1e993
HS
288 sayYES; \
289 if (paren && n) \
290 PL_regendp[paren] = -1; \
291}
292
293
a687059c 294/*
e50aee73 295 * pregexec and friends
a687059c
LW
296 */
297
76234dfb 298#ifndef PERL_IN_XSUB_RE
a687059c 299/*
c277df42 300 - pregexec - match a regexp against a string
a687059c 301 */
c277df42 302I32
864dbfa3 303Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 304 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
305/* strend: pointer to null at end of string */
306/* strbeg: real beginning of string */
307/* minend: end of match must be >=minend after stringarg. */
308/* nosave: For optimizations. */
309{
310 return
9041c2e3 311 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
312 nosave ? 0 : REXEC_COPY_STR);
313}
76234dfb 314#endif
22e551b9 315
9041c2e3 316/*
cad2e5aa
JH
317 * Need to implement the following flags for reg_anch:
318 *
319 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
320 * USE_INTUIT_ML
321 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
322 * INTUIT_AUTORITATIVE_ML
323 * INTUIT_ONCE_NOML - Intuit can match in one location only.
324 * INTUIT_ONCE_ML
325 *
326 * Another flag for this function: SECOND_TIME (so that float substrs
327 * with giant delta may be not rechecked).
328 */
329
330/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
331
3f7c398e 332/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
333 Otherwise, only SvCUR(sv) is used to get strbeg. */
334
335/* XXXX We assume that strpos is strbeg unless sv. */
336
6eb5f6b9
JH
337/* XXXX Some places assume that there is a fixed substring.
338 An update may be needed if optimizer marks as "INTUITable"
339 RExen without fixed substrings. Similarly, it is assumed that
340 lengths of all the strings are no more than minlen, thus they
341 cannot come from lookahead.
342 (Or minlen should take into account lookahead.) */
343
2c2d71f5
JH
344/* A failure to find a constant substring means that there is no need to make
345 an expensive call to REx engine, thus we celebrate a failure. Similarly,
346 finding a substring too deep into the string means that less calls to
30944b6d
IZ
347 regtry() should be needed.
348
349 REx compiler's optimizer found 4 possible hints:
350 a) Anchored substring;
351 b) Fixed substring;
352 c) Whether we are anchored (beginning-of-line or \G);
353 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 354 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
355 string which does not contradict any of them.
356 */
2c2d71f5 357
6eb5f6b9
JH
358/* Most of decisions we do here should have been done at compile time.
359 The nodes of the REx which we used for the search should have been
360 deleted from the finite automaton. */
361
cad2e5aa
JH
362char *
363Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
364 char *strend, U32 flags, re_scream_pos_data *data)
365{
97aff369 366 dVAR;
b7953727 367 register I32 start_shift = 0;
cad2e5aa 368 /* Should be nonnegative! */
b7953727 369 register I32 end_shift = 0;
2c2d71f5
JH
370 register char *s;
371 register SV *check;
a1933d95 372 char *strbeg;
cad2e5aa 373 char *t;
a3b680e6 374 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 375 I32 ml_anch;
bd61b366
SS
376 register char *other_last = NULL; /* other substr checked before this */
377 char *check_at = NULL; /* check substr found at this pos */
1df70142 378 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 379#ifdef DEBUGGING
890ce7af
AL
380 const char * const i_strpos = strpos;
381 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 382#endif
a3621e74
YO
383
384 GET_RE_DEBUG_FLAGS_DECL;
385
a30b2f1f 386 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 387
b8d68ded 388 if (prog->reganch & ROPT_UTF8) {
a3621e74 389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
390 "UTF-8 regex...\n"));
391 PL_reg_flags |= RF_utf8;
392 }
393
a3621e74 394 DEBUG_EXECUTE_r({
1df70142 395 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
396 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
397 strpos;
1df70142 398 const int len = PL_reg_match_utf8 ?
b8d68ded 399 strlen(s) : strend - strpos;
2a782b5b
JH
400 if (!PL_colorset)
401 reginitcolors();
b8d68ded 402 if (PL_reg_match_utf8)
a3621e74 403 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 404 "UTF-8 target...\n"));
2a782b5b 405 PerlIO_printf(Perl_debug_log,
a0288114 406 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 407 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
408 prog->precomp,
409 PL_colors[1],
410 (strlen(prog->precomp) > 60 ? "..." : ""),
411 PL_colors[0],
412 (int)(len > 60 ? 60 : len),
413 s, PL_colors[1],
414 (len > 60 ? "..." : "")
415 );
416 });
cad2e5aa 417
c344f387
JH
418 /* CHR_DIST() would be more correct here but it makes things slow. */
419 if (prog->minlen > strend - strpos) {
a3621e74 420 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 421 "String too short... [re_intuit_start]\n"));
cad2e5aa 422 goto fail;
2c2d71f5 423 }
a1933d95 424 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 425 PL_regeol = strend;
33b8afdf
JH
426 if (do_utf8) {
427 if (!prog->check_utf8 && prog->check_substr)
428 to_utf8_substr(prog);
429 check = prog->check_utf8;
430 } else {
431 if (!prog->check_substr && prog->check_utf8)
432 to_byte_substr(prog);
433 check = prog->check_substr;
434 }
435 if (check == &PL_sv_undef) {
a3621e74 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
437 "Non-utf string cannot match utf check string\n"));
438 goto fail;
439 }
2c2d71f5 440 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
441 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
442 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 443 && !multiline ) ); /* Check after \n? */
cad2e5aa 444
7e25d62c
JH
445 if (!ml_anch) {
446 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
447 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 448 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
449 && sv && !SvROK(sv)
450 && (strpos != strbeg)) {
a3621e74 451 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
452 goto fail;
453 }
454 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 455 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 456 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
457 I32 slen;
458
1aa99e6b 459 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
460 if (SvTAIL(check)) {
461 slen = SvCUR(check); /* >= 1 */
cad2e5aa 462
9041c2e3 463 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 464 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 465 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 466 goto fail_finish;
cad2e5aa
JH
467 }
468 /* Now should match s[0..slen-2] */
469 slen--;
3f7c398e 470 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 471 || (slen > 1
3f7c398e 472 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 473 report_neq:
a3621e74 474 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
475 goto fail_finish;
476 }
cad2e5aa 477 }
3f7c398e 478 else if (*SvPVX_const(check) != *s
653099ff 479 || ((slen = SvCUR(check)) > 1
3f7c398e 480 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 481 goto report_neq;
c315bfe8 482 check_at = s;
2c2d71f5 483 goto success_at_start;
7e25d62c 484 }
cad2e5aa 485 }
2c2d71f5 486 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 487 s = strpos;
2c2d71f5 488 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 489 end_shift = prog->minlen - start_shift -
653099ff 490 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 491 if (!ml_anch) {
a3b680e6 492 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 493 - (SvTAIL(check) != 0);
a3b680e6 494 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
495
496 if (end_shift < eshift)
497 end_shift = eshift;
498 }
cad2e5aa 499 }
2c2d71f5 500 else { /* Can match at random position */
cad2e5aa
JH
501 ml_anch = 0;
502 s = strpos;
2c2d71f5
JH
503 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
504 /* Should be nonnegative! */
505 end_shift = prog->minlen - start_shift -
653099ff 506 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
507 }
508
2c2d71f5 509#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 510 if (end_shift < 0)
6bbae5e6 511 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
512#endif
513
2c2d71f5
JH
514 restart:
515 /* Find a possible match in the region s..strend by looking for
516 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 517 if (flags & REXEC_SCREAM) {
cad2e5aa 518 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 519 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 520
2c2d71f5
JH
521 if (PL_screamfirst[BmRARE(check)] >= 0
522 || ( BmRARE(check) == '\n'
523 && (BmPREVIOUS(check) == SvCUR(check) - 1)
524 && SvTAIL(check) ))
9041c2e3 525 s = screaminstr(sv, check,
2c2d71f5 526 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 527 else
2c2d71f5 528 goto fail_finish;
4addbd3b
HS
529 /* we may be pointing at the wrong string */
530 if (s && RX_MATCH_COPIED(prog))
3f7c398e 531 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
532 if (data)
533 *data->scream_olds = s;
534 }
f33976b4 535 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
536 s = fbm_instr((U8*)(s + start_shift),
537 (U8*)(strend - end_shift),
7fba1cd6 538 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 539 else
1aa99e6b
IH
540 s = fbm_instr(HOP3(s, start_shift, strend),
541 HOP3(strend, -end_shift, strbeg),
7fba1cd6 542 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
543
544 /* Update the count-of-usability, remove useless subpatterns,
545 unshift s. */
2c2d71f5 546
a0288114 547 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 548 (s ? "Found" : "Did not find"),
33b8afdf 549 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 550 PL_colors[0],
7b0972df 551 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 552 SvPVX_const(check),
2c2d71f5
JH
553 PL_colors[1], (SvTAIL(check) ? "$" : ""),
554 (s ? " at offset " : "...\n") ) );
555
556 if (!s)
557 goto fail_finish;
558
6eb5f6b9
JH
559 check_at = s;
560
2c2d71f5 561 /* Finish the diagnostic message */
a3621e74 562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
563
564 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
565 Start with the other substr.
566 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 567 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
568 *always* match. Probably should be marked during compile...
569 Probably it is right to do no SCREAM here...
570 */
571
33b8afdf 572 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 573 /* Take into account the "other" substring. */
2c2d71f5
JH
574 /* XXXX May be hopelessly wrong for UTF... */
575 if (!other_last)
6eb5f6b9 576 other_last = strpos;
33b8afdf 577 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
578 do_other_anchored:
579 {
890ce7af
AL
580 char * const last = HOP3c(s, -start_shift, strbeg);
581 char *last1, *last2;
2c2d71f5 582 char *s1 = s;
33b8afdf 583 SV* must;
2c2d71f5 584
2c2d71f5
JH
585 t = s - prog->check_offset_max;
586 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 587 && (!do_utf8
1aa99e6b 588 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 589 && t > strpos)))
30944b6d 590 /* EMPTY */;
2c2d71f5
JH
591 else
592 t = strpos;
1aa99e6b 593 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
594 if (t < other_last) /* These positions already checked */
595 t = other_last;
1aa99e6b 596 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
597 if (last < last1)
598 last1 = last;
599 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
600 /* On end-of-str: see comment below. */
33b8afdf
JH
601 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
602 if (must == &PL_sv_undef) {
603 s = (char*)NULL;
a3621e74 604 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
605 }
606 else
607 s = fbm_instr(
608 (unsigned char*)t,
609 HOP3(HOP3(last1, prog->anchored_offset, strend)
610 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
611 must,
7fba1cd6 612 multiline ? FBMrf_MULTILINE : 0
33b8afdf 613 );
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 615 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
616 (s ? "Found" : "Contradicts"),
617 PL_colors[0],
33b8afdf
JH
618 (int)(SvCUR(must)
619 - (SvTAIL(must)!=0)),
3f7c398e 620 SvPVX_const(must),
33b8afdf 621 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
622 if (!s) {
623 if (last1 >= last2) {
a3621e74 624 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
625 ", giving up...\n"));
626 goto fail_finish;
627 }
a3621e74 628 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 629 ", trying floating at offset %ld...\n",
1aa99e6b
IH
630 (long)(HOP3c(s1, 1, strend) - i_strpos)));
631 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
632 s = HOP3c(last, 1, strend);
2c2d71f5
JH
633 goto restart;
634 }
635 else {
a3621e74 636 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 637 (long)(s - i_strpos)));
1aa99e6b
IH
638 t = HOP3c(s, -prog->anchored_offset, strbeg);
639 other_last = HOP3c(s, 1, strend);
30944b6d 640 s = s1;
2c2d71f5
JH
641 if (t == strpos)
642 goto try_at_start;
2c2d71f5
JH
643 goto try_at_offset;
644 }
30944b6d 645 }
2c2d71f5
JH
646 }
647 else { /* Take into account the floating substring. */
33b8afdf
JH
648 char *last, *last1;
649 char *s1 = s;
650 SV* must;
651
652 t = HOP3c(s, -start_shift, strbeg);
653 last1 = last =
654 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
655 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
656 last = HOP3c(t, prog->float_max_offset, strend);
657 s = HOP3c(t, prog->float_min_offset, strend);
658 if (s < other_last)
659 s = other_last;
2c2d71f5 660 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
661 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
662 /* fbm_instr() takes into account exact value of end-of-str
663 if the check is SvTAIL(ed). Since false positives are OK,
664 and end-of-str is not later than strend we are OK. */
665 if (must == &PL_sv_undef) {
666 s = (char*)NULL;
a3621e74 667 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
668 }
669 else
2c2d71f5 670 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
671 (unsigned char*)last + SvCUR(must)
672 - (SvTAIL(must)!=0),
7fba1cd6 673 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
675 (s ? "Found" : "Contradicts"),
676 PL_colors[0],
677 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 678 SvPVX_const(must),
33b8afdf
JH
679 PL_colors[1], (SvTAIL(must) ? "$" : "")));
680 if (!s) {
681 if (last1 == last) {
a3621e74 682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
683 ", giving up...\n"));
684 goto fail_finish;
2c2d71f5 685 }
a3621e74 686 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
687 ", trying anchored starting at offset %ld...\n",
688 (long)(s1 + 1 - i_strpos)));
689 other_last = last;
690 s = HOP3c(t, 1, strend);
691 goto restart;
692 }
693 else {
a3621e74 694 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
695 (long)(s - i_strpos)));
696 other_last = s; /* Fix this later. --Hugo */
697 s = s1;
698 if (t == strpos)
699 goto try_at_start;
700 goto try_at_offset;
701 }
2c2d71f5 702 }
cad2e5aa 703 }
2c2d71f5
JH
704
705 t = s - prog->check_offset_max;
2c2d71f5 706 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 707 && (!do_utf8
1aa99e6b
IH
708 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
709 && t > strpos))) {
2c2d71f5
JH
710 /* Fixed substring is found far enough so that the match
711 cannot start at strpos. */
712 try_at_offset:
cad2e5aa 713 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
714 /* Eventually fbm_*() should handle this, but often
715 anchored_offset is not 0, so this check will not be wasted. */
716 /* XXXX In the code below we prefer to look for "^" even in
717 presence of anchored substrings. And we search even
718 beyond the found float position. These pessimizations
719 are historical artefacts only. */
720 find_anchor:
2c2d71f5 721 while (t < strend - prog->minlen) {
cad2e5aa 722 if (*t == '\n') {
4ee3650e 723 if (t < check_at - prog->check_offset_min) {
33b8afdf 724 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
725 /* Since we moved from the found position,
726 we definitely contradict the found anchored
30944b6d
IZ
727 substr. Due to the above check we do not
728 contradict "check" substr.
729 Thus we can arrive here only if check substr
730 is float. Redo checking for "other"=="fixed".
731 */
9041c2e3 732 strpos = t + 1;
a3621e74 733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 734 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
735 goto do_other_anchored;
736 }
4ee3650e
GS
737 /* We don't contradict the found floating substring. */
738 /* XXXX Why not check for STCLASS? */
cad2e5aa 739 s = t + 1;
a3621e74 740 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 741 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
742 goto set_useful;
743 }
4ee3650e
GS
744 /* Position contradicts check-string */
745 /* XXXX probably better to look for check-string
746 than for "\n", so one should lower the limit for t? */
a3621e74 747 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 748 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 749 other_last = strpos = s = t + 1;
cad2e5aa
JH
750 goto restart;
751 }
752 t++;
753 }
a3621e74 754 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 755 PL_colors[0], PL_colors[1]));
2c2d71f5 756 goto fail_finish;
cad2e5aa 757 }
f5952150 758 else {
a3621e74 759 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 760 PL_colors[0], PL_colors[1]));
f5952150 761 }
cad2e5aa
JH
762 s = t;
763 set_useful:
33b8afdf 764 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
765 }
766 else {
f5952150 767 /* The found string does not prohibit matching at strpos,
2c2d71f5 768 - no optimization of calling REx engine can be performed,
f5952150
GS
769 unless it was an MBOL and we are not after MBOL,
770 or a future STCLASS check will fail this. */
2c2d71f5
JH
771 try_at_start:
772 /* Even in this situation we may use MBOL flag if strpos is offset
773 wrt the start of the string. */
05b4157f 774 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 775 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
776 /* May be due to an implicit anchor of m{.*foo} */
777 && !(prog->reganch & ROPT_IMPLICIT))
778 {
cad2e5aa
JH
779 t = strpos;
780 goto find_anchor;
781 }
a3621e74 782 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 783 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 784 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 785 );
2c2d71f5 786 success_at_start:
30944b6d 787 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
788 && (do_utf8 ? (
789 prog->check_utf8 /* Could be deleted already */
790 && --BmUSEFUL(prog->check_utf8) < 0
791 && (prog->check_utf8 == prog->float_utf8)
792 ) : (
793 prog->check_substr /* Could be deleted already */
794 && --BmUSEFUL(prog->check_substr) < 0
795 && (prog->check_substr == prog->float_substr)
796 )))
66e933ab 797 {
cad2e5aa 798 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
800 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
801 if (do_utf8 ? prog->check_substr : prog->check_utf8)
802 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
803 prog->check_substr = prog->check_utf8 = NULL; /* disable */
804 prog->float_substr = prog->float_utf8 = NULL; /* clear */
805 check = NULL; /* abort */
cad2e5aa 806 s = strpos;
3cf5c195
IZ
807 /* XXXX This is a remnant of the old implementation. It
808 looks wasteful, since now INTUIT can use many
6eb5f6b9 809 other heuristics. */
cad2e5aa
JH
810 prog->reganch &= ~RE_USE_INTUIT;
811 }
812 else
813 s = strpos;
814 }
815
6eb5f6b9
JH
816 /* Last resort... */
817 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
818 if (prog->regstclass) {
819 /* minlen == 0 is possible if regstclass is \b or \B,
820 and the fixed substr is ''$.
821 Since minlen is already taken into account, s+1 is before strend;
822 accidentally, minlen >= 1 guaranties no false positives at s + 1
823 even for \b or \B. But (minlen? 1 : 0) below assumes that
824 regstclass does not come from lookahead... */
825 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
826 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 827 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 828 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 829 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 830 : 1);
a3b680e6 831 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 832 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 833 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
834 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
835 cl_l, strend)
836 : strend);
6eb5f6b9
JH
837
838 t = s;
3b0527fe 839 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
840 if (!s) {
841#ifdef DEBUGGING
cbbf8932 842 const char *what = NULL;
6eb5f6b9
JH
843#endif
844 if (endpos == strend) {
a3621e74 845 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
846 "Could not match STCLASS...\n") );
847 goto fail;
848 }
a3621e74 849 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 850 "This position contradicts STCLASS...\n") );
653099ff
GS
851 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
852 goto fail;
6eb5f6b9 853 /* Contradict one of substrings */
33b8afdf
JH
854 if (prog->anchored_substr || prog->anchored_utf8) {
855 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 856 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 857 hop_and_restart:
1aa99e6b 858 s = HOP3c(t, 1, strend);
66e933ab
GS
859 if (s + start_shift + end_shift > strend) {
860 /* XXXX Should be taken into account earlier? */
a3621e74 861 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
862 "Could not match STCLASS...\n") );
863 goto fail;
864 }
5e39e1e5
HS
865 if (!check)
866 goto giveup;
a3621e74 867 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 868 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
869 what, (long)(s + start_shift - i_strpos)) );
870 goto restart;
871 }
66e933ab 872 /* Have both, check_string is floating */
6eb5f6b9
JH
873 if (t + start_shift >= check_at) /* Contradicts floating=check */
874 goto retry_floating_check;
875 /* Recheck anchored substring, but not floating... */
9041c2e3 876 s = check_at;
5e39e1e5
HS
877 if (!check)
878 goto giveup;
a3621e74 879 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 880 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
881 (long)(other_last - i_strpos)) );
882 goto do_other_anchored;
883 }
60e71179
GS
884 /* Another way we could have checked stclass at the
885 current position only: */
886 if (ml_anch) {
887 s = t = t + 1;
5e39e1e5
HS
888 if (!check)
889 goto giveup;
a3621e74 890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 891 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 892 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 893 goto try_at_offset;
66e933ab 894 }
33b8afdf 895 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 896 goto fail;
6eb5f6b9
JH
897 /* Check is floating subtring. */
898 retry_floating_check:
899 t = check_at - start_shift;
a3621e74 900 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
901 goto hop_and_restart;
902 }
b7953727 903 if (t != s) {
a3621e74 904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 905 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
906 (long)(t - i_strpos), (long)(s - i_strpos))
907 );
908 }
909 else {
a3621e74 910 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
911 "Does not contradict STCLASS...\n");
912 );
913 }
6eb5f6b9 914 }
5e39e1e5 915 giveup:
a3621e74 916 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
917 PL_colors[4], (check ? "Guessed" : "Giving up"),
918 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 919 return s;
2c2d71f5
JH
920
921 fail_finish: /* Substring not found */
33b8afdf
JH
922 if (prog->check_substr || prog->check_utf8) /* could be removed already */
923 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 924 fail:
a3621e74 925 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 926 PL_colors[4], PL_colors[5]));
bd61b366 927 return NULL;
cad2e5aa 928}
9661b544 929
6eb5f6b9 930/* We know what class REx starts with. Try to find this position... */
3b0527fe
DM
931/* if reginfo is NULL, its a dryrun */
932
3c3eec57 933STATIC char *
3b0527fe
DM
934S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char
935*strend, const regmatch_info *reginfo)
a687059c 936{
27da23d5 937 dVAR;
1df70142 938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 939 char *m;
d8093b23 940 STRLEN ln;
5dab1207 941 STRLEN lnc;
078c425b 942 register STRLEN uskip;
d8093b23 943 unsigned int c1;
944 unsigned int c2;
6eb5f6b9
JH
945 char *e;
946 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 947 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 948
6eb5f6b9
JH
949 /* We know what class it must start with. */
950 switch (OP(c)) {
6eb5f6b9 951 case ANYOF:
388cc4de 952 if (do_utf8) {
078c425b 953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 958 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
959 goto got_it;
960 else
961 tmp = doevery;
962 }
963 else
964 tmp = 1;
078c425b 965 s += uskip;
388cc4de
HS
966 }
967 }
968 else {
969 while (s < strend) {
970 STRLEN skip = 1;
971
32fc9b6a 972 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
3b0527fe 977 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
978 goto got_it;
979 else
980 tmp = doevery;
981 }
982 else
983 tmp = 1;
984 s += skip;
985 }
a0d0e21e 986 }
6eb5f6b9 987 break;
f33976b4
DB
988 case CANY:
989 while (s < strend) {
3b0527fe 990 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
991 goto got_it;
992 else
993 tmp = doevery;
994 s++;
995 }
996 break;
6eb5f6b9 997 case EXACTF:
5dab1207
NIS
998 m = STRING(c);
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1001 if (UTF) {
a2a2844f 1002 STRLEN ulen1, ulen2;
5dab1207 1003 U8 *sm = (U8 *) m;
89ebb4a3
JH
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1007
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1010
89ebb4a3 1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1012 0, uniflags);
89ebb4a3 1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1014 0, uniflags);
5dab1207
NIS
1015 lnc = 0;
1016 while (sm < ((U8 *) m + ln)) {
1017 lnc++;
1018 sm += UTF8SKIP(sm);
1019 }
1aa99e6b
IH
1020 }
1021 else {
1022 c1 = *(U8*)m;
1023 c2 = PL_fold[c1];
1024 }
6eb5f6b9
JH
1025 goto do_exactf;
1026 case EXACTFL:
5dab1207
NIS
1027 m = STRING(c);
1028 ln = STR_LEN(c);
1029 lnc = (I32) ln;
d8093b23 1030 c1 = *(U8*)m;
6eb5f6b9
JH
1031 c2 = PL_fold_locale[c1];
1032 do_exactf:
db12adc6 1033 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1034
3b0527fe 1035 if (!reginfo && e < s)
6eb5f6b9 1036 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1037
60a8b682
JH
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
7f16dd3d
JH
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
60a8b682 1048
1aa99e6b 1049 if (do_utf8) {
575cac57 1050 UV c, f;
89ebb4a3 1051 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1052 STRLEN len, foldlen;
4ad0818d 1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1054 if (c1 == c2) {
5dab1207
NIS
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1aa99e6b 1057 while (s <= e) {
89ebb4a3 1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1059 uniflags);
80aecb99
JH
1060 if ( c == c1
1061 && (ln == len ||
66423254 1062 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1063 m, (char **)0, ln, (bool)UTF))
3b0527fe 1064 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1065 goto got_it;
80aecb99 1066 else {
1df70142 1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1068 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1070 if ( f != c
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
66423254
JH
1073 !ibcmp_utf8((char *) foldbuf,
1074 (char **)0, foldlen, do_utf8,
d07ddd77 1075 m,
eb160463 1076 (char **)0, ln, (bool)UTF))
3b0527fe 1077 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1078 goto got_it;
1079 }
1aa99e6b
IH
1080 s += len;
1081 }
09091399
JH
1082 }
1083 else {
1aa99e6b 1084 while (s <= e) {
89ebb4a3 1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1086 uniflags);
80aecb99 1087
60a8b682 1088 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
880bd946
JH
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1098
1099 if ( (c == c1 || c == c2)
1100 && (ln == len ||
66423254 1101 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1102 m, (char **)0, ln, (bool)UTF))
3b0527fe 1103 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1104 goto got_it;
80aecb99 1105 else {
1df70142 1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1107 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1109 if ( f != c
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
a6872d42 1112 !ibcmp_utf8((char *) foldbuf,
66423254 1113 (char **)0, foldlen, do_utf8,
d07ddd77 1114 m,
eb160463 1115 (char **)0, ln, (bool)UTF))
3b0527fe 1116 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1117 goto got_it;
1118 }
1aa99e6b
IH
1119 s += len;
1120 }
09091399 1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 if (c1 == c2)
1125 while (s <= e) {
1126 if ( *(U8*)s == c1
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
3b0527fe 1130 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1131 goto got_it;
1132 s++;
1133 }
1134 else
1135 while (s <= e) {
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1138 ? ibcmp(s, m, ln)
1139 : ibcmp_locale(s, m, ln)))
3b0527fe 1140 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1141 goto got_it;
1142 s++;
1143 }
b3c9acc1
IZ
1144 }
1145 break;
bbce6d69 1146 case BOUNDL:
3280af22 1147 PL_reg_flags |= RF_tainted;
bbce6d69 1148 /* FALL THROUGH */
a0d0e21e 1149 case BOUND:
ffc61ed2 1150 if (do_utf8) {
12d33761 1151 if (s == PL_bostr)
ffc61ed2
JH
1152 tmp = '\n';
1153 else {
6136c704 1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1156 }
1157 tmp = ((OP(c) == BOUND ?
9041c2e3 1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1159 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1161 if (tmp == !(OP(c) == BOUND ?
3568d838 1162 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1163 isALNUM_LC_utf8((U8*)s)))
1164 {
1165 tmp = !tmp;
3b0527fe 1166 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1167 goto got_it;
1168 }
078c425b 1169 s += uskip;
a687059c 1170 }
a0d0e21e 1171 }
667bb95a 1172 else {
12d33761 1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1176 if (tmp ==
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1178 tmp = !tmp;
3b0527fe 1179 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1180 goto got_it;
1181 }
1182 s++;
a0ed51b3 1183 }
a0ed51b3 1184 }
3b0527fe 1185 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1186 goto got_it;
1187 break;
bbce6d69 1188 case NBOUNDL:
3280af22 1189 PL_reg_flags |= RF_tainted;
bbce6d69 1190 /* FALL THROUGH */
a0d0e21e 1191 case NBOUND:
ffc61ed2 1192 if (do_utf8) {
12d33761 1193 if (s == PL_bostr)
ffc61ed2
JH
1194 tmp = '\n';
1195 else {
6136c704 1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1198 }
1199 tmp = ((OP(c) == NBOUND ?
9041c2e3 1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1201 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1203 if (tmp == !(OP(c) == NBOUND ?
3568d838 1204 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1205 isALNUM_LC_utf8((U8*)s)))
1206 tmp = !tmp;
3b0527fe 1207 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1208 goto got_it;
078c425b 1209 s += uskip;
ffc61ed2 1210 }
a0d0e21e 1211 }
667bb95a 1212 else {
12d33761 1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1217 if (tmp ==
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1219 tmp = !tmp;
3b0527fe 1220 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1221 goto got_it;
1222 s++;
1223 }
a0ed51b3 1224 }
3b0527fe 1225 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1226 goto got_it;
1227 break;
a0d0e21e 1228 case ALNUM:
ffc61ed2 1229 if (do_utf8) {
1a4fad37 1230 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1233 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
bbce6d69 1238 else
ffc61ed2 1239 tmp = 1;
078c425b 1240 s += uskip;
bbce6d69 1241 }
bbce6d69 1242 }
ffc61ed2
JH
1243 else {
1244 while (s < strend) {
1245 if (isALNUM(*s)) {
3b0527fe 1246 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1247 goto got_it;
1248 else
1249 tmp = doevery;
1250 }
a0ed51b3 1251 else
ffc61ed2
JH
1252 tmp = 1;
1253 s++;
a0ed51b3 1254 }
a0ed51b3
LW
1255 }
1256 break;
bbce6d69 1257 case ALNUML:
3280af22 1258 PL_reg_flags |= RF_tainted;
ffc61ed2 1259 if (do_utf8) {
078c425b 1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1261 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1262 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a687059c 1267 else
ffc61ed2 1268 tmp = 1;
078c425b 1269 s += uskip;
a0d0e21e 1270 }
a0d0e21e 1271 }
ffc61ed2
JH
1272 else {
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
3b0527fe 1275 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1276 goto got_it;
1277 else
1278 tmp = doevery;
1279 }
a0ed51b3 1280 else
ffc61ed2
JH
1281 tmp = 1;
1282 s++;
a0ed51b3 1283 }
a0ed51b3
LW
1284 }
1285 break;
a0d0e21e 1286 case NALNUM:
ffc61ed2 1287 if (do_utf8) {
1a4fad37 1288 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1291 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
bbce6d69 1296 else
ffc61ed2 1297 tmp = 1;
078c425b 1298 s += uskip;
bbce6d69 1299 }
bbce6d69 1300 }
ffc61ed2
JH
1301 else {
1302 while (s < strend) {
1303 if (!isALNUM(*s)) {
3b0527fe 1304 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1305 goto got_it;
1306 else
1307 tmp = doevery;
1308 }
a0ed51b3 1309 else
ffc61ed2
JH
1310 tmp = 1;
1311 s++;
a0ed51b3 1312 }
a0ed51b3
LW
1313 }
1314 break;
bbce6d69 1315 case NALNUML:
3280af22 1316 PL_reg_flags |= RF_tainted;
ffc61ed2 1317 if (do_utf8) {
078c425b 1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1319 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1320 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a687059c 1325 else
ffc61ed2 1326 tmp = 1;
078c425b 1327 s += uskip;
a687059c 1328 }
a0d0e21e 1329 }
ffc61ed2
JH
1330 else {
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
3b0527fe 1333 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1334 goto got_it;
1335 else
1336 tmp = doevery;
1337 }
a0ed51b3 1338 else
ffc61ed2
JH
1339 tmp = 1;
1340 s++;
a0ed51b3 1341 }
a0ed51b3
LW
1342 }
1343 break;
a0d0e21e 1344 case SPACE:
ffc61ed2 1345 if (do_utf8) {
1a4fad37 1346 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1349 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0d0e21e 1354 else
ffc61ed2 1355 tmp = 1;
078c425b 1356 s += uskip;
2304df62 1357 }
a0d0e21e 1358 }
ffc61ed2
JH
1359 else {
1360 while (s < strend) {
1361 if (isSPACE(*s)) {
3b0527fe 1362 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1363 goto got_it;
1364 else
1365 tmp = doevery;
1366 }
a0ed51b3 1367 else
ffc61ed2
JH
1368 tmp = 1;
1369 s++;
a0ed51b3 1370 }
a0ed51b3
LW
1371 }
1372 break;
bbce6d69 1373 case SPACEL:
3280af22 1374 PL_reg_flags |= RF_tainted;
ffc61ed2 1375 if (do_utf8) {
078c425b 1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1378 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
bbce6d69 1383 else
ffc61ed2 1384 tmp = 1;
078c425b 1385 s += uskip;
bbce6d69 1386 }
bbce6d69 1387 }
ffc61ed2
JH
1388 else {
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
3b0527fe 1391 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1392 goto got_it;
1393 else
1394 tmp = doevery;
1395 }
a0ed51b3 1396 else
ffc61ed2
JH
1397 tmp = 1;
1398 s++;
a0ed51b3 1399 }
a0ed51b3
LW
1400 }
1401 break;
a0d0e21e 1402 case NSPACE:
ffc61ed2 1403 if (do_utf8) {
1a4fad37 1404 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1407 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0d0e21e 1412 else
ffc61ed2 1413 tmp = 1;
078c425b 1414 s += uskip;
a687059c 1415 }
a0d0e21e 1416 }
ffc61ed2
JH
1417 else {
1418 while (s < strend) {
1419 if (!isSPACE(*s)) {
3b0527fe 1420 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1421 goto got_it;
1422 else
1423 tmp = doevery;
1424 }
a0ed51b3 1425 else
ffc61ed2
JH
1426 tmp = 1;
1427 s++;
a0ed51b3 1428 }
a0ed51b3
LW
1429 }
1430 break;
bbce6d69 1431 case NSPACEL:
3280af22 1432 PL_reg_flags |= RF_tainted;
ffc61ed2 1433 if (do_utf8) {
078c425b 1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1436 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
bbce6d69 1441 else
ffc61ed2 1442 tmp = 1;
078c425b 1443 s += uskip;
bbce6d69 1444 }
bbce6d69 1445 }
ffc61ed2
JH
1446 else {
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
3b0527fe 1449 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1450 goto got_it;
1451 else
1452 tmp = doevery;
1453 }
a0ed51b3 1454 else
ffc61ed2
JH
1455 tmp = 1;
1456 s++;
a0ed51b3 1457 }
a0ed51b3
LW
1458 }
1459 break;
a0d0e21e 1460 case DIGIT:
ffc61ed2 1461 if (do_utf8) {
1a4fad37 1462 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1465 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0d0e21e 1470 else
ffc61ed2 1471 tmp = 1;
078c425b 1472 s += uskip;
2b69d0c2 1473 }
a0d0e21e 1474 }
ffc61ed2
JH
1475 else {
1476 while (s < strend) {
1477 if (isDIGIT(*s)) {
3b0527fe 1478 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1479 goto got_it;
1480 else
1481 tmp = doevery;
1482 }
a0ed51b3 1483 else
ffc61ed2
JH
1484 tmp = 1;
1485 s++;
a0ed51b3 1486 }
a0ed51b3
LW
1487 }
1488 break;
b8c5462f
JH
1489 case DIGITL:
1490 PL_reg_flags |= RF_tainted;
ffc61ed2 1491 if (do_utf8) {
078c425b 1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1493 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1494 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2 1500 tmp = 1;
078c425b 1501 s += uskip;
b8c5462f 1502 }
b8c5462f 1503 }
ffc61ed2
JH
1504 else {
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
3b0527fe 1507 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1508 goto got_it;
1509 else
1510 tmp = doevery;
1511 }
b8c5462f 1512 else
ffc61ed2
JH
1513 tmp = 1;
1514 s++;
b8c5462f 1515 }
b8c5462f
JH
1516 }
1517 break;
a0d0e21e 1518 case NDIGIT:
ffc61ed2 1519 if (do_utf8) {
1a4fad37 1520 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1523 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0d0e21e 1528 else
ffc61ed2 1529 tmp = 1;
078c425b 1530 s += uskip;
a687059c 1531 }
a0d0e21e 1532 }
ffc61ed2
JH
1533 else {
1534 while (s < strend) {
1535 if (!isDIGIT(*s)) {
3b0527fe 1536 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1537 goto got_it;
1538 else
1539 tmp = doevery;
1540 }
a0ed51b3 1541 else
ffc61ed2
JH
1542 tmp = 1;
1543 s++;
a0ed51b3 1544 }
a0ed51b3
LW
1545 }
1546 break;
b8c5462f
JH
1547 case NDIGITL:
1548 PL_reg_flags |= RF_tainted;
ffc61ed2 1549 if (do_utf8) {
078c425b 1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1551 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1552 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
b8c5462f 1557 else
ffc61ed2 1558 tmp = 1;
078c425b 1559 s += uskip;
b8c5462f 1560 }
a0ed51b3 1561 }
ffc61ed2
JH
1562 else {
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
3b0527fe 1565 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1566 goto got_it;
1567 else
1568 tmp = doevery;
1569 }
cf93c79d 1570 else
ffc61ed2
JH
1571 tmp = 1;
1572 s++;
b8c5462f 1573 }
b8c5462f
JH
1574 }
1575 break;
b3c9acc1 1576 default:
3c3eec57
GS
1577 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1578 break;
d6a28714 1579 }
6eb5f6b9
JH
1580 return 0;
1581 got_it:
1582 return s;
1583}
1584
1585/*
1586 - regexec_flags - match a regexp against a string
1587 */
1588I32
1589Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1590 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1591/* strend: pointer to null at end of string */
1592/* strbeg: real beginning of string */
1593/* minend: end of match must be >=minend after stringarg. */
1594/* data: May be used for some additional optimizations. */
1595/* nosave: For optimizations. */
1596{
97aff369 1597 dVAR;
6eb5f6b9
JH
1598 register char *s;
1599 register regnode *c;
1600 register char *startpos = stringarg;
6eb5f6b9
JH
1601 I32 minlen; /* must match at least this many chars */
1602 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1603 I32 end_shift = 0; /* Same for the end. */ /* CC */
1604 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1605 char *scream_olds = NULL;
6eb5f6b9 1606 SV* oreplsv = GvSV(PL_replgv);
1df70142 1607 const bool do_utf8 = DO_UTF8(sv);
2757e526 1608 I32 multiline;
2a782b5b 1609#ifdef DEBUGGING
2757e526
JH
1610 SV* dsv0;
1611 SV* dsv1;
2a782b5b 1612#endif
3b0527fe 1613 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1614
1615 GET_RE_DEBUG_FLAGS_DECL;
1616
9d4ba2ae 1617 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1618
1619 /* Be paranoid... */
1620 if (prog == NULL || startpos == NULL) {
1621 Perl_croak(aTHX_ "NULL regexp parameter");
1622 return 0;
1623 }
1624
2757e526 1625 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1626 reginfo.prog = prog;
2757e526
JH
1627
1628#ifdef DEBUGGING
1629 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1630 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1631#endif
1632
bac06658
JH
1633 RX_MATCH_UTF8_set(prog, do_utf8);
1634
6eb5f6b9 1635 minlen = prog->minlen;
61a36c01 1636 if (strend - startpos < minlen) {
a3621e74 1637 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1638 "String too short [regexec_flags]...\n"));
1639 goto phooey;
1aa99e6b 1640 }
6eb5f6b9 1641
6eb5f6b9
JH
1642 /* Check validity of program. */
1643 if (UCHARAT(prog->program) != REG_MAGIC) {
1644 Perl_croak(aTHX_ "corrupted regexp program");
1645 }
1646
1647 PL_reg_flags = 0;
1648 PL_reg_eval_set = 0;
1649 PL_reg_maxiter = 0;
1650
1651 if (prog->reganch & ROPT_UTF8)
1652 PL_reg_flags |= RF_utf8;
1653
1654 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1655 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1656 PL_bostr = strbeg;
3b0527fe 1657 reginfo.sv = sv;
6eb5f6b9
JH
1658
1659 /* Mark end of line for $ (and such) */
1660 PL_regeol = strend;
1661
1662 /* see how far we have to get to not match where we matched before */
3b0527fe 1663 reginfo.till = startpos+minend;
6eb5f6b9 1664
6eb5f6b9
JH
1665 /* If there is a "must appear" string, look for it. */
1666 s = startpos;
1667
3b0527fe 1668 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1669 MAGIC *mg;
1670
1671 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1672 reginfo.ganch = startpos;
6eb5f6b9
JH
1673 else if (sv && SvTYPE(sv) >= SVt_PVMG
1674 && SvMAGIC(sv)
14befaf4
DM
1675 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1676 && mg->mg_len >= 0) {
3b0527fe 1677 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1678 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1679 if (s > reginfo.ganch)
6eb5f6b9 1680 goto phooey;
3b0527fe 1681 s = reginfo.ganch;
6eb5f6b9
JH
1682 }
1683 }
1684 else /* pos() not defined */
3b0527fe 1685 reginfo.ganch = strbeg;
6eb5f6b9
JH
1686 }
1687
a0714e2c 1688 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1689 re_scream_pos_data d;
1690
1691 d.scream_olds = &scream_olds;
1692 d.scream_pos = &scream_pos;
1693 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1694 if (!s) {
a3621e74 1695 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1696 goto phooey; /* not present */
3fa9c3d7 1697 }
6eb5f6b9
JH
1698 }
1699
a3621e74 1700 DEBUG_EXECUTE_r({
1df70142
AL
1701 const char * const s0 = UTF
1702 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1703 UNI_DISPLAY_REGEX)
1704 : prog->precomp;
1705 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1706 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1707 UNI_DISPLAY_REGEX) : startpos;
1df70142 1708 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1709 if (!PL_colorset)
1710 reginitcolors();
1711 PerlIO_printf(Perl_debug_log,
a0288114 1712 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1713 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1714 len0, len0, s0,
2a782b5b 1715 PL_colors[1],
9e55ce06 1716 len0 > 60 ? "..." : "",
2a782b5b 1717 PL_colors[0],
9e55ce06
JH
1718 (int)(len1 > 60 ? 60 : len1),
1719 s1, PL_colors[1],
1720 (len1 > 60 ? "..." : "")
2a782b5b
JH
1721 );
1722 });
6eb5f6b9
JH
1723
1724 /* Simplest case: anchored match need be tried only once. */
1725 /* [unless only anchor is BOL and multiline is set] */
1726 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1727 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1728 goto got_it;
7fba1cd6 1729 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1730 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1731 {
1732 char *end;
1733
1734 if (minlen)
1735 dontbother = minlen - 1;
1aa99e6b 1736 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1737 /* for multiline we only have to try after newlines */
33b8afdf 1738 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1739 if (s == startpos)
1740 goto after_try;
1741 while (1) {
3b0527fe 1742 if (regtry(&reginfo, s))
6eb5f6b9
JH
1743 goto got_it;
1744 after_try:
1745 if (s >= end)
1746 goto phooey;
1747 if (prog->reganch & RE_USE_INTUIT) {
1748 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1749 if (!s)
1750 goto phooey;
1751 }
1752 else
1753 s++;
1754 }
1755 } else {
1756 if (s > startpos)
1757 s--;
1758 while (s < end) {
1759 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1760 if (regtry(&reginfo, s))
6eb5f6b9
JH
1761 goto got_it;
1762 }
1763 }
1764 }
1765 }
1766 goto phooey;
1767 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1768 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1769 goto got_it;
1770 goto phooey;
1771 }
1772
1773 /* Messy cases: unanchored match. */
33b8afdf 1774 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1775 /* we have /x+whatever/ */
1776 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1777 char ch;
bf93d4cc
GS
1778#ifdef DEBUGGING
1779 int did_match = 0;
1780#endif
33b8afdf
JH
1781 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1782 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1783 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1784
1aa99e6b 1785 if (do_utf8) {
6eb5f6b9
JH
1786 while (s < strend) {
1787 if (*s == ch) {
a3621e74 1788 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1789 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1790 s += UTF8SKIP(s);
1791 while (s < strend && *s == ch)
1792 s += UTF8SKIP(s);
1793 }
1794 s += UTF8SKIP(s);
1795 }
1796 }
1797 else {
1798 while (s < strend) {
1799 if (*s == ch) {
a3621e74 1800 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1801 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1802 s++;
1803 while (s < strend && *s == ch)
1804 s++;
1805 }
1806 s++;
1807 }
1808 }
a3621e74 1809 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1810 PerlIO_printf(Perl_debug_log,
b7953727
JH
1811 "Did not find anchored character...\n")
1812 );
6eb5f6b9 1813 }
a0714e2c
SS
1814 else if (prog->anchored_substr != NULL
1815 || prog->anchored_utf8 != NULL
1816 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1817 && prog->float_max_offset < strend - s)) {
1818 SV *must;
1819 I32 back_max;
1820 I32 back_min;
1821 char *last;
6eb5f6b9 1822 char *last1; /* Last position checked before */
bf93d4cc
GS
1823#ifdef DEBUGGING
1824 int did_match = 0;
1825#endif
33b8afdf
JH
1826 if (prog->anchored_substr || prog->anchored_utf8) {
1827 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1828 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1829 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1830 back_max = back_min = prog->anchored_offset;
1831 } else {
1832 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1833 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1834 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1835 back_max = prog->float_max_offset;
1836 back_min = prog->float_min_offset;
1837 }
1838 if (must == &PL_sv_undef)
1839 /* could not downgrade utf8 check substring, so must fail */
1840 goto phooey;
1841
1842 last = HOP3c(strend, /* Cannot start after this */
1843 -(I32)(CHR_SVLEN(must)
1844 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1845
1846 if (s > PL_bostr)
1847 last1 = HOPc(s, -1);
1848 else
1849 last1 = s - 1; /* bogus */
1850
a0288114 1851 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1852 check_substr==must. */
1853 scream_pos = -1;
1854 dontbother = end_shift;
1855 strend = HOPc(strend, -dontbother);
1856 while ( (s <= last) &&
9041c2e3 1857 ((flags & REXEC_SCREAM)
1aa99e6b 1858 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1859 end_shift, &scream_pos, 0))
1aa99e6b 1860 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1861 (unsigned char*)strend, must,
7fba1cd6 1862 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1863 /* we may be pointing at the wrong string */
1864 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1865 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1866 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1867 if (HOPc(s, -back_max) > last1) {
1868 last1 = HOPc(s, -back_min);
1869 s = HOPc(s, -back_max);
1870 }
1871 else {
52657f30 1872 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1873
1874 last1 = HOPc(s, -back_min);
52657f30 1875 s = t;
6eb5f6b9 1876 }
1aa99e6b 1877 if (do_utf8) {
6eb5f6b9 1878 while (s <= last1) {
3b0527fe 1879 if (regtry(&reginfo, s))
6eb5f6b9
JH
1880 goto got_it;
1881 s += UTF8SKIP(s);
1882 }
1883 }
1884 else {
1885 while (s <= last1) {
3b0527fe 1886 if (regtry(&reginfo, s))
6eb5f6b9
JH
1887 goto got_it;
1888 s++;
1889 }
1890 }
1891 }
a3621e74 1892 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1893 PerlIO_printf(Perl_debug_log,
a0288114 1894 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1895 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1896 ? "anchored" : "floating"),
1897 PL_colors[0],
1898 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1899 SvPVX_const(must),
b7953727
JH
1900 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1901 );
6eb5f6b9
JH
1902 goto phooey;
1903 }
155aba94 1904 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1905 if (minlen) {
1906 I32 op = (U8)OP(prog->regstclass);
66e933ab 1907 /* don't bother with what can't match */
f14c76ed
RGS
1908 if (PL_regkind[op] != EXACT && op != CANY)
1909 strend = HOPc(strend, -(minlen - 1));
1910 }
a3621e74 1911 DEBUG_EXECUTE_r({
ffc61ed2 1912 SV *prop = sv_newmortal();
cfd0369c
NC
1913 const char *s0;
1914 const char *s1;
9e55ce06
JH
1915 int len0;
1916 int len1;
1917
32fc9b6a 1918 regprop(prog, prop, c);
9e55ce06 1919 s0 = UTF ?
3f7c398e 1920 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1921 UNI_DISPLAY_REGEX) :
cfd0369c 1922 SvPVX_const(prop);
9e55ce06
JH
1923 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1924 s1 = UTF ?
c728cb41 1925 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1926 len1 = UTF ? SvCUR(dsv1) : strend - s;
1927 PerlIO_printf(Perl_debug_log,
a0288114 1928 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1929 len0, len0, s0,
1930 len1, len1, s1);
ffc61ed2 1931 });
3b0527fe 1932 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1933 goto got_it;
a3621e74 1934 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1935 }
1936 else {
1937 dontbother = 0;
a0714e2c 1938 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1939 /* Trim the end. */
d6a28714 1940 char *last;
33b8afdf
JH
1941 SV* float_real;
1942
1943 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1944 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1945 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1946
1947 if (flags & REXEC_SCREAM) {
33b8afdf 1948 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1949 end_shift, &scream_pos, 1); /* last one */
1950 if (!last)
ffc61ed2 1951 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1952 /* we may be pointing at the wrong string */
1953 else if (RX_MATCH_COPIED(prog))
3f7c398e 1954 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1955 }
d6a28714
JH
1956 else {
1957 STRLEN len;
cfd0369c 1958 const char * const little = SvPV_const(float_real, len);
d6a28714 1959
33b8afdf 1960 if (SvTAIL(float_real)) {
d6a28714
JH
1961 if (memEQ(strend - len + 1, little, len - 1))
1962 last = strend - len + 1;
7fba1cd6 1963 else if (!multiline)
9041c2e3 1964 last = memEQ(strend - len, little, len)
bd61b366 1965 ? strend - len : NULL;
b8c5462f 1966 else
d6a28714
JH
1967 goto find_last;
1968 } else {
1969 find_last:
9041c2e3 1970 if (len)
d6a28714 1971 last = rninstr(s, strend, little, little + len);
b8c5462f 1972 else
a0288114 1973 last = strend; /* matching "$" */
b8c5462f 1974 }
b8c5462f 1975 }
bf93d4cc 1976 if (last == NULL) {
a3621e74 1977 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1978 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1979 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1980 goto phooey; /* Should not happen! */
1981 }
d6a28714
JH
1982 dontbother = strend - last + prog->float_min_offset;
1983 }
1984 if (minlen && (dontbother < minlen))
1985 dontbother = minlen - 1;
1986 strend -= dontbother; /* this one's always in bytes! */
1987 /* We don't know much -- general case. */
1aa99e6b 1988 if (do_utf8) {
d6a28714 1989 for (;;) {
3b0527fe 1990 if (regtry(&reginfo, s))
d6a28714
JH
1991 goto got_it;
1992 if (s >= strend)
1993 break;
b8c5462f 1994 s += UTF8SKIP(s);
d6a28714
JH
1995 };
1996 }
1997 else {
1998 do {
3b0527fe 1999 if (regtry(&reginfo, s))
d6a28714
JH
2000 goto got_it;
2001 } while (s++ < strend);
2002 }
2003 }
2004
2005 /* Failure. */
2006 goto phooey;
2007
2008got_it:
2009 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2010
2011 if (PL_reg_eval_set) {
2012 /* Preserve the current value of $^R */
2013 if (oreplsv != GvSV(PL_replgv))
2014 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2015 restored, the value remains
2016 the same. */
4f639d21 2017 restore_pos(aTHX_ prog);
d6a28714
JH
2018 }
2019
2020 /* make sure $`, $&, $', and $digit will work later */
2021 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2022 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2023 if (flags & REXEC_COPY_STR) {
2024 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2025#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2026 if ((SvIsCOW(sv)
2027 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2028 if (DEBUG_C_TEST) {
2029 PerlIO_printf(Perl_debug_log,
2030 "Copy on write: regexp capture, type %d\n",
2031 (int) SvTYPE(sv));
2032 }
2033 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2034 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2035 assert (SvPOKp(prog->saved_copy));
2036 } else
2037#endif
2038 {
2039 RX_MATCH_COPIED_on(prog);
2040 s = savepvn(strbeg, i);
2041 prog->subbeg = s;
2042 }
d6a28714 2043 prog->sublen = i;
d6a28714
JH
2044 }
2045 else {
2046 prog->subbeg = strbeg;
2047 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2048 }
2049 }
9041c2e3 2050
d6a28714
JH
2051 return 1;
2052
2053phooey:
a3621e74 2054 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2055 PL_colors[4], PL_colors[5]));
d6a28714 2056 if (PL_reg_eval_set)
4f639d21 2057 restore_pos(aTHX_ prog);
d6a28714
JH
2058 return 0;
2059}
2060
2061/*
2062 - regtry - try match at specific point
2063 */
2064STATIC I32 /* 0 failure, 1 success */
3b0527fe 2065S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2066{
97aff369 2067 dVAR;
d6a28714
JH
2068 register I32 *sp;
2069 register I32 *ep;
2070 CHECKPOINT lastcp;
3b0527fe 2071 regexp *prog = reginfo->prog;
a3621e74 2072 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2073
02db2b7b
IZ
2074#ifdef DEBUGGING
2075 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2076#endif
d6a28714
JH
2077 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2078 MAGIC *mg;
2079
2080 PL_reg_eval_set = RS_init;
a3621e74 2081 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2082 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2083 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2084 ));
e8347627 2085 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2086 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2087 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2088 SAVETMPS;
2089 /* Apparently this is not needed, judging by wantarray. */
e8347627 2090 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2091 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2092
3b0527fe 2093 if (reginfo->sv) {
d6a28714 2094 /* Make $_ available to executed code. */
3b0527fe 2095 if (reginfo->sv != DEFSV) {
59f00321 2096 SAVE_DEFSV;
3b0527fe 2097 DEFSV = reginfo->sv;
b8c5462f 2098 }
d6a28714 2099
3b0527fe
DM
2100 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2101 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2102 /* prepare for quick setting of pos */
d300d9fa
NC
2103#ifdef PERL_OLD_COPY_ON_WRITE
2104 if (SvIsCOW(sv))
2105 sv_force_normal_flags(sv, 0);
2106#endif
3b0527fe 2107 mg = sv_magicext(reginfo->sv, (SV*)0, PERL_MAGIC_regex_global,
d300d9fa 2108 &PL_vtbl_mglob, NULL, 0);
d6a28714 2109 mg->mg_len = -1;
b8c5462f 2110 }
d6a28714
JH
2111 PL_reg_magic = mg;
2112 PL_reg_oldpos = mg->mg_len;
4f639d21 2113 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2114 }
09687e5a 2115 if (!PL_reg_curpm) {
a02a5408 2116 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2117#ifdef USE_ITHREADS
2118 {
2119 SV* repointer = newSViv(0);
577e12cc 2120 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2121 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2122 av_push(PL_regex_padav,repointer);
2123 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2124 PL_regex_pad = AvARRAY(PL_regex_padav);
2125 }
2126#endif
2127 }
aaa362c4 2128 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2129 PL_reg_oldcurpm = PL_curpm;
2130 PL_curpm = PL_reg_curpm;
2131 if (RX_MATCH_COPIED(prog)) {
2132 /* Here is a serious problem: we cannot rewrite subbeg,
2133 since it may be needed if this match fails. Thus
2134 $` inside (?{}) could fail... */
2135 PL_reg_oldsaved = prog->subbeg;
2136 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2137#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2138 PL_nrs = prog->saved_copy;
2139#endif
d6a28714
JH
2140 RX_MATCH_COPIED_off(prog);
2141 }
2142 else
bd61b366 2143 PL_reg_oldsaved = NULL;
d6a28714
JH
2144 prog->subbeg = PL_bostr;
2145 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2146 }
973dddac 2147 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2148 PL_reginput = startpos;
2149 PL_regstartp = prog->startp;
2150 PL_regendp = prog->endp;
2151 PL_reglastparen = &prog->lastparen;
a01268b5 2152 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2153 prog->lastparen = 0;
03994de8 2154 prog->lastcloseparen = 0;
d6a28714 2155 PL_regsize = 0;
a3621e74 2156 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2157 if (PL_reg_start_tmpl <= prog->nparens) {
2158 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2159 if(PL_reg_start_tmp)
2160 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2161 else
a02a5408 2162 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2163 }
2164
2165 /* XXXX What this code is doing here?!!! There should be no need
2166 to do this again and again, PL_reglastparen should take care of
3dd2943c 2167 this! --ilya*/
dafc8851
JH
2168
2169 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2170 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2171 * PL_reglastparen), is not needed at all by the test suite
2172 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2173 * enough, for building DynaLoader, or otherwise this
2174 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2175 * will happen. Meanwhile, this code *is* needed for the
2176 * above-mentioned test suite tests to succeed. The common theme
2177 * on those tests seems to be returning null fields from matches.
2178 * --jhi */
dafc8851 2179#if 1
d6a28714
JH
2180 sp = prog->startp;
2181 ep = prog->endp;
2182 if (prog->nparens) {
097eb12c 2183 register I32 i;
eb160463 2184 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2185 *++sp = -1;
2186 *++ep = -1;
2187 }
2188 }
dafc8851 2189#endif
02db2b7b 2190 REGCP_SET(lastcp);
3b0527fe 2191 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2192 prog->endp[0] = PL_reginput - PL_bostr;
2193 return 1;
2194 }
02db2b7b 2195 REGCP_UNWIND(lastcp);
d6a28714
JH
2196 return 0;
2197}
2198
02db2b7b
IZ
2199#define RE_UNWIND_BRANCH 1
2200#define RE_UNWIND_BRANCHJ 2
2201
2202union re_unwind_t;
2203
2204typedef struct { /* XX: makes sense to enlarge it... */
2205 I32 type;
2206 I32 prev;
2207 CHECKPOINT lastcp;
2208} re_unwind_generic_t;
2209
2210typedef struct {
2211 I32 type;
2212 I32 prev;
2213 CHECKPOINT lastcp;
2214 I32 lastparen;
2215 regnode *next;
2216 char *locinput;
2217 I32 nextchr;
3a2830be 2218 int minmod;
02db2b7b
IZ
2219#ifdef DEBUGGING
2220 int regindent;
2221#endif
2222} re_unwind_branch_t;
2223
2224typedef union re_unwind_t {
2225 I32 type;
2226 re_unwind_generic_t generic;
2227 re_unwind_branch_t branch;
2228} re_unwind_t;
2229
8ba1375e
MJD
2230#define sayYES goto yes
2231#define sayNO goto no
e0f9d4a8 2232#define sayNO_ANYOF goto no_anyof
8ba1375e 2233#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2234#define sayNO_FINAL goto no_final
2235#define sayNO_SILENT goto do_no
2236#define saySAME(x) if (x) goto yes; else goto no
2237
3ab3c9b4
HS
2238#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2239#define POSCACHE_SEEN 1 /* we know what we're caching */
2240#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2241
3ab3c9b4 2242#define CACHEsayYES STMT_START { \
d8319b27 2243 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2244 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2245 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2246 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2247 } \
2248 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2249 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2250 } \
2251 else { \
3ab3c9b4
HS
2252 /* cache records failure, but this is success */ \
2253 DEBUG_r( \
2254 PerlIO_printf(Perl_debug_log, \
2255 "%*s (remove success from failure cache)\n", \
2256 REPORT_CODE_OFF+PL_regindent*2, "") \
2257 ); \
d8319b27 2258 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2259 } \
2260 } \
2261 sayYES; \
2262} STMT_END
7409bbd3 2263
3ab3c9b4 2264#define CACHEsayNO STMT_START { \
d8319b27 2265 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2266 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2267 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2268 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2269 } \
2270 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2271 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2272 } \
2273 else { \
3ab3c9b4
HS
2274 /* cache records success, but this is failure */ \
2275 DEBUG_r( \
2276 PerlIO_printf(Perl_debug_log, \
2277 "%*s (remove failure from success cache)\n", \
2278 REPORT_CODE_OFF+PL_regindent*2, "") \
2279 ); \
d8319b27 2280 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2281 } \
2282 } \
2283 sayNO; \
2284} STMT_END
2285
a3621e74
YO
2286/* this is used to determine how far from the left messages like
2287 'failed...' are printed. Currently 29 makes these messages line
2288 up with the opcode they refer to. Earlier perls used 25 which
2289 left these messages outdented making reviewing a debug output
2290 quite difficult.
2291*/
2292#define REPORT_CODE_OFF 29
2293
2294
2295/* Make sure there is a test for this +1 options in re_tests */
2296#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2297
9e137952
DM
2298/* this value indiciates that the c1/c2 "next char" test should be skipped */
2299#define CHRTEST_VOID -1000
2300
86545054
DM
2301#define SLAB_FIRST(s) (&(s)->states[0])
2302#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2303
5d9a96ca
DM
2304/* grab a new slab and return the first slot in it */
2305
2306STATIC regmatch_state *
2307S_push_slab(pTHX)
2308{
2309 regmatch_slab *s = PL_regmatch_slab->next;
2310 if (!s) {
2311 Newx(s, 1, regmatch_slab);
2312 s->prev = PL_regmatch_slab;
2313 s->next = NULL;
2314 PL_regmatch_slab->next = s;
2315 }
2316 PL_regmatch_slab = s;
86545054 2317 return SLAB_FIRST(s);
5d9a96ca 2318}
5b47454d 2319
95b24440
DM
2320/* simulate a recursive call to regmatch */
2321
2322#define REGMATCH(ns, where) \
5d9a96ca
DM
2323 st->scan = scan; \
2324 scan = (ns); \
2325 st->resume_state = resume_##where; \
95b24440
DM
2326 goto start_recurse; \
2327 resume_point_##where:
2328
aa283a38
DM
2329
2330/* push a new regex state. Set newst to point to it */
2331
2332#define PUSH_STATE(newst, resume) \
2333 depth++; \
2334 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2335 st->scan = scan; \
2336 st->next = next; \
2337 st->n = n; \
2338 st->locinput = locinput; \
2339 st->resume_state = resume; \
2340 newst = st+1; \
86545054 2341 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2342 newst = S_push_slab(aTHX); \
2343 PL_regmatch_state = newst; \
2344 newst->cc = 0; \
2345 newst->minmod = 0; \
2346 newst->sw = 0; \
2347 newst->logical = 0; \
2348 newst->unwind = 0; \
2349 locinput = PL_reginput; \
2350 nextchr = UCHARAT(locinput);
2351
2352#define POP_STATE \
2353 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2354 depth--; \
2355 st--; \
86545054 2356 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2357 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2358 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2359 } \
2360 PL_regmatch_state = st; \
2361 scan = st->scan; \
2362 next = st->next; \
2363 n = st->n; \
2364 locinput = st->locinput; \
2365 nextchr = UCHARAT(locinput);
2366
d6a28714
JH
2367/*
2368 - regmatch - main matching routine
2369 *
2370 * Conceptually the strategy is simple: check to see whether the current
2371 * node matches, call self recursively to see whether the rest matches,
2372 * and then act accordingly. In practice we make some effort to avoid
2373 * recursion, in particular by going through "ordinary" nodes (that don't
2374 * need to know whether the rest of the match failed) by a loop instead of
2375 * by recursion.
2376 */
2377/* [lwall] I've hoisted the register declarations to the outer block in order to
2378 * maybe save a little bit of pushing and popping on the stack. It also takes
2379 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2380 *
2381 * This function used to be heavily recursive, but since this had the
2382 * effect of blowing the CPU stack on complex regexes, it has been
2383 * restructured to be iterative, and to save state onto the heap rather
2384 * than the stack. Essentially whereever regmatch() used to be called, it
2385 * pushes the current state, notes where to return, then jumps back into
2386 * the main loop.
2387 *
2388 * Originally the structure of this function used to look something like
2389
2390 S_regmatch() {
2391 int a = 1, b = 2;
2392 ...
2393 while (scan != NULL) {
5d9a96ca 2394 a++; // do stuff with a and b
95b24440
DM
2395 ...
2396 switch (OP(scan)) {
2397 case FOO: {
2398 int local = 3;
2399 ...
2400 if (regmatch(...)) // recurse
2401 goto yes;
2402 }
2403 ...
2404 }
2405 }
2406 yes:
2407 return 1;
2408 }
2409
2410 * Now it looks something like this:
2411
5d9a96ca 2412 typedef struct {
95b24440
DM
2413 int a, b, local;
2414 int resume_state;
5d9a96ca 2415 } regmatch_state;
95b24440
DM
2416
2417 S_regmatch() {
5d9a96ca
DM
2418 regmatch_state *st = new();
2419 int depth=0;
2420 st->a++; // do stuff with a and b
95b24440
DM
2421 ...
2422 while (scan != NULL) {
2423 ...
2424 switch (OP(scan)) {
2425 case FOO: {
5d9a96ca 2426 st->local = 3;
95b24440 2427 ...
5d9a96ca
DM
2428 st->scan = scan;
2429 scan = ...;
2430 st->resume_state = resume_FOO;
2431 goto start_recurse; // recurse
95b24440 2432
5d9a96ca
DM
2433 resume_point_FOO:
2434 if (result)
95b24440
DM
2435 goto yes;
2436 }
2437 ...
2438 }
5d9a96ca
DM
2439 start_recurse:
2440 st = new(); push a new state
2441 st->a = 1; st->b = 2;
2442 depth++;
95b24440 2443 }
5d9a96ca 2444 yes:
95b24440 2445 result = 1;
5d9a96ca
DM
2446 if (depth--) {
2447 st = pop();
95b24440
DM
2448 switch (resume_state) {
2449 case resume_FOO:
2450 goto resume_point_FOO;
2451 ...
2452 }
2453 }
2454 return result
2455 }
2456
2457 * WARNING: this means that any line in this function that contains a
2458 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2459 * regmatch() using gotos instead. Thus the values of any local variables
2460 * not saved in the regmatch_state structure will have been lost when
2461 * execution resumes on the next line .
5d9a96ca
DM
2462 *
2463 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2464 * PL_regmatch_state always points to the currently active state, and
2465 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2466 * The first time regmatch is called, the first slab is allocated, and is
2467 * never freed until interpreter desctruction. When the slab is full,
2468 * a new one is allocated chained to the end. At exit from regmatch, slabs
2469 * allocated since entry are freed.
d6a28714 2470 */
95b24440
DM
2471
2472
d6a28714 2473STATIC I32 /* 0 failure, 1 success */
3b0527fe 2474S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2475{
27da23d5 2476 dVAR;
95b24440 2477 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2478 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2479
3b0527fe
DM
2480 regexp *rex = reginfo->prog;
2481
5d9a96ca
DM
2482 regmatch_slab *orig_slab;
2483 regmatch_state *orig_state;
a3621e74 2484
5d9a96ca
DM
2485 /* the current state. This is a cached copy of PL_regmatch_state */
2486 register regmatch_state *st;
95b24440 2487
5d9a96ca
DM
2488 /* cache heavy used fields of st in registers */
2489 register regnode *scan;
2490 register regnode *next;
2491 register I32 n = 0; /* initialize to shut up compiler warning */
2492 register char *locinput = PL_reginput;
95b24440 2493
5d9a96ca
DM
2494 /* these variables are NOT saved during a recusive RFEGMATCH: */
2495 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2496 bool result; /* return value of S_regmatch */
2497 regnode *inner; /* Next node in internal branch. */
2498 int depth = 0; /* depth of recursion */
aa283a38 2499 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2500 regmatch_state *yes_state = NULL; /* state to pop to on success of
2501 subpattern */
95b24440
DM
2502
2503#ifdef DEBUGGING
ab74612d 2504 SV *re_debug_flags = NULL;
a3621e74 2505 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2506 PL_regindent++;
2507#endif
2508
5d9a96ca
DM
2509 /* on first ever call to regmatch, allocate first slab */
2510 if (!PL_regmatch_slab) {
2511 Newx(PL_regmatch_slab, 1, regmatch_slab);
2512 PL_regmatch_slab->prev = NULL;
2513 PL_regmatch_slab->next = NULL;
86545054 2514 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2515 }
2516
2517 /* remember current high-water mark for exit */
2518 /* XXX this should be done with SAVE* instead */
2519 orig_slab = PL_regmatch_slab;
2520 orig_state = PL_regmatch_state;
2521
2522 /* grab next free state slot */
2523 st = ++PL_regmatch_state;
86545054 2524 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2525 st = PL_regmatch_state = S_push_slab(aTHX);
2526
2527 st->minmod = 0;
2528 st->sw = 0;
2529 st->logical = 0;
2530 st->unwind = 0;
2531 st->cc = NULL;
d6a28714
JH
2532 /* Note that nextchr is a byte even in UTF */
2533 nextchr = UCHARAT(locinput);
2534 scan = prog;
2535 while (scan != NULL) {
8ba1375e 2536
a3621e74 2537 DEBUG_EXECUTE_r( {
6136c704 2538 SV * const prop = sv_newmortal();
1df70142
AL
2539 const int docolor = *PL_colors[0];
2540 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2541 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2542 /* The part of the string before starttry has one color
2543 (pref0_len chars), between starttry and current
2544 position another one (pref_len - pref0_len chars),
2545 after the current position the third one.
2546 We assume that pref0_len <= pref_len, otherwise we
2547 decrease pref0_len. */
9041c2e3 2548 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2549 ? (5 + taill) - l : locinput - PL_bostr;
2550 int pref0_len;
d6a28714 2551
df1ffd02 2552 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2553 pref_len++;
2554 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2555 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2556 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2557 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2558 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2559 l--;
d6a28714
JH
2560 if (pref0_len < 0)
2561 pref0_len = 0;
2562 if (pref0_len > pref_len)
2563 pref0_len = pref_len;
32fc9b6a 2564 regprop(rex, prop, scan);
2a782b5b 2565 {
1df70142 2566 const char * const s0 =
f14c76ed 2567 do_utf8 && OP(scan) != CANY ?
95b24440 2568 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2569 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2570 locinput - pref_len;
1df70142
AL
2571 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2572 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2573 pv_uni_display(PERL_DEBUG_PAD(1),
2574 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2575 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2576 locinput - pref_len + pref0_len;
1df70142
AL
2577 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2578 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2579 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2580 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2581 locinput;
1df70142 2582 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2583 PerlIO_printf(Perl_debug_log,
2584 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2585 (IV)(locinput - PL_bostr),
2586 PL_colors[4],
2587 len0, s0,
2588 PL_colors[5],
2589 PL_colors[2],
2590 len1, s1,
2591 PL_colors[3],
2592 (docolor ? "" : "> <"),
2593 PL_colors[0],
2594 len2, s2,
2595 PL_colors[1],
2596 15 - l - pref_len + 1,
2597 "",
4f639d21 2598 (IV)(scan - rex->program), PL_regindent*2, "",
3f7c398e 2599 SvPVX_const(prop));
2a782b5b
JH
2600 }
2601 });
d6a28714
JH
2602
2603 next = scan + NEXT_OFF(scan);
2604 if (next == scan)
2605 next = NULL;
2606
2607 switch (OP(scan)) {
2608 case BOL:
7fba1cd6 2609 if (locinput == PL_bostr)
d6a28714 2610 {
3b0527fe 2611 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2612 break;
2613 }
d6a28714
JH
2614 sayNO;
2615 case MBOL:
12d33761
HS
2616 if (locinput == PL_bostr ||
2617 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2618 {
b8c5462f
JH
2619 break;
2620 }
d6a28714
JH
2621 sayNO;
2622 case SBOL:
c2a73568 2623 if (locinput == PL_bostr)
b8c5462f 2624 break;
d6a28714
JH
2625 sayNO;
2626 case GPOS:
3b0527fe 2627 if (locinput == reginfo->ganch)
d6a28714
JH
2628 break;
2629 sayNO;
2630 case EOL:
d6a28714
JH
2631 goto seol;
2632 case MEOL:
d6a28714 2633 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2634 sayNO;
b8c5462f 2635 break;
d6a28714
JH
2636 case SEOL:
2637 seol:
2638 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2639 sayNO;
d6a28714 2640 if (PL_regeol - locinput > 1)
b8c5462f 2641 sayNO;
b8c5462f 2642 break;
d6a28714
JH
2643 case EOS:
2644 if (PL_regeol != locinput)
b8c5462f 2645 sayNO;
d6a28714 2646 break;
ffc61ed2 2647 case SANY:
d6a28714 2648 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2649 sayNO;
f33976b4
DB
2650 if (do_utf8) {
2651 locinput += PL_utf8skip[nextchr];
2652 if (locinput > PL_regeol)
2653 sayNO;
2654 nextchr = UCHARAT(locinput);
2655 }
2656 else
2657 nextchr = UCHARAT(++locinput);
2658 break;
2659 case CANY:
2660 if (!nextchr && locinput >= PL_regeol)
2661 sayNO;
b8c5462f 2662 nextchr = UCHARAT(++locinput);
a0d0e21e 2663 break;
ffc61ed2 2664 case REG_ANY:
1aa99e6b
IH
2665 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2666 sayNO;
2667 if (do_utf8) {
b8c5462f 2668 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2669 if (locinput > PL_regeol)
2670 sayNO;
a0ed51b3 2671 nextchr = UCHARAT(locinput);
a0ed51b3 2672 }
1aa99e6b
IH
2673 else
2674 nextchr = UCHARAT(++locinput);
a0ed51b3 2675 break;
a3621e74
YO
2676
2677
2678
2679 /*
2680 traverse the TRIE keeping track of all accepting states
2681 we transition through until we get to a failing node.
2682
a3621e74
YO
2683
2684 */
5b47454d 2685 case TRIE:
a3621e74
YO
2686 case TRIEF:
2687 case TRIEFL:
2688 {
a3621e74
YO
2689 U8 *uc = ( U8* )locinput;
2690 U32 state = 1;
2691 U16 charid = 0;
2692 U32 base = 0;
2693 UV uvc = 0;
2694 STRLEN len = 0;
2695 STRLEN foldlen = 0;
a3621e74
YO
2696 U8 *uscan = (U8*)NULL;
2697 STRLEN bufflen=0;
95b24440 2698 SV *sv_accept_buff = NULL;
5b47454d
DM
2699 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2700 trie_type = do_utf8 ?
2701 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2702 : trie_plain;
2703
7087a21c
NC
2704 /* what trie are we using right now */
2705 reg_trie_data *trie
32fc9b6a 2706 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
d8319b27 2707 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2708 result = 0;
a3621e74
YO
2709
2710 while ( state && uc <= (U8*)PL_regeol ) {
2711
5b47454d 2712 if (trie->states[ state ].wordnum) {
d8319b27 2713 if (!st->u.trie.accepted ) {
5b47454d
DM
2714 ENTER;
2715 SAVETMPS;
2716 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2717 sv_accept_buff=newSV(bufflen *
2718 sizeof(reg_trie_accepted) - 1);
2719 SvCUR_set(sv_accept_buff,
2720 sizeof(reg_trie_accepted));
2721 SvPOK_on(sv_accept_buff);
2722 sv_2mortal(sv_accept_buff);
d8319b27 2723 st->u.trie.accept_buff =
5b47454d
DM
2724 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2725 }
2726 else {
d8319b27 2727 if (st->u.trie.accepted >= bufflen) {
5b47454d 2728 bufflen *= 2;
d8319b27 2729 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2730 SvGROW(sv_accept_buff,
2731 bufflen * sizeof(reg_trie_accepted));
2732 }
2733 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2734 + sizeof(reg_trie_accepted));
2735 }
d8319b27
DM
2736 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2737 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2738 ++st->u.trie.accepted;
5b47454d 2739 }
a3621e74
YO
2740
2741 base = trie->states[ state ].trans.base;
2742
2743 DEBUG_TRIE_EXECUTE_r(
2744 PerlIO_printf( Perl_debug_log,
e4584336 2745 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2746 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27 2747 (UV)state, (UV)base, (UV)st->u.trie.accepted );
a3621e74
YO
2748 );
2749
2750 if ( base ) {
5b47454d
DM
2751 switch (trie_type) {
2752 case trie_uft8_fold:
a3621e74
YO
2753 if ( foldlen>0 ) {
2754 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2755 foldlen -= len;
2756 uscan += len;
2757 len=0;
2758 } else {
1df70142 2759 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2760 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2761 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2762 foldlen -= UNISKIP( uvc );
2763 uscan = foldbuf + UNISKIP( uvc );
2764 }
5b47454d
DM
2765 break;
2766 case trie_utf8:
2767 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2768 &len, uniflags );
2769 break;
2770 case trie_plain:
e4584336 2771 uvc = (UV)*uc;
a3621e74
YO
2772 len = 1;
2773 }
2774
5b47454d
DM
2775 if (uvc < 256) {
2776 charid = trie->charmap[ uvc ];
2777 }
2778 else {
2779 charid = 0;
2780 if (trie->widecharmap) {
2781 SV** svpp = (SV**)NULL;
2782 svpp = hv_fetch(trie->widecharmap,
2783 (char*)&uvc, sizeof(UV), 0);
2784 if (svpp)
2785 charid = (U16)SvIV(*svpp);
2786 }
2787 }
a3621e74 2788
5b47454d
DM
2789 if (charid &&
2790 (base + charid > trie->uniquecharcount )
2791 && (base + charid - 1 - trie->uniquecharcount
2792 < trie->lasttrans)
2793 && trie->trans[base + charid - 1 -
2794 trie->uniquecharcount].check == state)
2795 {
2796 state = trie->trans[base + charid - 1 -
2797 trie->uniquecharcount ].next;
2798 }
2799 else {
2800 state = 0;
2801 }
2802 uc += len;
2803
2804 }
2805 else {
a3621e74
YO
2806 state = 0;
2807 }
2808 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2809 PerlIO_printf( Perl_debug_log,
2810 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2811 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2812 );
2813 }
d8319b27 2814 if (!st->u.trie.accepted )
a3621e74 2815 sayNO;
a3621e74
YO
2816
2817 /*
2818 There was at least one accepting state that we
2819 transitioned through. Presumably the number of accepting
2820 states is going to be low, typically one or two. So we
2821 simply scan through to find the one with lowest wordnum.
2822 Once we find it, we swap the last state into its place
2823 and decrement the size. We then try to match the rest of
2824 the pattern at the point where the word ends, if we
2825 succeed then we end the loop, otherwise the loop
2826 eventually terminates once all of the accepting states
2827 have been tried.
2828 */
a3621e74 2829
d8319b27 2830 if ( st->u.trie.accepted == 1 ) {
a3621e74 2831 DEBUG_EXECUTE_r({
097eb12c 2832 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
a3621e74
YO
2833 PerlIO_printf( Perl_debug_log,
2834 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2835 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2836 st->u.trie.accept_buff[ 0 ].wordnum,
cfd0369c 2837 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2838 PL_colors[5] );
2839 });
d8319b27 2840 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
2841 /* in this case we free tmps/leave before we call regmatch
2842 as we wont be using accept_buff again. */
2843 FREETMPS;
2844 LEAVE;
95b24440
DM
2845 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2846 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2847 } else {
2848 DEBUG_EXECUTE_r(
e4584336 2849 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 2850 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
2851 PL_colors[5] );
2852 );
d8319b27 2853 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
2854 U32 best = 0;
2855 U32 cur;
d8319b27 2856 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
2857 DEBUG_TRIE_EXECUTE_r(
2858 PerlIO_printf( Perl_debug_log,
2859 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2860 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
2861 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2862 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 2863 );
a3621e74 2864
d8319b27
DM
2865 if (st->u.trie.accept_buff[cur].wordnum <
2866 st->u.trie.accept_buff[best].wordnum)
e822a8b4 2867 best = cur;
a3621e74
YO
2868 }
2869 DEBUG_EXECUTE_r({
87830502 2870 reg_trie_data * const trie = (reg_trie_data*)
32fc9b6a 2871 rex->data->data[ARG(scan)];
d8319b27 2872 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2873 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2874 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2875 st->u.trie.accept_buff[best].wordnum,
cfd0369c 2876 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2877 PL_colors[5] );
2878 });
d8319b27
DM
2879 if ( best<st->u.trie.accepted ) {
2880 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2881 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2882 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2883 best = st->u.trie.accepted;
a3621e74 2884 }
d8319b27 2885 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
2886
2887 /*
2888 as far as I can tell we only need the SAVETMPS/FREETMPS
2889 for re's with EVAL in them but I'm leaving them in for
2890 all until I can be sure.
2891 */
2892 SAVETMPS;
95b24440
DM
2893 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2894 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2895 FREETMPS;
2896 }
2897 FREETMPS;
2898 LEAVE;
2899 }
2900
95b24440 2901 if (result) {
a3621e74
YO
2902 sayYES;
2903 } else {
2904 sayNO;
2905 }
2906 }
2907 /* unreached codepoint */
95b24440
DM
2908 case EXACT: {
2909 char *s = STRING(scan);
5d9a96ca 2910 st->ln = STR_LEN(scan);
eb160463 2911 if (do_utf8 != UTF) {
bc517b45 2912 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2913 char *l = locinput;
5d9a96ca 2914 const char *e = s + st->ln;
a72c7584 2915
5ff6fc6d
JH
2916 if (do_utf8) {
2917 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2918 while (s < e) {
a3b680e6 2919 STRLEN ulen;
1aa99e6b 2920 if (l >= PL_regeol)
5ff6fc6d
JH
2921 sayNO;
2922 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2923 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2924 uniflags))
5ff6fc6d 2925 sayNO;
bc517b45 2926 l += ulen;
5ff6fc6d 2927 s ++;
1aa99e6b 2928 }
5ff6fc6d
JH
2929 }
2930 else {
2931 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2932 while (s < e) {
a3b680e6 2933 STRLEN ulen;
1aa99e6b
IH
2934 if (l >= PL_regeol)
2935 sayNO;
5ff6fc6d 2936 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2937 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2938 uniflags))
1aa99e6b 2939 sayNO;
bc517b45 2940 s += ulen;
a72c7584 2941 l ++;
1aa99e6b 2942 }
5ff6fc6d 2943 }
1aa99e6b
IH
2944 locinput = l;
2945 nextchr = UCHARAT(locinput);
2946 break;
2947 }
bc517b45 2948 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2949 /* Inline the first character, for speed. */
2950 if (UCHARAT(s) != nextchr)
2951 sayNO;
5d9a96ca 2952 if (PL_regeol - locinput < st->ln)
d6a28714 2953 sayNO;
5d9a96ca 2954 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2955 sayNO;
5d9a96ca 2956 locinput += st->ln;
d6a28714
JH
2957 nextchr = UCHARAT(locinput);
2958 break;
95b24440 2959 }
d6a28714 2960 case EXACTFL:
b8c5462f
JH
2961 PL_reg_flags |= RF_tainted;
2962 /* FALL THROUGH */
95b24440
DM
2963 case EXACTF: {
2964 char *s = STRING(scan);
5d9a96ca 2965 st->ln = STR_LEN(scan);
d6a28714 2966
d07ddd77
JH
2967 if (do_utf8 || UTF) {
2968 /* Either target or the pattern are utf8. */
d6a28714 2969 char *l = locinput;
d07ddd77 2970 char *e = PL_regeol;
bc517b45 2971
5d9a96ca 2972 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2973 l, &e, 0, do_utf8)) {
5486206c
JH
2974 /* One more case for the sharp s:
2975 * pack("U0U*", 0xDF) =~ /ss/i,
2976 * the 0xC3 0x9F are the UTF-8
2977 * byte sequence for the U+00DF. */
2978 if (!(do_utf8 &&
2979 toLOWER(s[0]) == 's' &&
5d9a96ca 2980 st->ln >= 2 &&
5486206c
JH
2981 toLOWER(s[1]) == 's' &&
2982 (U8)l[0] == 0xC3 &&
2983 e - l >= 2 &&
2984 (U8)l[1] == 0x9F))
2985 sayNO;
2986 }
d07ddd77
JH
2987 locinput = e;
2988 nextchr = UCHARAT(locinput);
2989 break;
a0ed51b3 2990 }
d6a28714 2991
bc517b45
JH
2992 /* Neither the target and the pattern are utf8. */
2993
d6a28714
JH
2994 /* Inline the first character, for speed. */
2995 if (UCHARAT(s) != nextchr &&
2996 UCHARAT(s) != ((OP(scan) == EXACTF)
2997 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2998 sayNO;
5d9a96ca 2999 if (PL_regeol - locinput < st->ln)
b8c5462f 3000 sayNO;
5d9a96ca
DM
3001 if (st->ln > 1 && (OP(scan) == EXACTF
3002 ? ibcmp(s, locinput, st->ln)
3003 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3004 sayNO;
5d9a96ca 3005 locinput += st->ln;
d6a28714 3006 nextchr = UCHARAT(locinput);
a0d0e21e 3007 break;
95b24440 3008 }
d6a28714 3009 case ANYOF:
ffc61ed2 3010 if (do_utf8) {
9e55ce06
JH
3011 STRLEN inclasslen = PL_regeol - locinput;
3012
32fc9b6a 3013 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3014 sayNO_ANYOF;
ffc61ed2
JH
3015 if (locinput >= PL_regeol)
3016 sayNO;
0f0076b4 3017 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3018 nextchr = UCHARAT(locinput);
e0f9d4a8 3019 break;
ffc61ed2
JH
3020 }
3021 else {
3022 if (nextchr < 0)
3023 nextchr = UCHARAT(locinput);
32fc9b6a 3024 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3025 sayNO_ANYOF;
ffc61ed2
JH
3026 if (!nextchr && locinput >= PL_regeol)
3027 sayNO;
3028 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3029 break;
3030 }
3031 no_anyof:
3032 /* If we might have the case of the German sharp s
3033 * in a casefolding Unicode character class. */
3034
ebc501f0
JH
3035 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3036 locinput += SHARP_S_SKIP;
e0f9d4a8 3037 nextchr = UCHARAT(locinput);
ffc61ed2 3038 }
e0f9d4a8
JH
3039 else
3040 sayNO;
b8c5462f 3041 break;
d6a28714 3042 case ALNUML:
b8c5462f
JH
3043 PL_reg_flags |= RF_tainted;
3044 /* FALL THROUGH */
d6a28714 3045 case ALNUM:
b8c5462f 3046 if (!nextchr)
4633a7c4 3047 sayNO;
ffc61ed2 3048 if (do_utf8) {
1a4fad37 3049 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3050 if (!(OP(scan) == ALNUM
3568d838 3051 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3052 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3053 {
3054 sayNO;
a0ed51b3 3055 }
b8c5462f 3056 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3057 nextchr = UCHARAT(locinput);
3058 break;
3059 }
ffc61ed2 3060 if (!(OP(scan) == ALNUM
d6a28714 3061 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3062 sayNO;
b8c5462f 3063 nextchr = UCHARAT(++locinput);
a0d0e21e 3064 break;
d6a28714 3065 case NALNUML:
b8c5462f
JH
3066 PL_reg_flags |= RF_tainted;
3067 /* FALL THROUGH */
d6a28714
JH
3068 case NALNUM:
3069 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3070 sayNO;
ffc61ed2 3071 if (do_utf8) {
1a4fad37 3072 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3073 if (OP(scan) == NALNUM
3568d838 3074 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3075 : isALNUM_LC_utf8((U8*)locinput))
3076 {
b8c5462f 3077 sayNO;
d6a28714 3078 }
b8c5462f
JH
3079 locinput += PL_utf8skip[nextchr];
3080 nextchr = UCHARAT(locinput);
3081 break;
3082 }
ffc61ed2 3083 if (OP(scan) == NALNUM
d6a28714 3084 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3085 sayNO;
76e3520e 3086 nextchr = UCHARAT(++locinput);
a0d0e21e 3087 break;
d6a28714
JH
3088 case BOUNDL:
3089 case NBOUNDL:
3280af22 3090 PL_reg_flags |= RF_tainted;
bbce6d69 3091 /* FALL THROUGH */
d6a28714
JH
3092 case BOUND:
3093 case NBOUND:
3094 /* was last char in word? */
ffc61ed2 3095 if (do_utf8) {
12d33761 3096 if (locinput == PL_bostr)
5d9a96ca 3097 st->ln = '\n';
ffc61ed2 3098 else {
a3b680e6 3099 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3100
4ad0818d 3101 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3102 }
3103 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3104 st->ln = isALNUM_uni(st->ln);
1a4fad37 3105 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3106 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3107 }
3108 else {
5d9a96ca 3109 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3110 n = isALNUM_LC_utf8((U8*)locinput);
3111 }
a0ed51b3 3112 }
d6a28714 3113 else {
5d9a96ca 3114 st->ln = (locinput != PL_bostr) ?
12d33761 3115 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3116 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3117 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3118 n = isALNUM(nextchr);
3119 }
3120 else {
5d9a96ca 3121 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3122 n = isALNUM_LC(nextchr);
3123 }
d6a28714 3124 }
5d9a96ca 3125 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3126 OP(scan) == BOUNDL))
3127 sayNO;
a0ed51b3 3128 break;
d6a28714 3129 case SPACEL:
3280af22 3130 PL_reg_flags |= RF_tainted;
bbce6d69 3131 /* FALL THROUGH */
d6a28714 3132 case SPACE:
9442cb0e 3133 if (!nextchr)
4633a7c4 3134 sayNO;
1aa99e6b 3135 if (do_utf8) {
fd400ab9 3136 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3137 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3138 if (!(OP(scan) == SPACE
3568d838 3139 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3140 : isSPACE_LC_utf8((U8*)locinput)))
3141 {
3142 sayNO;
3143 }
3144 locinput += PL_utf8skip[nextchr];
3145 nextchr = UCHARAT(locinput);
3146 break;
d6a28714 3147 }
ffc61ed2
JH
3148 if (!(OP(scan) == SPACE
3149 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3150 sayNO;
3151 nextchr = UCHARAT(++locinput);
3152 }
3153 else {
3154 if (!(OP(scan) == SPACE
3155 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3156 sayNO;
3157 nextchr = UCHARAT(++locinput);
a0ed51b3 3158 }
a0ed51b3 3159 break;
d6a28714 3160 case NSPACEL:
3280af22 3161 PL_reg_flags |= RF_tainted;
bbce6d69 3162 /* FALL THROUGH */
d6a28714 3163 case NSPACE:
9442cb0e 3164 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3165 sayNO;
1aa99e6b 3166 if (do_utf8) {
1a4fad37 3167 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3168 if (OP(scan) == NSPACE
3568d838 3169 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3170 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3171 {
3172 sayNO;
3173 }
3174 locinput += PL_utf8skip[nextchr];
3175 nextchr = UCHARAT(locinput);
3176 break;
a0ed51b3 3177 }
ffc61ed2 3178 if (OP(scan) == NSPACE
d6a28714 3179 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3180 sayNO;
76e3520e 3181 nextchr = UCHARAT(++locinput);
a0d0e21e 3182 break;
d6a28714 3183 case DIGITL:
a0ed51b3
LW
3184 PL_reg_flags |= RF_tainted;
3185 /* FALL THROUGH */
d6a28714 3186 case DIGIT:
9442cb0e 3187 if (!nextchr)
a0ed51b3 3188 sayNO;
1aa99e6b 3189 if (do_utf8) {
1a4fad37 3190 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3191 if (!(OP(scan) == DIGIT
3568d838 3192 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3193 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3194 {
a0ed51b3 3195 sayNO;
dfe13c55 3196 }
6f06b55f 3197 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3198 nextchr = UCHARAT(locinput);
3199 break;
3200 }
ffc61ed2 3201 if (!(OP(scan) == DIGIT
9442cb0e 3202 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3203 sayNO;
3204 nextchr = UCHARAT(++locinput);
3205 break;
d6a28714 3206 case NDIGITL:
b8c5462f
JH
3207 PL_reg_flags |= RF_tainted;
3208 /* FALL THROUGH */
d6a28714 3209 case NDIGIT:
9442cb0e 3210 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3211 sayNO;
1aa99e6b 3212 if (do_utf8) {
1a4fad37 3213 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3214 if (OP(scan) == NDIGIT
3568d838 3215 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3216 : isDIGIT_LC_utf8((U8*)locinput))
3217 {
a0ed51b3 3218 sayNO;
9442cb0e 3219 }
6f06b55f 3220 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3221 nextchr = UCHARAT(locinput);
3222 break;
3223 }
ffc61ed2 3224 if (OP(scan) == NDIGIT
9442cb0e 3225 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3226 sayNO;
3227 nextchr = UCHARAT(++locinput);
3228 break;
3229 case CLUMP:
b7c83a7e 3230 if (locinput >= PL_regeol)
a0ed51b3 3231 sayNO;
b7c83a7e 3232 if (do_utf8) {
1a4fad37 3233 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3234 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3235 sayNO;
3236 locinput += PL_utf8skip[nextchr];
3237 while (locinput < PL_regeol &&
3238 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3239 locinput += UTF8SKIP(locinput);
3240 if (locinput > PL_regeol)
3241 sayNO;
eb08e2da
JH
3242 }
3243 else
3244 locinput++;
a0ed51b3
LW
3245 nextchr = UCHARAT(locinput);
3246 break;
c8756f30 3247 case REFFL:
3280af22 3248 PL_reg_flags |= RF_tainted;
c8756f30 3249 /* FALL THROUGH */
c277df42 3250 case REF:
95b24440
DM
3251 case REFF: {
3252 char *s;
c277df42 3253 n = ARG(scan); /* which paren pair */
5d9a96ca 3254 st->ln = PL_regstartp[n];
2c2d71f5 3255 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3256 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3257 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3258 if (st->ln == PL_regendp[n])
a0d0e21e 3259 break;
a0ed51b3 3260
5d9a96ca 3261 s = PL_bostr + st->ln;
1aa99e6b 3262 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3263 char *l = locinput;
a3b680e6 3264 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3265 /*
3266 * Note that we can't do the "other character" lookup trick as
3267 * in the 8-bit case (no pun intended) because in Unicode we
3268 * have to map both upper and title case to lower case.
3269 */
3270 if (OP(scan) == REFF) {
3271 while (s < e) {
a3b680e6
AL
3272 STRLEN ulen1, ulen2;
3273 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3274 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3275
a0ed51b3
LW
3276 if (l >= PL_regeol)
3277 sayNO;
a2a2844f
JH
3278 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3279 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3280 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3281 sayNO;
a2a2844f
JH
3282 s += ulen1;
3283 l += ulen2;
a0ed51b3
LW
3284 }
3285 }
3286 locinput = l;
3287 nextchr = UCHARAT(locinput);
3288 break;
3289 }
3290
a0d0e21e 3291 /* Inline the first character, for speed. */
76e3520e 3292 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3293 (OP(scan) == REF ||
3294 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3295 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3296 sayNO;
5d9a96ca
DM
3297 st->ln = PL_regendp[n] - st->ln;
3298 if (locinput + st->ln > PL_regeol)
4633a7c4 3299 sayNO;
5d9a96ca
DM
3300 if (st->ln > 1 && (OP(scan) == REF
3301 ? memNE(s, locinput, st->ln)
c8756f30 3302 : (OP(scan) == REFF
5d9a96ca
DM
3303 ? ibcmp(s, locinput, st->ln)
3304 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3305 sayNO;
5d9a96ca 3306 locinput += st->ln;
76e3520e 3307 nextchr = UCHARAT(locinput);
a0d0e21e 3308 break;
95b24440 3309 }
a0d0e21e
LW
3310
3311 case NOTHING:
c277df42 3312 case TAIL:
a0d0e21e
LW
3313 break;
3314 case BACK:
3315 break;
c277df42
IZ
3316 case EVAL:
3317 {
c277df42 3318 SV *ret;
8e5e9ebe 3319 {
4aabdb9b
DM
3320 /* execute the code in the {...} */
3321 dSP;
6136c704 3322 SV ** const before = SP;
4aabdb9b
DM
3323 OP_4tree * const oop = PL_op;
3324 COP * const ocurcop = PL_curcop;
3325 PAD *old_comppad;
4aabdb9b
DM
3326
3327 n = ARG(scan);
32fc9b6a 3328 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3329 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3330 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3331 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3332
8e5e9ebe
RGS
3333 CALLRUNOPS(aTHX); /* Scalar context. */
3334 SPAGAIN;
3335 if (SP == before)
075aa684 3336 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3337 else {
3338 ret = POPs;
3339 PUTBACK;
3340 }
4aabdb9b
DM
3341
3342 PL_op = oop;
3343 PAD_RESTORE_LOCAL(old_comppad);
3344 PL_curcop = ocurcop;
3345 if (!st->logical) {
3346 /* /(?{...})/ */
3347 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3348 break;
3349 }
8e5e9ebe 3350 }
4aabdb9b
DM
3351 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3352 regexp *re;
4aabdb9b 3353 {
4f639d21
DM
3354 /* extract RE object from returned value; compiling if
3355 * necessary */
3356
6136c704 3357 MAGIC *mg = NULL;
4aabdb9b 3358 SV *sv;
faf82a0b
AE
3359 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3360 mg = mg_find(sv, PERL_MAGIC_qr);
3361 else if (SvSMAGICAL(ret)) {
3362 if (SvGMAGICAL(ret))
3363 sv_unmagic(ret, PERL_MAGIC_qr);
3364 else
3365 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3366 }
faf82a0b 3367
0f5d15d6
IZ
3368 if (mg) {
3369 re = (regexp *)mg->mg_obj;
df0003d4 3370 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3371 }
3372 else {
3373 STRLEN len;
6136c704 3374 const char * const t = SvPV_const(ret, len);
0f5d15d6 3375 PMOP pm;
a3b680e6 3376 const I32 osize = PL_regsize;
0f5d15d6 3377
5fcd1c1b 3378 Zero(&pm, 1, PMOP);
4aabdb9b 3379 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3380 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);