This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
40d34c0d
SB
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
bb3580c7
NC
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
40d34c0d
SB
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
e6906430 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
2c351e65 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
fab1f349 142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
47a91a97 143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((U8*)str); assert(ok); LEAVE; } } STMT_END
fab1f349
AL
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
8c18bf38 181 const int retval = PL_savestack_ix;
b1ce53c5 182#define REGCP_PAREN_ELEMS 4
8c18bf38 183 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
184 int p;
185
e49a9654
IH
186 if (paren_elems_to_push < 0)
187 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
188
a01268b5 189#define REGCP_OTHER_ELEMS 6
f3479639 190 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 191 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 192/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
193 SSPUSHINT(PL_regendp[p]);
194 SSPUSHINT(PL_regstartp[p]);
3280af22 195 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
196 SSPUSHINT(p);
197 }
b1ce53c5 198/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
199 SSPUSHINT(PL_regsize);
200 SSPUSHINT(*PL_reglastparen);
a01268b5 201 SSPUSHINT(*PL_reglastcloseparen);
3280af22 202 SSPUSHPTR(PL_reginput);
41123dfd
JH
203#define REGCP_FRAME_ELEMS 2
204/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
205 * are needed for the regexp context stack bookkeeping. */
206 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 207 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 208
a0d0e21e
LW
209 return retval;
210}
211
c277df42 212/* These are needed since we do not localize EVAL nodes: */
02db2b7b 213# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 214 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 215 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 216
02db2b7b 217# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 218 PerlIO_printf(Perl_debug_log, \
faccc32b 219 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 220 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 221
76e3520e 222STATIC char *
cea2e8a9 223S_regcppop(pTHX)
a0d0e21e 224{
b1ce53c5 225 I32 i;
a0d0e21e
LW
226 U32 paren = 0;
227 char *input;
b1ce53c5
JH
228
229 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 230 i = SSPOPINT;
b1ce53c5
JH
231 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
232 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 233 input = (char *) SSPOPPTR;
a01268b5 234 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
235 *PL_reglastparen = SSPOPINT;
236 PL_regsize = SSPOPINT;
b1ce53c5
JH
237
238 /* Now restore the parentheses context. */
41123dfd
JH
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
228fe6e6 241 I32 tmps;
a0d0e21e 242 paren = (U32)SSPOPINT;
3280af22 243 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
244 PL_regstartp[paren] = SSPOPINT;
245 tmps = SSPOPINT;
3280af22
NIS
246 if (paren <= *PL_reglastparen)
247 PL_regendp[paren] = tmps;
c277df42 248 DEBUG_r(
c3464db5 249 PerlIO_printf(Perl_debug_log,
b900a521 250 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 251 (UV)paren, (IV)PL_regstartp[paren],
b900a521 252 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 253 (IV)PL_regendp[paren],
3280af22 254 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 255 );
a0d0e21e 256 }
c277df42 257 DEBUG_r(
eb160463 258 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 259 PerlIO_printf(Perl_debug_log,
faccc32b
JH
260 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
261 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
262 }
263 );
daf18116 264#if 1
dafc8851
JH
265 /* It would seem that the similar code in regtry()
266 * already takes care of this, and in fact it is in
267 * a better location to since this code can #if 0-ed out
268 * but the code in regtry() is needed or otherwise tests
269 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
270 * (as of patchlevel 7877) will fail. Then again,
271 * this code seems to be necessary or otherwise
272 * building DynaLoader will fail:
273 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 * --jhi */
eb160463
GS
275 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
276 if ((I32)paren > PL_regsize)
cf93c79d
IZ
277 PL_regstartp[paren] = -1;
278 PL_regendp[paren] = -1;
a0d0e21e 279 }
dafc8851 280#endif
a0d0e21e
LW
281 return input;
282}
283
0f5d15d6 284STATIC char *
cea2e8a9 285S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6 286{
8c18bf38 287 const I32 tmp = PL_savestack_ix;
0f5d15d6
IZ
288
289 PL_savestack_ix = ss;
290 regcppop();
291 PL_savestack_ix = tmp;
942e002e 292 return Nullch;
0f5d15d6
IZ
293}
294
295typedef struct re_cc_state
296{
297 I32 ss;
298 regnode *node;
299 struct re_cc_state *prev;
300 CURCUR *cc;
301 regexp *re;
302} re_cc_state;
303
02db2b7b 304#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 305
29d1e993
HS
306#define TRYPAREN(paren, n, input) { \
307 if (paren) { \
308 if (n) { \
309 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
310 PL_regendp[paren] = input - PL_bostr; \
311 } \
312 else \
313 PL_regendp[paren] = -1; \
314 } \
315 if (regmatch(next)) \
316 sayYES; \
317 if (paren && n) \
318 PL_regendp[paren] = -1; \
319}
320
321
a687059c 322/*
e50aee73 323 * pregexec and friends
a687059c
LW
324 */
325
326/*
c277df42 327 - pregexec - match a regexp against a string
a687059c 328 */
c277df42 329I32
864dbfa3 330Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 331 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
332/* strend: pointer to null at end of string */
333/* strbeg: real beginning of string */
334/* minend: end of match must be >=minend after stringarg. */
335/* nosave: For optimizations. */
336{
337 return
9041c2e3 338 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
339 nosave ? 0 : REXEC_COPY_STR);
340}
0f5d15d6
IZ
341
342STATIC void
cea2e8a9 343S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
344{
345 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
346#ifdef DEBUGGING
347 PL_regprogram = prog->program;
348#endif
349 PL_regnpar = prog->nparens;
9041c2e3
NIS
350 PL_regdata = prog->data;
351 PL_reg_re = prog;
0f5d15d6 352}
22e551b9 353
9041c2e3 354/*
cad2e5aa
JH
355 * Need to implement the following flags for reg_anch:
356 *
357 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
358 * USE_INTUIT_ML
359 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
360 * INTUIT_AUTORITATIVE_ML
361 * INTUIT_ONCE_NOML - Intuit can match in one location only.
362 * INTUIT_ONCE_ML
363 *
364 * Another flag for this function: SECOND_TIME (so that float substrs
365 * with giant delta may be not rechecked).
366 */
367
368/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
369
fdac8c4b 370/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
371 Otherwise, only SvCUR(sv) is used to get strbeg. */
372
373/* XXXX We assume that strpos is strbeg unless sv. */
374
6eb5f6b9
JH
375/* XXXX Some places assume that there is a fixed substring.
376 An update may be needed if optimizer marks as "INTUITable"
377 RExen without fixed substrings. Similarly, it is assumed that
378 lengths of all the strings are no more than minlen, thus they
379 cannot come from lookahead.
380 (Or minlen should take into account lookahead.) */
381
2c2d71f5
JH
382/* A failure to find a constant substring means that there is no need to make
383 an expensive call to REx engine, thus we celebrate a failure. Similarly,
384 finding a substring too deep into the string means that less calls to
30944b6d
IZ
385 regtry() should be needed.
386
387 REx compiler's optimizer found 4 possible hints:
388 a) Anchored substring;
389 b) Fixed substring;
390 c) Whether we are anchored (beginning-of-line or \G);
391 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 392 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
393 string which does not contradict any of them.
394 */
2c2d71f5 395
6eb5f6b9
JH
396/* Most of decisions we do here should have been done at compile time.
397 The nodes of the REx which we used for the search should have been
398 deleted from the finite automaton. */
399
cad2e5aa
JH
400char *
401Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
402 char *strend, U32 flags, re_scream_pos_data *data)
403{
b7953727 404 register I32 start_shift = 0;
cad2e5aa 405 /* Should be nonnegative! */
b7953727 406 register I32 end_shift = 0;
2c2d71f5
JH
407 register char *s;
408 register SV *check;
a1933d95 409 char *strbeg;
cad2e5aa 410 char *t;
8c18bf38 411 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 412 I32 ml_anch;
0e2d6244
SS
413 register char *other_last = NULL; /* other substr checked before this */
414 char *check_at = NULL; /* check substr found at this pos */
973ccfc8 415 const I32 multiline = PL_multiline | (prog->reganch & PMf_MULTILINE);
30944b6d 416#ifdef DEBUGGING
c6d79d47
AL
417 const char * const i_strpos = strpos;
418 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 419#endif
2ea5368e 420 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 421
b8d68ded
JH
422 if (prog->reganch & ROPT_UTF8) {
423 DEBUG_r(PerlIO_printf(Perl_debug_log,
424 "UTF-8 regex...\n"));
425 PL_reg_flags |= RF_utf8;
426 }
427
2a782b5b 428 DEBUG_r({
228fe6e6 429 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
430 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
431 strpos;
228fe6e6 432 const int len = PL_reg_match_utf8 ?
b8d68ded 433 strlen(s) : strend - strpos;
2a782b5b
JH
434 if (!PL_colorset)
435 reginitcolors();
b8d68ded
JH
436 if (PL_reg_match_utf8)
437 DEBUG_r(PerlIO_printf(Perl_debug_log,
438 "UTF-8 target...\n"));
2a782b5b 439 PerlIO_printf(Perl_debug_log,
5332c881 440 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
2a782b5b
JH
441 PL_colors[4],PL_colors[5],PL_colors[0],
442 prog->precomp,
443 PL_colors[1],
444 (strlen(prog->precomp) > 60 ? "..." : ""),
445 PL_colors[0],
446 (int)(len > 60 ? 60 : len),
447 s, PL_colors[1],
448 (len > 60 ? "..." : "")
449 );
450 });
cad2e5aa 451
c344f387
JH
452 /* CHR_DIST() would be more correct here but it makes things slow. */
453 if (prog->minlen > strend - strpos) {
a72c7584
JH
454 DEBUG_r(PerlIO_printf(Perl_debug_log,
455 "String too short... [re_intuit_start]\n"));
cad2e5aa 456 goto fail;
2c2d71f5 457 }
a1933d95 458 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 459 PL_regeol = strend;
33b8afdf
JH
460 if (do_utf8) {
461 if (!prog->check_utf8 && prog->check_substr)
462 to_utf8_substr(prog);
463 check = prog->check_utf8;
464 } else {
465 if (!prog->check_substr && prog->check_utf8)
466 to_byte_substr(prog);
467 check = prog->check_substr;
468 }
469 if (check == &PL_sv_undef) {
470 DEBUG_r(PerlIO_printf(Perl_debug_log,
471 "Non-utf string cannot match utf check string\n"));
472 goto fail;
473 }
2c2d71f5 474 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
475 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
476 || ( (prog->reganch & ROPT_ANCH_BOL)
973ccfc8 477 && !multiline ) ); /* Check after \n? */
cad2e5aa 478
7e25d62c
JH
479 if (!ml_anch) {
480 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
481 | ROPT_IMPLICIT)) /* not a real BOL */
fdac8c4b 482 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
483 && sv && !SvROK(sv)
484 && (strpos != strbeg)) {
485 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
486 goto fail;
487 }
488 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 489 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 490 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
491 I32 slen;
492
1aa99e6b 493 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
494 if (SvTAIL(check)) {
495 slen = SvCUR(check); /* >= 1 */
cad2e5aa 496
9041c2e3 497 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
498 || (strend - s == slen && strend[-1] != '\n')) {
499 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
500 goto fail_finish;
cad2e5aa
JH
501 }
502 /* Now should match s[0..slen-2] */
503 slen--;
fdac8c4b 504 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 505 || (slen > 1
fdac8c4b 506 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5
JH
507 report_neq:
508 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
509 goto fail_finish;
510 }
cad2e5aa 511 }
fdac8c4b 512 else if (*SvPVX_const(check) != *s
653099ff 513 || ((slen = SvCUR(check)) > 1
fdac8c4b 514 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 515 goto report_neq;
7b146116 516 check_at = s;
2c2d71f5 517 goto success_at_start;
7e25d62c 518 }
cad2e5aa 519 }
2c2d71f5 520 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 521 s = strpos;
2c2d71f5 522 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 523 end_shift = prog->minlen - start_shift -
653099ff 524 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 525 if (!ml_anch) {
8c18bf38 526 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 527 - (SvTAIL(check) != 0);
8c18bf38 528 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
529
530 if (end_shift < eshift)
531 end_shift = eshift;
532 }
cad2e5aa 533 }
2c2d71f5 534 else { /* Can match at random position */
cad2e5aa
JH
535 ml_anch = 0;
536 s = strpos;
2c2d71f5
JH
537 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
538 /* Should be nonnegative! */
539 end_shift = prog->minlen - start_shift -
653099ff 540 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
541 }
542
2c2d71f5 543#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 544 if (end_shift < 0)
6bbae5e6 545 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
546#endif
547
2c2d71f5
JH
548 restart:
549 /* Find a possible match in the region s..strend by looking for
550 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 551 if (flags & REXEC_SCREAM) {
cad2e5aa 552 I32 p = -1; /* Internal iterator of scream. */
8c18bf38 553 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 554
2c2d71f5
JH
555 if (PL_screamfirst[BmRARE(check)] >= 0
556 || ( BmRARE(check) == '\n'
557 && (BmPREVIOUS(check) == SvCUR(check) - 1)
558 && SvTAIL(check) ))
9041c2e3 559 s = screaminstr(sv, check,
2c2d71f5 560 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 561 else
2c2d71f5 562 goto fail_finish;
88af3bc3
JH
563 /* we may be pointing at the wrong string */
564 if (s && RX_MATCH_COPIED(prog))
fdac8c4b 565 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
566 if (data)
567 *data->scream_olds = s;
568 }
f33976b4 569 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
570 s = fbm_instr((U8*)(s + start_shift),
571 (U8*)(strend - end_shift),
973ccfc8 572 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 573 else
1aa99e6b
IH
574 s = fbm_instr(HOP3(s, start_shift, strend),
575 HOP3(strend, -end_shift, strbeg),
973ccfc8 576 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
577
578 /* Update the count-of-usability, remove useless subpatterns,
579 unshift s. */
2c2d71f5 580
5332c881
AL
581 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint. */
582 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 583 (s ? "Found" : "Did not find"),
33b8afdf 584 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 585 PL_colors[0],
7b0972df 586 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
fdac8c4b 587 SvPVX_const(check),
2c2d71f5
JH
588 PL_colors[1], (SvTAIL(check) ? "$" : ""),
589 (s ? " at offset " : "...\n") ) );
590
591 if (!s)
592 goto fail_finish;
593
6eb5f6b9
JH
594 check_at = s;
595
2c2d71f5 596 /* Finish the diagnostic message */
30944b6d 597 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
598
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
5332c881 602 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
605 */
606
33b8afdf 607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 608 /* Take into account the "other" substring. */
2c2d71f5
JH
609 /* XXXX May be hopelessly wrong for UTF... */
610 if (!other_last)
6eb5f6b9 611 other_last = strpos;
33b8afdf 612 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
613 do_other_anchored:
614 {
c6d79d47
AL
615 char * const last = HOP3c(s, -start_shift, strbeg);
616 char *last1, *last2;
2c2d71f5 617 char *s1 = s;
33b8afdf 618 SV* must;
2c2d71f5 619
2c2d71f5
JH
620 t = s - prog->check_offset_max;
621 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 622 && (!do_utf8
1aa99e6b 623 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 624 && t > strpos)))
30944b6d 625 /* EMPTY */;
2c2d71f5
JH
626 else
627 t = strpos;
1aa99e6b 628 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
629 if (t < other_last) /* These positions already checked */
630 t = other_last;
1aa99e6b 631 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
632 if (last < last1)
633 last1 = last;
634 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
635 /* On end-of-str: see comment below. */
33b8afdf
JH
636 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
637 if (must == &PL_sv_undef) {
638 s = (char*)NULL;
639 DEBUG_r(must = prog->anchored_utf8); /* for debug */
640 }
641 else
642 s = fbm_instr(
643 (unsigned char*)t,
644 HOP3(HOP3(last1, prog->anchored_offset, strend)
645 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
646 must,
973ccfc8 647 multiline ? FBMrf_MULTILINE : 0
33b8afdf 648 );
1aa99e6b 649 DEBUG_r(PerlIO_printf(Perl_debug_log,
5332c881 650 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
651 (s ? "Found" : "Contradicts"),
652 PL_colors[0],
33b8afdf
JH
653 (int)(SvCUR(must)
654 - (SvTAIL(must)!=0)),
fdac8c4b 655 SvPVX_const(must),
33b8afdf 656 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
657 if (!s) {
658 if (last1 >= last2) {
659 DEBUG_r(PerlIO_printf(Perl_debug_log,
660 ", giving up...\n"));
661 goto fail_finish;
662 }
663 DEBUG_r(PerlIO_printf(Perl_debug_log,
664 ", trying floating at offset %ld...\n",
1aa99e6b
IH
665 (long)(HOP3c(s1, 1, strend) - i_strpos)));
666 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
667 s = HOP3c(last, 1, strend);
2c2d71f5
JH
668 goto restart;
669 }
670 else {
671 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 672 (long)(s - i_strpos)));
1aa99e6b
IH
673 t = HOP3c(s, -prog->anchored_offset, strbeg);
674 other_last = HOP3c(s, 1, strend);
30944b6d 675 s = s1;
2c2d71f5
JH
676 if (t == strpos)
677 goto try_at_start;
2c2d71f5
JH
678 goto try_at_offset;
679 }
30944b6d 680 }
2c2d71f5
JH
681 }
682 else { /* Take into account the floating substring. */
33b8afdf
JH
683 char *last, *last1;
684 char *s1 = s;
685 SV* must;
686
687 t = HOP3c(s, -start_shift, strbeg);
688 last1 = last =
689 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
690 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
691 last = HOP3c(t, prog->float_max_offset, strend);
692 s = HOP3c(t, prog->float_min_offset, strend);
693 if (s < other_last)
694 s = other_last;
2c2d71f5 695 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
696 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
697 /* fbm_instr() takes into account exact value of end-of-str
698 if the check is SvTAIL(ed). Since false positives are OK,
699 and end-of-str is not later than strend we are OK. */
700 if (must == &PL_sv_undef) {
701 s = (char*)NULL;
702 DEBUG_r(must = prog->float_utf8); /* for debug message */
703 }
704 else
2c2d71f5 705 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
706 (unsigned char*)last + SvCUR(must)
707 - (SvTAIL(must)!=0),
973ccfc8 708 must, multiline ? FBMrf_MULTILINE : 0);
5332c881
AL
709 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint */
710 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
711 (s ? "Found" : "Contradicts"),
712 PL_colors[0],
713 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
fdac8c4b 714 SvPVX_const(must),
33b8afdf
JH
715 PL_colors[1], (SvTAIL(must) ? "$" : "")));
716 if (!s) {
717 if (last1 == last) {
2c2d71f5 718 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
719 ", giving up...\n"));
720 goto fail_finish;
2c2d71f5 721 }
33b8afdf
JH
722 DEBUG_r(PerlIO_printf(Perl_debug_log,
723 ", trying anchored starting at offset %ld...\n",
724 (long)(s1 + 1 - i_strpos)));
725 other_last = last;
726 s = HOP3c(t, 1, strend);
727 goto restart;
728 }
729 else {
730 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
731 (long)(s - i_strpos)));
732 other_last = s; /* Fix this later. --Hugo */
733 s = s1;
734 if (t == strpos)
735 goto try_at_start;
736 goto try_at_offset;
737 }
2c2d71f5 738 }
cad2e5aa 739 }
2c2d71f5
JH
740
741 t = s - prog->check_offset_max;
2c2d71f5 742 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 743 && (!do_utf8
1aa99e6b
IH
744 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
745 && t > strpos))) {
2c2d71f5
JH
746 /* Fixed substring is found far enough so that the match
747 cannot start at strpos. */
748 try_at_offset:
cad2e5aa 749 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
750 /* Eventually fbm_*() should handle this, but often
751 anchored_offset is not 0, so this check will not be wasted. */
752 /* XXXX In the code below we prefer to look for "^" even in
753 presence of anchored substrings. And we search even
754 beyond the found float position. These pessimizations
755 are historical artefacts only. */
756 find_anchor:
2c2d71f5 757 while (t < strend - prog->minlen) {
cad2e5aa 758 if (*t == '\n') {
4ee3650e 759 if (t < check_at - prog->check_offset_min) {
33b8afdf 760 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
761 /* Since we moved from the found position,
762 we definitely contradict the found anchored
30944b6d
IZ
763 substr. Due to the above check we do not
764 contradict "check" substr.
765 Thus we can arrive here only if check substr
766 is float. Redo checking for "other"=="fixed".
767 */
9041c2e3 768 strpos = t + 1;
30944b6d
IZ
769 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
770 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
771 goto do_other_anchored;
772 }
4ee3650e
GS
773 /* We don't contradict the found floating substring. */
774 /* XXXX Why not check for STCLASS? */
cad2e5aa 775 s = t + 1;
2c2d71f5 776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 777 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
778 goto set_useful;
779 }
4ee3650e
GS
780 /* Position contradicts check-string */
781 /* XXXX probably better to look for check-string
782 than for "\n", so one should lower the limit for t? */
783 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 784 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 785 other_last = strpos = s = t + 1;
cad2e5aa
JH
786 goto restart;
787 }
788 t++;
789 }
2c2d71f5
JH
790 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
791 PL_colors[0],PL_colors[1]));
792 goto fail_finish;
cad2e5aa 793 }
f5952150
GS
794 else {
795 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
796 PL_colors[0],PL_colors[1]));
797 }
cad2e5aa
JH
798 s = t;
799 set_useful:
33b8afdf 800 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
801 }
802 else {
f5952150 803 /* The found string does not prohibit matching at strpos,
2c2d71f5 804 - no optimization of calling REx engine can be performed,
f5952150
GS
805 unless it was an MBOL and we are not after MBOL,
806 or a future STCLASS check will fail this. */
2c2d71f5
JH
807 try_at_start:
808 /* Even in this situation we may use MBOL flag if strpos is offset
809 wrt the start of the string. */
05b4157f 810 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 811 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
812 /* May be due to an implicit anchor of m{.*foo} */
813 && !(prog->reganch & ROPT_IMPLICIT))
814 {
cad2e5aa
JH
815 t = strpos;
816 goto find_anchor;
817 }
30944b6d 818 DEBUG_r( if (ml_anch)
f5952150
GS
819 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
820 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 821 );
2c2d71f5 822 success_at_start:
30944b6d 823 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
824 && (do_utf8 ? (
825 prog->check_utf8 /* Could be deleted already */
826 && --BmUSEFUL(prog->check_utf8) < 0
827 && (prog->check_utf8 == prog->float_utf8)
828 ) : (
829 prog->check_substr /* Could be deleted already */
830 && --BmUSEFUL(prog->check_substr) < 0
831 && (prog->check_substr == prog->float_substr)
832 )))
66e933ab 833 {
cad2e5aa 834 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 835 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
836 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
837 if (do_utf8 ? prog->check_substr : prog->check_utf8)
838 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
0e2d6244
SS
839 prog->check_substr = prog->check_utf8 = NULL; /* disable */
840 prog->float_substr = prog->float_utf8 = NULL; /* clear */
841 check = NULL; /* abort */
cad2e5aa 842 s = strpos;
3cf5c195
IZ
843 /* XXXX This is a remnant of the old implementation. It
844 looks wasteful, since now INTUIT can use many
6eb5f6b9 845 other heuristics. */
cad2e5aa
JH
846 prog->reganch &= ~RE_USE_INTUIT;
847 }
848 else
849 s = strpos;
850 }
851
6eb5f6b9
JH
852 /* Last resort... */
853 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
854 if (prog->regstclass) {
855 /* minlen == 0 is possible if regstclass is \b or \B,
856 and the fixed substr is ''$.
857 Since minlen is already taken into account, s+1 is before strend;
858 accidentally, minlen >= 1 guaranties no false positives at s + 1
859 even for \b or \B. But (minlen? 1 : 0) below assumes that
860 regstclass does not come from lookahead... */
861 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
862 This leaves EXACTF only, which is dealt with in find_byclass(). */
c6d79d47 863 const U8* const str = (U8*)STRING(prog->regstclass);
ec6f298e 864 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
47a91a97
NC
865 ? CHR_DIST((U8 *)str+STR_LEN(prog->regstclass),
866 (U8 *)str)
66e933ab 867 : 1);
8c18bf38 868 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 869 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 870 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
871 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
872 cl_l, strend)
873 : strend);
6eb5f6b9
JH
874
875 t = s;
975adce1 876 cache_re(prog);
ec6f298e 877 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
878 if (!s) {
879#ifdef DEBUGGING
4358d444 880 const char *what = NULL;
6eb5f6b9
JH
881#endif
882 if (endpos == strend) {
883 DEBUG_r( PerlIO_printf(Perl_debug_log,
884 "Could not match STCLASS...\n") );
885 goto fail;
886 }
66e933ab
GS
887 DEBUG_r( PerlIO_printf(Perl_debug_log,
888 "This position contradicts STCLASS...\n") );
653099ff
GS
889 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
890 goto fail;
6eb5f6b9 891 /* Contradict one of substrings */
33b8afdf
JH
892 if (prog->anchored_substr || prog->anchored_utf8) {
893 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
894 DEBUG_r( what = "anchored" );
895 hop_and_restart:
1aa99e6b 896 s = HOP3c(t, 1, strend);
66e933ab
GS
897 if (s + start_shift + end_shift > strend) {
898 /* XXXX Should be taken into account earlier? */
899 DEBUG_r( PerlIO_printf(Perl_debug_log,
900 "Could not match STCLASS...\n") );
901 goto fail;
902 }
5e39e1e5
HS
903 if (!check)
904 goto giveup;
6eb5f6b9 905 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 906 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
907 what, (long)(s + start_shift - i_strpos)) );
908 goto restart;
909 }
66e933ab 910 /* Have both, check_string is floating */
6eb5f6b9
JH
911 if (t + start_shift >= check_at) /* Contradicts floating=check */
912 goto retry_floating_check;
913 /* Recheck anchored substring, but not floating... */
9041c2e3 914 s = check_at;
5e39e1e5
HS
915 if (!check)
916 goto giveup;
6eb5f6b9 917 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 918 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
919 (long)(other_last - i_strpos)) );
920 goto do_other_anchored;
921 }
60e71179
GS
922 /* Another way we could have checked stclass at the
923 current position only: */
924 if (ml_anch) {
925 s = t = t + 1;
5e39e1e5
HS
926 if (!check)
927 goto giveup;
60e71179 928 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
929 "Looking for /%s^%s/m starting at offset %ld...\n",
930 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 931 goto try_at_offset;
66e933ab 932 }
33b8afdf 933 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 934 goto fail;
6eb5f6b9
JH
935 /* Check is floating subtring. */
936 retry_floating_check:
937 t = check_at - start_shift;
938 DEBUG_r( what = "floating" );
939 goto hop_and_restart;
940 }
b7953727
JH
941 if (t != s) {
942 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 943 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
944 (long)(t - i_strpos), (long)(s - i_strpos))
945 );
946 }
947 else {
948 DEBUG_r(PerlIO_printf(Perl_debug_log,
949 "Does not contradict STCLASS...\n");
950 );
951 }
6eb5f6b9 952 }
5e39e1e5
HS
953 giveup:
954 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
955 PL_colors[4], (check ? "Guessed" : "Giving up"),
956 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 957 return s;
2c2d71f5
JH
958
959 fail_finish: /* Substring not found */
33b8afdf
JH
960 if (prog->check_substr || prog->check_utf8) /* could be removed already */
961 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 962 fail:
2c2d71f5 963 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa 964 PL_colors[4],PL_colors[5]));
0e2d6244 965 return NULL;
cad2e5aa 966}
9661b544 967
6eb5f6b9 968/* We know what class REx starts with. Try to find this position... */
3c3eec57 969STATIC char *
8c18bf38 970S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 971{
228fe6e6 972 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 973 char *m;
d8093b23 974 STRLEN ln;
bcefe257 975 STRLEN lnc;
45bfbaf1 976 register STRLEN uskip;
d8093b23
G
977 unsigned int c1;
978 unsigned int c2;
6eb5f6b9
JH
979 char *e;
980 register I32 tmp = 1; /* Scratch variable? */
8c18bf38 981 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 982
6eb5f6b9
JH
983 /* We know what class it must start with. */
984 switch (OP(c)) {
6eb5f6b9 985 case ANYOF:
d46a7032 986 if (do_utf8) {
45bfbaf1 987 while (s + (uskip = UTF8SKIP(s)) <= strend) {
d46a7032
JH
988 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
989 !UTF8_IS_INVARIANT((U8)s[0]) ?
990 reginclass(c, (U8*)s, 0, do_utf8) :
991 REGINCLASS(c, (U8*)s)) {
992 if (tmp && (norun || regtry(prog, s)))
993 goto got_it;
994 else
995 tmp = doevery;
996 }
997 else
998 tmp = 1;
45bfbaf1 999 s += uskip;
d46a7032
JH
1000 }
1001 }
1002 else {
1003 while (s < strend) {
1004 STRLEN skip = 1;
1005
1006 if (REGINCLASS(c, (U8*)s) ||
1007 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1008 /* The assignment of 2 is intentional:
1009 * for the folded sharp s, the skip is 2. */
1010 (skip = SHARP_S_SKIP))) {
1011 if (tmp && (norun || regtry(prog, s)))
1012 goto got_it;
1013 else
1014 tmp = doevery;
1015 }
1016 else
1017 tmp = 1;
1018 s += skip;
1019 }
a0d0e21e 1020 }
6eb5f6b9 1021 break;
f33976b4
DB
1022 case CANY:
1023 while (s < strend) {
1024 if (tmp && (norun || regtry(prog, s)))
1025 goto got_it;
1026 else
1027 tmp = doevery;
1028 s++;
1029 }
1030 break;
6eb5f6b9 1031 case EXACTF:
bcefe257
NIS
1032 m = STRING(c);
1033 ln = STR_LEN(c); /* length to match in octets/bytes */
1034 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1035 if (UTF) {
a2a2844f 1036 STRLEN ulen1, ulen2;
bcefe257 1037 U8 *sm = (U8 *) m;
a2a469f9
JH
1038 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1039 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
f5e9f069 1040 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a2a2844f
JH
1041
1042 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1043 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1044
a2a469f9 1045 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
f5e9f069 1046 0, uniflags);
a2a469f9 1047 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
f5e9f069 1048 0, uniflags);
bcefe257
NIS
1049 lnc = 0;
1050 while (sm < ((U8 *) m + ln)) {
1051 lnc++;
1052 sm += UTF8SKIP(sm);
1053 }
1aa99e6b
IH
1054 }
1055 else {
1056 c1 = *(U8*)m;
1057 c2 = PL_fold[c1];
1058 }
6eb5f6b9
JH
1059 goto do_exactf;
1060 case EXACTFL:
bcefe257
NIS
1061 m = STRING(c);
1062 ln = STR_LEN(c);
1063 lnc = (I32) ln;
d8093b23 1064 c1 = *(U8*)m;
6eb5f6b9
JH
1065 c2 = PL_fold_locale[c1];
1066 do_exactf:
dc904a75 1067 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1068
6eb5f6b9
JH
1069 if (norun && e < s)
1070 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1071
60a8b682
JH
1072 /* The idea in the EXACTF* cases is to first find the
1073 * first character of the EXACTF* node and then, if
1074 * necessary, case-insensitively compare the full
1075 * text of the node. The c1 and c2 are the first
1076 * characters (though in Unicode it gets a bit
1077 * more complicated because there are more cases
7f16dd3d
JH
1078 * than just upper and lower: one needs to use
1079 * the so-called folding case for case-insensitive
1080 * matching (called "loose matching" in Unicode).
1081 * ibcmp_utf8() will do just that. */
60a8b682 1082
1aa99e6b 1083 if (do_utf8) {
575cac57 1084 UV c, f;
a2a469f9 1085 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1086 STRLEN len, foldlen;
f5e9f069 1087 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
09091399 1088 if (c1 == c2) {
bcefe257
NIS
1089 /* Upper and lower of 1st char are equal -
1090 * probably not a "letter". */
1aa99e6b 1091 while (s <= e) {
a2a469f9 1092 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
f5e9f069 1093 uniflags);
80aecb99
JH
1094 if ( c == c1
1095 && (ln == len ||
66423254 1096 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1097 m, (char **)0, ln, (bool)UTF))
55da9344 1098 && (norun || regtry(prog, s)) )
1aa99e6b 1099 goto got_it;
80aecb99 1100 else {
228fe6e6 1101 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1102 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1103 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1104 if ( f != c
1105 && (f == c1 || f == c2)
1106 && (ln == foldlen ||
66423254
JH
1107 !ibcmp_utf8((char *) foldbuf,
1108 (char **)0, foldlen, do_utf8,
d07ddd77 1109 m,
eb160463 1110 (char **)0, ln, (bool)UTF))
80aecb99
JH
1111 && (norun || regtry(prog, s)) )
1112 goto got_it;
1113 }
1aa99e6b
IH
1114 s += len;
1115 }
09091399
JH
1116 }
1117 else {
1aa99e6b 1118 while (s <= e) {
a2a469f9 1119 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
f5e9f069 1120 uniflags);
80aecb99 1121
60a8b682 1122 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1123 * Note that not all the possible combinations
1124 * are handled here: some of them are handled
1125 * by the standard folding rules, and some of
1126 * them (the character class or ANYOF cases)
1127 * are handled during compiletime in
1128 * regexec.c:S_regclass(). */
880bd946
JH
1129 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1130 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1131 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1132
1133 if ( (c == c1 || c == c2)
1134 && (ln == len ||
66423254 1135 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1136 m, (char **)0, ln, (bool)UTF))
55da9344 1137 && (norun || regtry(prog, s)) )
1aa99e6b 1138 goto got_it;
80aecb99 1139 else {
228fe6e6 1140 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1141 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1142 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1143 if ( f != c
1144 && (f == c1 || f == c2)
1145 && (ln == foldlen ||
a6872d42 1146 !ibcmp_utf8((char *) foldbuf,
66423254 1147 (char **)0, foldlen, do_utf8,
d07ddd77 1148 m,
eb160463 1149 (char **)0, ln, (bool)UTF))
80aecb99
JH
1150 && (norun || regtry(prog, s)) )
1151 goto got_it;
1152 }
1aa99e6b
IH
1153 s += len;
1154 }
09091399 1155 }
1aa99e6b
IH
1156 }
1157 else {
1158 if (c1 == c2)
1159 while (s <= e) {
1160 if ( *(U8*)s == c1
1161 && (ln == 1 || !(OP(c) == EXACTF
1162 ? ibcmp(s, m, ln)
1163 : ibcmp_locale(s, m, ln)))
1164 && (norun || regtry(prog, s)) )
1165 goto got_it;
1166 s++;
1167 }
1168 else
1169 while (s <= e) {
1170 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1171 && (ln == 1 || !(OP(c) == EXACTF
1172 ? ibcmp(s, m, ln)
1173 : ibcmp_locale(s, m, ln)))
1174 && (norun || regtry(prog, s)) )
1175 goto got_it;
1176 s++;
1177 }
b3c9acc1
IZ
1178 }
1179 break;
bbce6d69 1180 case BOUNDL:
3280af22 1181 PL_reg_flags |= RF_tainted;
bbce6d69 1182 /* FALL THROUGH */
a0d0e21e 1183 case BOUND:
ffc61ed2 1184 if (do_utf8) {
12d33761 1185 if (s == PL_bostr)
ffc61ed2
JH
1186 tmp = '\n';
1187 else {
b4f7163a 1188 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1189
b4f7163a 1190 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1191 }
1192 tmp = ((OP(c) == BOUND ?
9041c2e3 1193 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
fab1f349 1194 LOAD_UTF8_CHARCLASS_ALNUM();
45bfbaf1 1195 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1196 if (tmp == !(OP(c) == BOUND ?
3568d838 1197 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1198 isALNUM_LC_utf8((U8*)s)))
1199 {
1200 tmp = !tmp;
1201 if ((norun || regtry(prog, s)))
1202 goto got_it;
1203 }
45bfbaf1 1204 s += uskip;
a687059c 1205 }
a0d0e21e 1206 }
667bb95a 1207 else {
12d33761 1208 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1209 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1210 while (s < strend) {
1211 if (tmp ==
1212 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1213 tmp = !tmp;
1214 if ((norun || regtry(prog, s)))
1215 goto got_it;
1216 }
1217 s++;
a0ed51b3 1218 }
a0ed51b3 1219 }
6eb5f6b9 1220 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1221 goto got_it;
1222 break;
bbce6d69 1223 case NBOUNDL:
3280af22 1224 PL_reg_flags |= RF_tainted;
bbce6d69 1225 /* FALL THROUGH */
a0d0e21e 1226 case NBOUND:
ffc61ed2 1227 if (do_utf8) {
12d33761 1228 if (s == PL_bostr)
ffc61ed2
JH
1229 tmp = '\n';
1230 else {
b4f7163a 1231 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1232
b4f7163a 1233 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1234 }
1235 tmp = ((OP(c) == NBOUND ?
9041c2e3 1236 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
fab1f349 1237 LOAD_UTF8_CHARCLASS_ALNUM();
45bfbaf1 1238 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1239 if (tmp == !(OP(c) == NBOUND ?
3568d838 1240 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1241 isALNUM_LC_utf8((U8*)s)))
1242 tmp = !tmp;
1243 else if ((norun || regtry(prog, s)))
1244 goto got_it;
45bfbaf1 1245 s += uskip;
ffc61ed2 1246 }
a0d0e21e 1247 }
667bb95a 1248 else {
12d33761 1249 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1250 tmp = ((OP(c) == NBOUND ?
1251 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1252 while (s < strend) {
1253 if (tmp ==
1254 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1255 tmp = !tmp;
1256 else if ((norun || regtry(prog, s)))
1257 goto got_it;
1258 s++;
1259 }
a0ed51b3 1260 }
6eb5f6b9 1261 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1262 goto got_it;
1263 break;
a0d0e21e 1264 case ALNUM:
ffc61ed2 1265 if (do_utf8) {
fab1f349 1266 LOAD_UTF8_CHARCLASS_ALNUM();
45bfbaf1 1267 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1268 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1269 if (tmp && (norun || regtry(prog, s)))
1270 goto got_it;
1271 else
1272 tmp = doevery;
1273 }
bbce6d69 1274 else
ffc61ed2 1275 tmp = 1;
45bfbaf1 1276 s += uskip;
bbce6d69 1277 }
bbce6d69 1278 }
ffc61ed2
JH
1279 else {
1280 while (s < strend) {
1281 if (isALNUM(*s)) {
1282 if (tmp && (norun || regtry(prog, s)))
1283 goto got_it;
1284 else
1285 tmp = doevery;
1286 }
a0ed51b3 1287 else
ffc61ed2
JH
1288 tmp = 1;
1289 s++;
a0ed51b3 1290 }
a0ed51b3
LW
1291 }
1292 break;
bbce6d69 1293 case ALNUML:
3280af22 1294 PL_reg_flags |= RF_tainted;
ffc61ed2 1295 if (do_utf8) {
45bfbaf1 1296 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1297 if (isALNUM_LC_utf8((U8*)s)) {
1298 if (tmp && (norun || regtry(prog, s)))
1299 goto got_it;
1300 else
1301 tmp = doevery;
1302 }
a687059c 1303 else
ffc61ed2 1304 tmp = 1;
45bfbaf1 1305 s += uskip;
a0d0e21e 1306 }
a0d0e21e 1307 }
ffc61ed2
JH
1308 else {
1309 while (s < strend) {
1310 if (isALNUM_LC(*s)) {
1311 if (tmp && (norun || regtry(prog, s)))
1312 goto got_it;
1313 else
1314 tmp = doevery;
1315 }
a0ed51b3 1316 else
ffc61ed2
JH
1317 tmp = 1;
1318 s++;
a0ed51b3 1319 }
a0ed51b3
LW
1320 }
1321 break;
a0d0e21e 1322 case NALNUM:
ffc61ed2 1323 if (do_utf8) {
fab1f349 1324 LOAD_UTF8_CHARCLASS_ALNUM();
45bfbaf1 1325 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1326 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1327 if (tmp && (norun || regtry(prog, s)))
1328 goto got_it;
1329 else
1330 tmp = doevery;
1331 }
bbce6d69 1332 else
ffc61ed2 1333 tmp = 1;
45bfbaf1 1334 s += uskip;
bbce6d69 1335 }
bbce6d69 1336 }
ffc61ed2
JH
1337 else {
1338 while (s < strend) {
1339 if (!isALNUM(*s)) {
1340 if (tmp && (norun || regtry(prog, s)))
1341 goto got_it;
1342 else
1343 tmp = doevery;
1344 }
a0ed51b3 1345 else
ffc61ed2
JH
1346 tmp = 1;
1347 s++;
a0ed51b3 1348 }
a0ed51b3
LW
1349 }
1350 break;
bbce6d69 1351 case NALNUML:
3280af22 1352 PL_reg_flags |= RF_tainted;
ffc61ed2 1353 if (do_utf8) {
45bfbaf1 1354 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1355 if (!isALNUM_LC_utf8((U8*)s)) {
1356 if (tmp && (norun || regtry(prog, s)))
1357 goto got_it;
1358 else
1359 tmp = doevery;
1360 }
a687059c 1361 else
ffc61ed2 1362 tmp = 1;
45bfbaf1 1363 s += uskip;
a687059c 1364 }
a0d0e21e 1365 }
ffc61ed2
JH
1366 else {
1367 while (s < strend) {
1368 if (!isALNUM_LC(*s)) {
1369 if (tmp && (norun || regtry(prog, s)))
1370 goto got_it;
1371 else
1372 tmp = doevery;
1373 }
a0ed51b3 1374 else
ffc61ed2
JH
1375 tmp = 1;
1376 s++;
a0ed51b3 1377 }
a0ed51b3
LW
1378 }
1379 break;
a0d0e21e 1380 case SPACE:
ffc61ed2 1381 if (do_utf8) {
fab1f349 1382 LOAD_UTF8_CHARCLASS_SPACE();
45bfbaf1 1383 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1384 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1385 if (tmp && (norun || regtry(prog, s)))
1386 goto got_it;
1387 else
1388 tmp = doevery;
1389 }
a0d0e21e 1390 else
ffc61ed2 1391 tmp = 1;
45bfbaf1 1392 s += uskip;
2304df62 1393 }
a0d0e21e 1394 }
ffc61ed2
JH
1395 else {
1396 while (s < strend) {
1397 if (isSPACE(*s)) {
1398 if (tmp && (norun || regtry(prog, s)))
1399 goto got_it;
1400 else
1401 tmp = doevery;
1402 }
a0ed51b3 1403 else
ffc61ed2
JH
1404 tmp = 1;
1405 s++;
a0ed51b3 1406 }
a0ed51b3
LW
1407 }
1408 break;
bbce6d69 1409 case SPACEL:
3280af22 1410 PL_reg_flags |= RF_tainted;
ffc61ed2 1411 if (do_utf8) {
45bfbaf1 1412 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1413 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1414 if (tmp && (norun || regtry(prog, s)))
1415 goto got_it;
1416 else
1417 tmp = doevery;
1418 }
bbce6d69 1419 else
ffc61ed2 1420 tmp = 1;
45bfbaf1 1421 s += uskip;
bbce6d69 1422 }
bbce6d69 1423 }
ffc61ed2
JH
1424 else {
1425 while (s < strend) {
1426 if (isSPACE_LC(*s)) {
1427 if (tmp && (norun || regtry(prog, s)))
1428 goto got_it;
1429 else
1430 tmp = doevery;
1431 }
a0ed51b3 1432 else
ffc61ed2
JH
1433 tmp = 1;
1434 s++;
a0ed51b3 1435 }
a0ed51b3
LW
1436 }
1437 break;
a0d0e21e 1438 case NSPACE:
ffc61ed2 1439 if (do_utf8) {
fab1f349 1440 LOAD_UTF8_CHARCLASS_SPACE();
45bfbaf1 1441 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1442 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1443 if (tmp && (norun || regtry(prog, s)))
1444 goto got_it;
1445 else
1446 tmp = doevery;
1447 }
a0d0e21e 1448 else
ffc61ed2 1449 tmp = 1;
45bfbaf1 1450 s += uskip;
a687059c 1451 }
a0d0e21e 1452 }
ffc61ed2
JH
1453 else {
1454 while (s < strend) {
1455 if (!isSPACE(*s)) {
1456 if (tmp && (norun || regtry(prog, s)))
1457 goto got_it;
1458 else
1459 tmp = doevery;
1460 }
a0ed51b3 1461 else
ffc61ed2
JH
1462 tmp = 1;
1463 s++;
a0ed51b3 1464 }
a0ed51b3
LW
1465 }
1466 break;
bbce6d69 1467 case NSPACEL:
3280af22 1468 PL_reg_flags |= RF_tainted;
ffc61ed2 1469 if (do_utf8) {
45bfbaf1 1470 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1471 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1472 if (tmp && (norun || regtry(prog, s)))
1473 goto got_it;
1474 else
1475 tmp = doevery;
1476 }
bbce6d69 1477 else
ffc61ed2 1478 tmp = 1;
45bfbaf1 1479 s += uskip;
bbce6d69 1480 }
bbce6d69 1481 }
ffc61ed2
JH
1482 else {
1483 while (s < strend) {
1484 if (!isSPACE_LC(*s)) {
1485 if (tmp && (norun || regtry(prog, s)))
1486 goto got_it;
1487 else
1488 tmp = doevery;
1489 }
a0ed51b3 1490 else
ffc61ed2
JH
1491 tmp = 1;
1492 s++;
a0ed51b3 1493 }
a0ed51b3
LW
1494 }
1495 break;
a0d0e21e 1496 case DIGIT:
ffc61ed2 1497 if (do_utf8) {
fab1f349 1498 LOAD_UTF8_CHARCLASS_DIGIT();
45bfbaf1 1499 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1500 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1501 if (tmp && (norun || regtry(prog, s)))
1502 goto got_it;
1503 else
1504 tmp = doevery;
1505 }
a0d0e21e 1506 else
ffc61ed2 1507 tmp = 1;
45bfbaf1 1508 s += uskip;
2b69d0c2 1509 }
a0d0e21e 1510 }
ffc61ed2
JH
1511 else {
1512 while (s < strend) {
1513 if (isDIGIT(*s)) {
1514 if (tmp && (norun || regtry(prog, s)))
1515 goto got_it;
1516 else
1517 tmp = doevery;
1518 }
a0ed51b3 1519 else
ffc61ed2
JH
1520 tmp = 1;
1521 s++;
a0ed51b3 1522 }
a0ed51b3
LW
1523 }
1524 break;
b8c5462f
JH
1525 case DIGITL:
1526 PL_reg_flags |= RF_tainted;
ffc61ed2 1527 if (do_utf8) {
45bfbaf1 1528 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1529 if (isDIGIT_LC_utf8((U8*)s)) {
1530 if (tmp && (norun || regtry(prog, s)))
1531 goto got_it;
1532 else
1533 tmp = doevery;
1534 }
b8c5462f 1535 else
ffc61ed2 1536 tmp = 1;
45bfbaf1 1537 s += uskip;
b8c5462f 1538 }
b8c5462f 1539 }
ffc61ed2
JH
1540 else {
1541 while (s < strend) {
1542 if (isDIGIT_LC(*s)) {
1543 if (tmp && (norun || regtry(prog, s)))
1544 goto got_it;
1545 else
1546 tmp = doevery;
1547 }
b8c5462f 1548 else
ffc61ed2
JH
1549 tmp = 1;
1550 s++;
b8c5462f 1551 }
b8c5462f
JH
1552 }
1553 break;
a0d0e21e 1554 case NDIGIT:
ffc61ed2 1555 if (do_utf8) {
fab1f349 1556 LOAD_UTF8_CHARCLASS_DIGIT();
45bfbaf1 1557 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1558 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1559 if (tmp && (norun || regtry(prog, s)))
1560 goto got_it;
1561 else
1562 tmp = doevery;
1563 }
a0d0e21e 1564 else
ffc61ed2 1565 tmp = 1;
45bfbaf1 1566 s += uskip;
a687059c 1567 }
a0d0e21e 1568 }
ffc61ed2
JH
1569 else {
1570 while (s < strend) {
1571 if (!isDIGIT(*s)) {
1572 if (tmp && (norun || regtry(prog, s)))
1573 goto got_it;
1574 else
1575 tmp = doevery;
1576 }
a0ed51b3 1577 else
ffc61ed2
JH
1578 tmp = 1;
1579 s++;
a0ed51b3 1580 }
a0ed51b3
LW
1581 }
1582 break;
b8c5462f
JH
1583 case NDIGITL:
1584 PL_reg_flags |= RF_tainted;
ffc61ed2 1585 if (do_utf8) {
45bfbaf1 1586 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1587 if (!isDIGIT_LC_utf8((U8*)s)) {
1588 if (tmp && (norun || regtry(prog, s)))
1589 goto got_it;
1590 else
1591 tmp = doevery;
1592 }
b8c5462f 1593 else
ffc61ed2 1594 tmp = 1;
45bfbaf1 1595 s += uskip;
b8c5462f 1596 }
a0ed51b3 1597 }
ffc61ed2
JH
1598 else {
1599 while (s < strend) {
1600 if (!isDIGIT_LC(*s)) {
1601 if (tmp && (norun || regtry(prog, s)))
1602 goto got_it;
1603 else
1604 tmp = doevery;
1605 }
cf93c79d 1606 else
ffc61ed2
JH
1607 tmp = 1;
1608 s++;
b8c5462f 1609 }
b8c5462f
JH
1610 }
1611 break;
b3c9acc1 1612 default:
3c3eec57
GS
1613 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1614 break;
d6a28714 1615 }
6eb5f6b9
JH
1616 return 0;
1617 got_it:
1618 return s;
1619}
1620
1621/*
1622 - regexec_flags - match a regexp against a string
1623 */
1624I32
1625Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1626 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1627/* strend: pointer to null at end of string */
1628/* strbeg: real beginning of string */
1629/* minend: end of match must be >=minend after stringarg. */
1630/* data: May be used for some additional optimizations. */
1631/* nosave: For optimizations. */
1632{
6eb5f6b9
JH
1633 register char *s;
1634 register regnode *c;
1635 register char *startpos = stringarg;
6eb5f6b9
JH
1636 I32 minlen; /* must match at least this many chars */
1637 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1638 I32 end_shift = 0; /* Same for the end. */ /* CC */
1639 I32 scream_pos = -1; /* Internal iterator of scream. */
1640 char *scream_olds;
1641 SV* oreplsv = GvSV(PL_replgv);
228fe6e6 1642 const bool do_utf8 = DO_UTF8(sv);
973ccfc8 1643 const I32 multiline = PL_multiline | (prog->reganch & PMf_MULTILINE);
2a782b5b 1644#ifdef DEBUGGING
9e55ce06
JH
1645 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1646 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1647#endif
a2592645 1648 PERL_UNUSED_ARG(data);
2ea5368e 1649 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1650
1651 PL_regcc = 0;
1652
1653 cache_re(prog);
1654#ifdef DEBUGGING
aea4f609 1655 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1656#endif
1657
1658 /* Be paranoid... */
1659 if (prog == NULL || startpos == NULL) {
1660 Perl_croak(aTHX_ "NULL regexp parameter");
1661 return 0;
1662 }
1663
1664 minlen = prog->minlen;
61a36c01 1665 if (strend - startpos < minlen) {
a72c7584
JH
1666 DEBUG_r(PerlIO_printf(Perl_debug_log,
1667 "String too short [regexec_flags]...\n"));
1668 goto phooey;
1aa99e6b 1669 }
6eb5f6b9 1670
6eb5f6b9
JH
1671 /* Check validity of program. */
1672 if (UCHARAT(prog->program) != REG_MAGIC) {
1673 Perl_croak(aTHX_ "corrupted regexp program");
1674 }
1675
1676 PL_reg_flags = 0;
1677 PL_reg_eval_set = 0;
1678 PL_reg_maxiter = 0;
1679
1680 if (prog->reganch & ROPT_UTF8)
1681 PL_reg_flags |= RF_utf8;
1682
1683 /* Mark beginning of line for ^ and lookbehind. */
1684 PL_regbol = startpos;
1685 PL_bostr = strbeg;
1686 PL_reg_sv = sv;
1687
1688 /* Mark end of line for $ (and such) */
1689 PL_regeol = strend;
1690
1691 /* see how far we have to get to not match where we matched before */
1692 PL_regtill = startpos+minend;
1693
1694 /* We start without call_cc context. */
1695 PL_reg_call_cc = 0;
1696
1697 /* If there is a "must appear" string, look for it. */
1698 s = startpos;
1699
1700 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1701 MAGIC *mg;
1702
1703 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1704 PL_reg_ganch = startpos;
1705 else if (sv && SvTYPE(sv) >= SVt_PVMG
1706 && SvMAGIC(sv)
14befaf4
DM
1707 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1708 && mg->mg_len >= 0) {
6eb5f6b9
JH
1709 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1710 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1711 if (s > PL_reg_ganch)
6eb5f6b9
JH
1712 goto phooey;
1713 s = PL_reg_ganch;
1714 }
1715 }
1716 else /* pos() not defined */
1717 PL_reg_ganch = strbeg;
1718 }
1719
0e2d6244 1720 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1721 re_scream_pos_data d;
1722
1723 d.scream_olds = &scream_olds;
1724 d.scream_pos = &scream_pos;
1725 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1726 if (!s) {
1727 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1728 goto phooey; /* not present */
3fa9c3d7 1729 }
6eb5f6b9
JH
1730 }
1731
2a782b5b 1732 DEBUG_r({
228fe6e6
AL
1733 const char * const s0 = UTF
1734 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1735 UNI_DISPLAY_REGEX)
1736 : prog->precomp;
1737 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1738 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1739 UNI_DISPLAY_REGEX) : startpos;
228fe6e6 1740 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1741 if (!PL_colorset)
1742 reginitcolors();
1743 PerlIO_printf(Perl_debug_log,
5332c881 1744 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
2a782b5b 1745 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1746 len0, len0, s0,
2a782b5b 1747 PL_colors[1],
9e55ce06 1748 len0 > 60 ? "..." : "",
2a782b5b 1749 PL_colors[0],
9e55ce06
JH
1750 (int)(len1 > 60 ? 60 : len1),
1751 s1, PL_colors[1],
1752 (len1 > 60 ? "..." : "")
2a782b5b
JH
1753 );
1754 });
6eb5f6b9
JH
1755
1756 /* Simplest case: anchored match need be tried only once. */
1757 /* [unless only anchor is BOL and multiline is set] */
1758 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1759 if (s == startpos && regtry(prog, startpos))
1760 goto got_it;
973ccfc8 1761 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1762 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1763 {
1764 char *end;
1765
1766 if (minlen)
1767 dontbother = minlen - 1;
1aa99e6b 1768 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1769 /* for multiline we only have to try after newlines */
33b8afdf 1770 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1771 if (s == startpos)
1772 goto after_try;
1773 while (1) {
1774 if (regtry(prog, s))
1775 goto got_it;
1776 after_try:
1777 if (s >= end)
1778 goto phooey;
1779 if (prog->reganch & RE_USE_INTUIT) {
1780 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1781 if (!s)
1782 goto phooey;
1783 }
1784 else
1785 s++;
1786 }
1787 } else {
1788 if (s > startpos)
1789 s--;
1790 while (s < end) {
1791 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1792 if (regtry(prog, s))
1793 goto got_it;
1794 }
1795 }
1796 }
1797 }
1798 goto phooey;
1799 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1800 if (regtry(prog, PL_reg_ganch))
1801 goto got_it;
1802 goto phooey;
1803 }
1804
1805 /* Messy cases: unanchored match. */
33b8afdf 1806 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1807 /* we have /x+whatever/ */
1808 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1809 char ch;
bf93d4cc
GS
1810#ifdef DEBUGGING
1811 int did_match = 0;
1812#endif
33b8afdf
JH
1813 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1814 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
fdac8c4b 1815 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1816
1aa99e6b 1817 if (do_utf8) {
6eb5f6b9
JH
1818 while (s < strend) {
1819 if (*s == ch) {
bf93d4cc 1820 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1821 if (regtry(prog, s)) goto got_it;
1822 s += UTF8SKIP(s);
1823 while (s < strend && *s == ch)
1824 s += UTF8SKIP(s);
1825 }
1826 s += UTF8SKIP(s);
1827 }
1828 }
1829 else {
1830 while (s < strend) {
1831 if (*s == ch) {
bf93d4cc 1832 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1833 if (regtry(prog, s)) goto got_it;
1834 s++;
1835 while (s < strend && *s == ch)
1836 s++;
1837 }
1838 s++;
1839 }
1840 }
b7953727 1841 DEBUG_r(if (!did_match)
bf93d4cc 1842 PerlIO_printf(Perl_debug_log,
b7953727
JH
1843 "Did not find anchored character...\n")
1844 );
6eb5f6b9 1845 }
0e2d6244
SS
1846 else if (prog->anchored_substr != NULL
1847 || prog->anchored_utf8 != NULL
1848 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1849 && prog->float_max_offset < strend - s)) {
1850 SV *must;
1851 I32 back_max;
1852 I32 back_min;
1853 char *last;
6eb5f6b9 1854 char *last1; /* Last position checked before */
bf93d4cc
GS
1855#ifdef DEBUGGING
1856 int did_match = 0;
1857#endif
33b8afdf
JH
1858 if (prog->anchored_substr || prog->anchored_utf8) {
1859 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1860 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1861 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1862 back_max = back_min = prog->anchored_offset;
1863 } else {
1864 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1865 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1866 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1867 back_max = prog->float_max_offset;
1868 back_min = prog->float_min_offset;
1869 }
1870 if (must == &PL_sv_undef)
1871 /* could not downgrade utf8 check substring, so must fail */
1872 goto phooey;
1873
1874 last = HOP3c(strend, /* Cannot start after this */
1875 -(I32)(CHR_SVLEN(must)
1876 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1877
1878 if (s > PL_bostr)
1879 last1 = HOPc(s, -1);
1880 else
1881 last1 = s - 1; /* bogus */
1882
5332c881 1883 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1884 check_substr==must. */
1885 scream_pos = -1;
1886 dontbother = end_shift;
1887 strend = HOPc(strend, -dontbother);
1888 while ( (s <= last) &&
9041c2e3 1889 ((flags & REXEC_SCREAM)
1aa99e6b 1890 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1891 end_shift, &scream_pos, 0))
1aa99e6b 1892 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1893 (unsigned char*)strend, must,
973ccfc8 1894 multiline ? FBMrf_MULTILINE : 0))) ) {
88af3bc3
JH
1895 /* we may be pointing at the wrong string */
1896 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
fdac8c4b 1897 s = strbeg + (s - SvPVX_const(sv));
bf93d4cc 1898 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1899 if (HOPc(s, -back_max) > last1) {
1900 last1 = HOPc(s, -back_min);
1901 s = HOPc(s, -back_max);
1902 }
1903 else {
1904 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1905
1906 last1 = HOPc(s, -back_min);
1907 s = t;
1908 }
1aa99e6b 1909 if (do_utf8) {
6eb5f6b9
JH
1910 while (s <= last1) {
1911 if (regtry(prog, s))
1912 goto got_it;
1913 s += UTF8SKIP(s);
1914 }
1915 }
1916 else {
1917 while (s <= last1) {
1918 if (regtry(prog, s))
1919 goto got_it;
1920 s++;
1921 }
1922 }
1923 }
b7953727
JH
1924 DEBUG_r(if (!did_match)
1925 PerlIO_printf(Perl_debug_log,
5332c881 1926 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1927 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1928 ? "anchored" : "floating"),
1929 PL_colors[0],
1930 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
fdac8c4b 1931 SvPVX_const(must),
b7953727
JH
1932 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1933 );
6eb5f6b9
JH
1934 goto phooey;
1935 }
155aba94 1936 else if ((c = prog->regstclass)) {
e19fe81c
JH
1937 if (minlen) {
1938 I32 op = (U8)OP(prog->regstclass);
66e933ab 1939 /* don't bother with what can't match */
e19fe81c
JH
1940 if (PL_regkind[op] != EXACT && op != CANY)
1941 strend = HOPc(strend, -(minlen - 1));
1942 }
ffc61ed2
JH
1943 DEBUG_r({
1944 SV *prop = sv_newmortal();
71a0dd65
NC
1945 const char *s0;
1946 const char *s1;
9e55ce06
JH
1947 int len0;
1948 int len1;
1949
ffc61ed2 1950 regprop(prop, c);
9e55ce06 1951 s0 = UTF ?
fdac8c4b 1952 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1953 UNI_DISPLAY_REGEX) :
71a0dd65 1954 SvPVX_const(prop);
9e55ce06
JH
1955 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1956 s1 = UTF ?
c728cb41 1957 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1958 len1 = UTF ? SvCUR(dsv1) : strend - s;
1959 PerlIO_printf(Perl_debug_log,
5332c881 1960 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1961 len0, len0, s0,
1962 len1, len1, s1);
ffc61ed2 1963 });
ec6f298e 1964 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1965 goto got_it;
bf93d4cc 1966 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1967 }
1968 else {
1969 dontbother = 0;
0e2d6244 1970 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1971 /* Trim the end. */
d6a28714 1972 char *last;
33b8afdf
JH
1973 SV* float_real;
1974
1975 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1976 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1977 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1978
1979 if (flags & REXEC_SCREAM) {
33b8afdf 1980 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1981 end_shift, &scream_pos, 1); /* last one */
1982 if (!last)
ffc61ed2 1983 last = scream_olds; /* Only one occurrence. */
88af3bc3
JH
1984 /* we may be pointing at the wrong string */
1985 else if (RX_MATCH_COPIED(prog))
fdac8c4b 1986 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1987 }
d6a28714
JH
1988 else {
1989 STRLEN len;
71a0dd65 1990 const char * const little = SvPV_const(float_real, len);
d6a28714 1991
33b8afdf 1992 if (SvTAIL(float_real)) {
d6a28714
JH
1993 if (memEQ(strend - len + 1, little, len - 1))
1994 last = strend - len + 1;
973ccfc8 1995 else if (!multiline)
9041c2e3 1996 last = memEQ(strend - len, little, len)
0e2d6244 1997 ? strend - len : NULL;
b8c5462f 1998 else
d6a28714
JH
1999 goto find_last;
2000 } else {
2001 find_last:
9041c2e3 2002 if (len)
d6a28714 2003 last = rninstr(s, strend, little, little + len);
b8c5462f 2004 else
5332c881 2005 last = strend; /* matching "$" */
b8c5462f 2006 }
b8c5462f 2007 }
bf93d4cc
GS
2008 if (last == NULL) {
2009 DEBUG_r(PerlIO_printf(Perl_debug_log,
2010 "%sCan't trim the tail, match fails (should not happen)%s\n",
2011 PL_colors[4],PL_colors[5]));
2012 goto phooey; /* Should not happen! */
2013 }
d6a28714
JH
2014 dontbother = strend - last + prog->float_min_offset;
2015 }
2016 if (minlen && (dontbother < minlen))
2017 dontbother = minlen - 1;
2018 strend -= dontbother; /* this one's always in bytes! */
2019 /* We don't know much -- general case. */
1aa99e6b 2020 if (do_utf8) {
d6a28714
JH
2021 for (;;) {
2022 if (regtry(prog, s))
2023 goto got_it;
2024 if (s >= strend)
2025 break;
b8c5462f 2026 s += UTF8SKIP(s);
d6a28714
JH
2027 };
2028 }
2029 else {
2030 do {
2031 if (regtry(prog, s))
2032 goto got_it;
2033 } while (s++ < strend);
2034 }
2035 }
2036
2037 /* Failure. */
2038 goto phooey;
2039
2040got_it:
2041 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2042
2043 if (PL_reg_eval_set) {
2044 /* Preserve the current value of $^R */
2045 if (oreplsv != GvSV(PL_replgv))
2046 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2047 restored, the value remains
2048 the same. */
acfe0abc 2049 restore_pos(aTHX_ 0);
d6a28714
JH
2050 }
2051
2052 /* make sure $`, $&, $', and $digit will work later */
2053 if ( !(flags & REXEC_NOT_FIRST) ) {
2054 if (RX_MATCH_COPIED(prog)) {
2055 Safefree(prog->subbeg);
2056 RX_MATCH_COPIED_off(prog);
2057 }
2058 if (flags & REXEC_COPY_STR) {
2059 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2060
2061 s = savepvn(strbeg, i);
2062 prog->subbeg = s;
2063 prog->sublen = i;
2064 RX_MATCH_COPIED_on(prog);
2065 }
2066 else {
2067 prog->subbeg = strbeg;
2068 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2069 }
2070 }
9041c2e3 2071
d6a28714
JH
2072 return 1;
2073
2074phooey:
bf93d4cc
GS
2075 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2076 PL_colors[4],PL_colors[5]));
d6a28714 2077 if (PL_reg_eval_set)
acfe0abc 2078 restore_pos(aTHX_ 0);
d6a28714
JH
2079 return 0;
2080}
2081
2082/*
2083 - regtry - try match at specific point
2084 */
2085STATIC I32 /* 0 failure, 1 success */
2086S_regtry(pTHX_ regexp *prog, char *startpos)
2087{
d6a28714
JH
2088 register I32 i;
2089 register I32 *sp;
2090 register I32 *ep;
2091 CHECKPOINT lastcp;
2092
02db2b7b
IZ
2093#ifdef DEBUGGING
2094 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2095#endif
d6a28714
JH
2096 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2097 MAGIC *mg;
2098
2099 PL_reg_eval_set = RS_init;
2100 DEBUG_r(DEBUG_s(
b900a521
JH
2101 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2102 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2103 ));
e8347627 2104 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2105 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2106 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2107 SAVETMPS;
2108 /* Apparently this is not needed, judging by wantarray. */
e8347627 2109 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2110 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2111
2112 if (PL_reg_sv) {
2113 /* Make $_ available to executed code. */
2114 if (PL_reg_sv != DEFSV) {
4d1ff10f 2115 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2116 SAVESPTR(DEFSV);
2117 DEFSV = PL_reg_sv;
b8c5462f 2118 }
d6a28714 2119
9041c2e3 2120 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2121 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2122 /* prepare for quick setting of pos */
14befaf4 2123 sv_magic(PL_reg_sv, (SV*)0,
0e2d6244 2124 PERL_MAGIC_regex_global, NULL, 0);
14befaf4 2125 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2126 mg->mg_len = -1;
b8c5462f 2127 }
d6a28714
JH
2128 PL_reg_magic = mg;
2129 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2130 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2131 }
09687e5a 2132 if (!PL_reg_curpm) {
cd7a8267 2133 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2134#ifdef USE_ITHREADS
2135 {
2136 SV* repointer = newSViv(0);
577e12cc 2137 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2138 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2139 av_push(PL_regex_padav,repointer);
2140 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2141 PL_regex_pad = AvARRAY(PL_regex_padav);
2142 }
2143#endif
2144 }
aaa362c4 2145 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2146 PL_reg_oldcurpm = PL_curpm;
2147 PL_curpm = PL_reg_curpm;
2148 if (RX_MATCH_COPIED(prog)) {
2149 /* Here is a serious problem: we cannot rewrite subbeg,
2150 since it may be needed if this match fails. Thus
2151 $` inside (?{}) could fail... */
2152 PL_reg_oldsaved = prog->subbeg;
2153 PL_reg_oldsavedlen = prog->sublen;
2154 RX_MATCH_COPIED_off(prog);
2155 }
2156 else
0e2d6244 2157 PL_reg_oldsaved = NULL;
d6a28714
JH
2158 prog->subbeg = PL_bostr;
2159 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2160 }
2161 prog->startp[0] = startpos - PL_bostr;
2162 PL_reginput = startpos;
2163 PL_regstartp = prog->startp;
2164 PL_regendp = prog->endp;
2165 PL_reglastparen = &prog->lastparen;
a01268b5 2166 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2167 prog->lastparen = 0;
4f4e7967 2168 prog->lastcloseparen = 0;
d6a28714
JH
2169 PL_regsize = 0;
2170 DEBUG_r(PL_reg_starttry = startpos);
2171 if (PL_reg_start_tmpl <= prog->nparens) {
2172 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2173 if(PL_reg_start_tmp)
2174 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2175 else
cd7a8267 2176 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2177 }
2178
2179 /* XXXX What this code is doing here?!!! There should be no need
2180 to do this again and again, PL_reglastparen should take care of
3dd2943c 2181 this! --ilya*/
dafc8851
JH
2182
2183 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2184 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2185 * PL_reglastparen), is not needed at all by the test suite
2186 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2187 * enough, for building DynaLoader, or otherwise this
2188 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2189 * will happen. Meanwhile, this code *is* needed for the
2190 * above-mentioned test suite tests to succeed. The common theme
2191 * on those tests seems to be returning null fields from matches.
2192 * --jhi */
dafc8851 2193#if 1
d6a28714
JH
2194 sp = prog->startp;
2195 ep = prog->endp;
2196 if (prog->nparens) {
eb160463 2197 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2198 *++sp = -1;
2199 *++ep = -1;
2200 }
2201 }
dafc8851 2202#endif
02db2b7b 2203 REGCP_SET(lastcp);
d6a28714
JH
2204 if (regmatch(prog->program + 1)) {
2205 prog->endp[0] = PL_reginput - PL_bostr;
2206 return 1;
2207 }
02db2b7b 2208 REGCP_UNWIND(lastcp);
d6a28714
JH
2209 return 0;
2210}
2211
02db2b7b
IZ
2212#define RE_UNWIND_BRANCH 1
2213#define RE_UNWIND_BRANCHJ 2
2214
2215union re_unwind_t;
2216
2217typedef struct { /* XX: makes sense to enlarge it... */
2218 I32 type;
2219 I32 prev;
2220 CHECKPOINT lastcp;
2221} re_unwind_generic_t;
2222
2223typedef struct {
2224 I32 type;
2225 I32 prev;
2226 CHECKPOINT lastcp;
2227 I32 lastparen;
2228 regnode *next;
2229 char *locinput;
2230 I32 nextchr;
2231#ifdef DEBUGGING
2232 int regindent;
2233#endif
2234} re_unwind_branch_t;
2235
2236typedef union re_unwind_t {
2237 I32 type;
2238 re_unwind_generic_t generic;
2239 re_unwind_branch_t branch;
2240} re_unwind_t;
2241
8ba1375e
MJD
2242#define sayYES goto yes
2243#define sayNO goto no
e0f9d4a8 2244#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2245#define sayYES_FINAL goto yes_final
2246#define sayYES_LOUD goto yes_loud
2247#define sayNO_FINAL goto no_final
2248#define sayNO_SILENT goto do_no
2249#define saySAME(x) if (x) goto yes; else goto no
2250
6e00f9c6
HS
2251#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2252#define POSCACHE_SEEN 1 /* we know what we're caching */
2253#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2254#define CACHEsayYES STMT_START { \
2255 if (cache_offset | cache_bit) { \
2256 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2257 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2258 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2259 /* cache records failure, but this is success */ \
2260 DEBUG_r( \
2261 PerlIO_printf(Perl_debug_log, \
2262 "%*s (remove success from failure cache)\n", \
2263 REPORT_CODE_OFF+PL_regindent*2, "") \
2264 ); \
2265 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2266 } \
2267 } \
2268 sayYES; \
2269} STMT_END
2270#define CACHEsayNO STMT_START { \
2271 if (cache_offset | cache_bit) { \
2272 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2273 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2274 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2275 /* cache records success, but this is failure */ \
2276 DEBUG_r( \
2277 PerlIO_printf(Perl_debug_log, \
2278 "%*s (remove failure from success cache)\n", \
2279 REPORT_CODE_OFF+PL_regindent*2, "") \
2280 ); \
2281 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2282 } \
2283 } \
2284 sayNO; \
2285} STMT_END
2286
8ba1375e
MJD
2287#define REPORT_CODE_OFF 24
2288
d6a28714
JH
2289/*
2290 - regmatch - main matching routine
2291 *
2292 * Conceptually the strategy is simple: check to see whether the current
2293 * node matches, call self recursively to see whether the rest matches,
2294 * and then act accordingly. In practice we make some effort to avoid
2295 * recursion, in particular by going through "ordinary" nodes (that don't
2296 * need to know whether the rest of the match failed) by a loop instead of
2297 * by recursion.
2298 */
2299/* [lwall] I've hoisted the register declarations to the outer block in order to
2300 * maybe save a little bit of pushing and popping on the stack. It also takes
2301 * advantage of machines that use a register save mask on subroutine entry.
2302 */
2303STATIC I32 /* 0 failure, 1 success */
2304S_regmatch(pTHX_ regnode *prog)
2305{
d6a28714
JH
2306 register regnode *scan; /* Current node. */
2307 regnode *next; /* Next node. */
2308 regnode *inner; /* Next node in internal branch. */
2309 register I32 nextchr; /* renamed nextchr - nextchar colides with
2310 function of same name */
2311 register I32 n; /* no or next */
b7953727 2312 register I32 ln = 0; /* len or last */
0e2d6244 2313 register char *s = NULL; /* operand or save */
d6a28714 2314 register char *locinput = PL_reginput;
b7953727 2315 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2316 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2317 I32 unwind = 0;
b7953727 2318#if 0
02db2b7b 2319 I32 firstcp = PL_savestack_ix;
b7953727 2320#endif
3bad88ff 2321 register const bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2322#ifdef DEBUGGING
ce333219
JH
2323 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2324 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2325 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2326#endif
f5e9f069 2327 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
02db2b7b 2328
d6a28714
JH
2329#ifdef DEBUGGING
2330 PL_regindent++;
2331#endif
2332
2333 /* Note that nextchr is a byte even in UTF */
2334 nextchr = UCHARAT(locinput);
2335 scan = prog;
2336 while (scan != NULL) {
8ba1375e 2337
2a782b5b 2338 DEBUG_r( {
d6a28714 2339 SV *prop = sv_newmortal();
228fe6e6
AL
2340 const int docolor = *PL_colors[0];
2341 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2342 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2343 /* The part of the string before starttry has one color
2344 (pref0_len chars), between starttry and current
2345 position another one (pref_len - pref0_len chars),
2346 after the current position the third one.
2347 We assume that pref0_len <= pref_len, otherwise we
2348 decrease pref0_len. */
9041c2e3 2349 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2350 ? (5 + taill) - l : locinput - PL_bostr;
2351 int pref0_len;
d6a28714 2352
df1ffd02 2353 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2354 pref_len++;
2355 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2356 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2357 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2358 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2359 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2360 l--;
d6a28714
JH
2361 if (pref0_len < 0)
2362 pref0_len = 0;
2363 if (pref0_len > pref_len)
2364 pref0_len = pref_len;
2365 regprop(prop, scan);
2a782b5b 2366 {
228fe6e6 2367 const char * const s0 =
e19fe81c 2368 do_utf8 && OP(scan) != CANY ?
2a782b5b 2369 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2370 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2371 locinput - pref_len;
228fe6e6
AL
2372 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2373 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2374 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2375 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2376 locinput - pref_len + pref0_len;
228fe6e6
AL
2377 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2378 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2379 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2380 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2381 locinput;
228fe6e6 2382 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2383 PerlIO_printf(Perl_debug_log,
2384 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2385 (IV)(locinput - PL_bostr),
2386 PL_colors[4],
2387 len0, s0,
2388 PL_colors[5],
2389 PL_colors[2],
2390 len1, s1,
2391 PL_colors[3],
2392 (docolor ? "" : "> <"),
2393 PL_colors[0],
2394 len2, s2,
2395 PL_colors[1],
2396 15 - l - pref_len + 1,
2397 "",
2398 (IV)(scan - PL_regprogram), PL_regindent*2, "",
fdac8c4b 2399 SvPVX_const(prop));
2a782b5b
JH
2400 }
2401 });
d6a28714
JH
2402
2403 next = scan + NEXT_OFF(scan);
2404 if (next == scan)
2405 next = NULL;
2406
2407 switch (OP(scan)) {
2408 case BOL:
12d33761
HS
2409 if (locinput == PL_bostr || (PL_multiline &&
2410 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2411 {
2412 /* regtill = regbol; */
b8c5462f
JH
2413 break;
2414 }
d6a28714
JH
2415 sayNO;
2416 case MBOL:
12d33761
HS
2417 if (locinput == PL_bostr ||
2418 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2419 {
b8c5462f
JH
2420 break;
2421 }
d6a28714
JH
2422 sayNO;
2423 case SBOL:
c2a73568 2424 if (locinput == PL_bostr)
b8c5462f 2425 break;
d6a28714
JH
2426 sayNO;
2427 case GPOS:
2428 if (locinput == PL_reg_ganch)
2429 break;
2430 sayNO;
2431 case EOL:
2432 if (PL_multiline)
2433 goto meol;
2434 else
2435 goto seol;
2436 case MEOL:
2437 meol:
2438 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2439 sayNO;
b8c5462f 2440 break;
d6a28714
JH
2441 case SEOL:
2442 seol:
2443 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2444 sayNO;
d6a28714 2445 if (PL_regeol - locinput > 1)
b8c5462f 2446 sayNO;
b8c5462f 2447 break;
d6a28714
JH
2448 case EOS:
2449 if (PL_regeol != locinput)
b8c5462f 2450 sayNO;
d6a28714 2451 break;
ffc61ed2 2452 case SANY:
d6a28714 2453 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2454 sayNO;
f33976b4
DB
2455 if (do_utf8) {
2456 locinput += PL_utf8skip[nextchr];
2457 if (locinput > PL_regeol)
2458 sayNO;
2459 nextchr = UCHARAT(locinput);
2460 }
2461 else
2462 nextchr = UCHARAT(++locinput);
2463 break;
2464 case CANY:
2465 if (!nextchr && locinput >= PL_regeol)
2466 sayNO;
b8c5462f 2467 nextchr = UCHARAT(++locinput);
a0d0e21e 2468 break;
ffc61ed2 2469 case REG_ANY:
1aa99e6b
IH
2470 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2471 sayNO;
2472 if (do_utf8) {
b8c5462f 2473 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2474 if (locinput > PL_regeol)
2475 sayNO;
a0ed51b3 2476 nextchr = UCHARAT(locinput);
a0ed51b3 2477 }
1aa99e6b
IH
2478 else
2479 nextchr = UCHARAT(++locinput);
a0ed51b3 2480 break;
d6a28714 2481 case EXACT:
cd439c50
IZ
2482 s = STRING(scan);
2483 ln = STR_LEN(scan);
eb160463 2484 if (do_utf8 != UTF) {
bc517b45 2485 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2486 char *l = locinput;
8c18bf38 2487 const char *e = s + ln;
a72c7584 2488
5ff6fc6d
JH
2489 if (do_utf8) {
2490 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2491 while (s < e) {
8c18bf38 2492 STRLEN ulen;
1aa99e6b 2493 if (l >= PL_regeol)
5ff6fc6d
JH
2494 sayNO;
2495 if (NATIVE_TO_UNI(*(U8*)s) !=
a2a469f9 2496 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
f5e9f069 2497 uniflags))
5ff6fc6d 2498 sayNO;
bc517b45 2499 l += ulen;
5ff6fc6d 2500 s ++;
1aa99e6b 2501 }
5ff6fc6d
JH
2502 }
2503 else {
2504 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2505 while (s < e) {
8c18bf38 2506 STRLEN ulen;
1aa99e6b
IH
2507 if (l >= PL_regeol)
2508 sayNO;
5ff6fc6d 2509 if (NATIVE_TO_UNI(*((U8*)l)) !=
a2a469f9 2510 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
f5e9f069 2511 uniflags))
1aa99e6b 2512 sayNO;
bc517b45 2513 s += ulen;
a72c7584 2514 l ++;
1aa99e6b 2515 }
5ff6fc6d 2516 }
1aa99e6b
IH
2517 locinput = l;
2518 nextchr = UCHARAT(locinput);
2519 break;
2520 }
bc517b45 2521 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2522 /* Inline the first character, for speed. */
2523 if (UCHARAT(s) != nextchr)
2524 sayNO;
2525 if (PL_regeol - locinput < ln)
2526 sayNO;
2527 if (ln > 1 && memNE(s, locinput, ln))
2528 sayNO;
2529 locinput += ln;
2530 nextchr = UCHARAT(locinput);
2531 break;
2532 case EXACTFL:
b8c5462f
JH
2533 PL_reg_flags |= RF_tainted;
2534 /* FALL THROUGH */
d6a28714 2535 case EXACTF:
cd439c50
IZ
2536 s = STRING(scan);
2537 ln = STR_LEN(scan);
d6a28714 2538
d07ddd77
JH
2539 if (do_utf8 || UTF) {
2540 /* Either target or the pattern are utf8. */
d6a28714 2541 char *l = locinput;
d07ddd77 2542 char *e = PL_regeol;
bc517b45 2543
eb160463 2544 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2545 l, &e, 0, do_utf8)) {
5486206c
JH
2546 /* One more case for the sharp s:
2547 * pack("U0U*", 0xDF) =~ /ss/i,
2548 * the 0xC3 0x9F are the UTF-8
2549 * byte sequence for the U+00DF. */
2550 if (!(do_utf8 &&
2551 toLOWER(s[0]) == 's' &&
2552 ln >= 2 &&
2553 toLOWER(s[1]) == 's' &&
2554 (U8)l[0] == 0xC3 &&
2555 e - l >= 2 &&
2556 (U8)l[1] == 0x9F))
2557 sayNO;
2558 }
d07ddd77
JH
2559 locinput = e;
2560 nextchr = UCHARAT(locinput);
2561 break;
a0ed51b3 2562 }
d6a28714 2563
bc517b45
JH
2564 /* Neither the target and the pattern are utf8. */
2565
d6a28714
JH
2566 /* Inline the first character, for speed. */
2567 if (UCHARAT(s) != nextchr &&
2568 UCHARAT(s) != ((OP(scan) == EXACTF)
2569 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2570 sayNO;
d6a28714 2571 if (PL_regeol - locinput < ln)
b8c5462f 2572 sayNO;
d6a28714
JH
2573 if (ln > 1 && (OP(scan) == EXACTF
2574 ? ibcmp(s, locinput, ln)
2575 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2576 sayNO;
d6a28714
JH
2577 locinput += ln;
2578 nextchr = UCHARAT(locinput);
a0d0e21e 2579 break;
d6a28714 2580 case ANYOF:
ffc61ed2 2581 if (do_utf8) {
9e55ce06
JH
2582 STRLEN inclasslen = PL_regeol - locinput;
2583
ba7b4546 2584 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2585 sayNO_ANYOF;
ffc61ed2
JH
2586 if (locinput >= PL_regeol)
2587 sayNO;
0f0076b4 2588 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2589 nextchr = UCHARAT(locinput);
e0f9d4a8 2590 break;
ffc61ed2
JH
2591 }
2592 else {
2593 if (nextchr < 0)
2594 nextchr = UCHARAT(locinput);
7d3e948e 2595 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2596 sayNO_ANYOF;
ffc61ed2
JH
2597 if (!nextchr && locinput >= PL_regeol)
2598 sayNO;
2599 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2600 break;
2601 }
2602 no_anyof:
2603 /* If we might have the case of the German sharp s
2604 * in a casefolding Unicode character class. */
2605
ebc501f0
JH
2606 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2607 locinput += SHARP_S_SKIP;
e0f9d4a8 2608 nextchr = UCHARAT(locinput);
ffc61ed2 2609 }
e0f9d4a8
JH
2610 else
2611 sayNO;
b8c5462f 2612 break;
d6a28714 2613 case ALNUML:
b8c5462f
JH
2614 PL_reg_flags |= RF_tainted;
2615 /* FALL THROUGH */
d6a28714 2616 case ALNUM:
b8c5462f 2617 if (!nextchr)
4633a7c4 2618 sayNO;
ffc61ed2 2619 if (do_utf8) {
fab1f349 2620 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2621 if (!(OP(scan) == ALNUM
3568d838 2622 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2623 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2624 {
2625 sayNO;
a0ed51b3 2626 }
b8c5462f 2627 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2628 nextchr = UCHARAT(locinput);
2629 break;
2630 }
ffc61ed2 2631 if (!(OP(scan) == ALNUM
d6a28714 2632 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2633 sayNO;
b8c5462f 2634 nextchr = UCHARAT(++locinput);
a0d0e21e 2635 break;
d6a28714 2636 case NALNUML:
b8c5462f
JH
2637 PL_reg_flags |= RF_tainted;
2638 /* FALL THROUGH */
d6a28714
JH
2639 case NALNUM:
2640 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2641 sayNO;
ffc61ed2 2642 if (do_utf8) {
fab1f349 2643 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2644 if (OP(scan) == NALNUM
3568d838 2645 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2646 : isALNUM_LC_utf8((U8*)locinput))
2647 {
b8c5462f 2648 sayNO;
d6a28714 2649 }
b8c5462f
JH
2650 locinput += PL_utf8skip[nextchr];
2651 nextchr = UCHARAT(locinput);
2652 break;
2653 }
ffc61ed2 2654 if (OP(scan) == NALNUM
d6a28714 2655 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2656 sayNO;
76e3520e 2657 nextchr = UCHARAT(++locinput);
a0d0e21e 2658 break;
d6a28714
JH
2659 case BOUNDL:
2660 case NBOUNDL:
3280af22 2661 PL_reg_flags |= RF_tainted;
bbce6d69 2662 /* FALL THROUGH */
d6a28714
JH
2663 case BOUND:
2664 case NBOUND:
2665 /* was last char in word? */
ffc61ed2 2666 if (do_utf8) {
12d33761
HS
2667 if (locinput == PL_bostr)
2668 ln = '\n';
ffc61ed2 2669 else {
8c18bf38 2670 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2671
47a91a97 2672 ln = utf8n_to_uvchr((U8 *)r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2673 }
2674 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2675 ln = isALNUM_uni(ln);
fab1f349 2676 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2677 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2678 }
2679 else {
9041c2e3 2680 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2681 n = isALNUM_LC_utf8((U8*)locinput);
2682 }
a0ed51b3 2683 }
d6a28714 2684 else {
12d33761
HS
2685 ln = (locinput != PL_bostr) ?
2686 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2687 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2688 ln = isALNUM(ln);
2689 n = isALNUM(nextchr);
2690 }
2691 else {
2692 ln = isALNUM_LC(ln);
2693 n = isALNUM_LC(nextchr);
2694 }
d6a28714 2695 }
ffc61ed2
JH
2696 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2697 OP(scan) == BOUNDL))
2698 sayNO;
a0ed51b3 2699 break;
d6a28714 2700 case SPACEL:
3280af22 2701 PL_reg_flags |= RF_tainted;
bbce6d69 2702 /* FALL THROUGH */
d6a28714 2703 case SPACE:
9442cb0e 2704 if (!nextchr)
4633a7c4 2705 sayNO;
1aa99e6b 2706 if (do_utf8) {
fd400ab9 2707 if (UTF8_IS_CONTINUED(nextchr)) {
fab1f349 2708 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 2709 if (!(OP(scan) == SPACE
3568d838 2710 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2711 : isSPACE_LC_utf8((U8*)locinput)))
2712 {
2713 sayNO;
2714 }
2715 locinput += PL_utf8skip[nextchr];
2716 nextchr = UCHARAT(locinput);
2717 break;
d6a28714 2718 }
ffc61ed2
JH
2719 if (!(OP(scan) == SPACE
2720 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2721 sayNO;
2722 nextchr = UCHARAT(++locinput);
2723 }
2724 else {
2725 if (!(OP(scan) == SPACE
2726 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2727 sayNO;
2728 nextchr = UCHARAT(++locinput);
a0ed51b3 2729 }
a0ed51b3 2730 break;
d6a28714 2731 case NSPACEL:
3280af22 2732 PL_reg_flags |= RF_tainted;
bbce6d69 2733 /* FALL THROUGH */
d6a28714 2734 case NSPACE:
9442cb0e 2735 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2736 sayNO;
1aa99e6b 2737 if (do_utf8) {
fab1f349 2738 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 2739 if (OP(scan) == NSPACE
3568d838 2740 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2741 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2742 {
2743 sayNO;
2744 }
2745 locinput += PL_utf8skip[nextchr];
2746 nextchr = UCHARAT(locinput);
2747 break;
a0ed51b3 2748 }
ffc61ed2 2749 if (OP(scan) == NSPACE
d6a28714 2750 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2751 sayNO;
76e3520e 2752 nextchr = UCHARAT(++locinput);
a0d0e21e 2753 break;
d6a28714 2754 case DIGITL:
a0ed51b3
LW
2755 PL_reg_flags |= RF_tainted;
2756 /* FALL THROUGH */
d6a28714 2757 case DIGIT:
9442cb0e 2758 if (!nextchr)
a0ed51b3 2759 sayNO;
1aa99e6b 2760 if (do_utf8) {
fab1f349 2761 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 2762 if (!(OP(scan) == DIGIT
3568d838 2763 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2764 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2765 {
a0ed51b3 2766 sayNO;
dfe13c55 2767 }
6f06b55f 2768 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2769 nextchr = UCHARAT(locinput);
2770 break;
2771 }
ffc61ed2 2772 if (!(OP(scan) == DIGIT
9442cb0e 2773 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2774 sayNO;
2775 nextchr = UCHARAT(++locinput);
2776 break;
d6a28714 2777 case NDIGITL:
b8c5462f
JH
2778 PL_reg_flags |= RF_tainted;
2779 /* FALL THROUGH */
d6a28714 2780 case NDIGIT:
9442cb0e 2781 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2782 sayNO;
1aa99e6b 2783 if (do_utf8) {
fab1f349 2784 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 2785 if (OP(scan) == NDIGIT
3568d838 2786 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2787 : isDIGIT_LC_utf8((U8*)locinput))
2788 {
a0ed51b3 2789 sayNO;
9442cb0e 2790 }
6f06b55f 2791 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2792 nextchr = UCHARAT(locinput);
2793 break;
2794 }
ffc61ed2 2795 if (OP(scan) == NDIGIT
9442cb0e 2796 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2797 sayNO;
2798 nextchr = UCHARAT(++locinput);
2799 break;
2800 case CLUMP:
b7c83a7e 2801 if (locinput >= PL_regeol)
a0ed51b3 2802 sayNO;
b7c83a7e 2803 if (do_utf8) {
fab1f349 2804 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
2805 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2806 sayNO;
2807 locinput += PL_utf8skip[nextchr];
2808 while (locinput < PL_regeol &&
2809 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2810 locinput += UTF8SKIP(locinput);
2811 if (locinput > PL_regeol)
2812 sayNO;
eb08e2da
JH
2813 }
2814 else
2815 locinput++;
a0ed51b3
LW
2816 nextchr = UCHARAT(locinput);
2817 break;
c8756f30 2818 case REFFL:
3280af22 2819 PL_reg_flags |= RF_tainted;
c8756f30 2820 /* FALL THROUGH */
c277df42 2821 case REF:
c8756f30 2822 case REFF:
c277df42 2823 n = ARG(scan); /* which paren pair */
cf93c79d 2824 ln = PL_regstartp[n];
2c2d71f5 2825 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2826 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2827 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2828 if (ln == PL_regendp[n])
a0d0e21e 2829 break;
a0ed51b3 2830
cf93c79d 2831 s = PL_bostr + ln;
1aa99e6b 2832 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2833 char *l = locinput;
8c18bf38 2834 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2835 /*
2836 * Note that we can't do the "other character" lookup trick as
2837 * in the 8-bit case (no pun intended) because in Unicode we
2838 * have to map both upper and title case to lower case.
2839 */
2840 if (OP(scan) == REFF) {
2841 while (s < e) {
8c18bf38
AL
2842 STRLEN ulen1, ulen2;
2843 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
2844 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
2845
a0ed51b3
LW
2846 if (l >= PL_regeol)
2847 sayNO;
a2a2844f
JH
2848 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2849 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2850 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2851 sayNO;
a2a2844f
JH
2852 s += ulen1;
2853 l += ulen2;
a0ed51b3
LW
2854 }
2855 }
2856 locinput = l;
2857 nextchr = UCHARAT(locinput);
2858 break;
2859 }
2860
a0d0e21e 2861 /* Inline the first character, for speed. */
76e3520e 2862 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2863 (OP(scan) == REF ||
2864 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2865 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2866 sayNO;
cf93c79d 2867 ln = PL_regendp[n] - ln;
3280af22 2868 if (locinput + ln > PL_regeol)
4633a7c4 2869 sayNO;
c8756f30
AK
2870 if (ln > 1 && (OP(scan) == REF
2871 ? memNE(s, locinput, ln)
2872 : (OP(scan) == REFF
2873 ? ibcmp(s, locinput, ln)
2874 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2875 sayNO;
a0d0e21e 2876 locinput += ln;
76e3520e 2877 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2878 break;
2879
2880 case NOTHING:
c277df42 2881 case TAIL:
a0d0e21e
LW
2882 break;
2883 case BACK:
2884 break;
c277df42
IZ
2885 case EVAL:
2886 {
2887 dSP;
533c011a 2888 OP_4tree *oop = PL_op;
3280af22 2889 COP *ocurcop = PL_curcop;
d7afa7f5 2890 PAD *old_comppad;
c277df42 2891 SV *ret;
9b8ea7f3 2892 struct regexp *oreg = PL_reg_re;
9041c2e3 2893
c277df42 2894 n = ARG(scan);
533c011a 2895 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2896 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
d7afa7f5 2897 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2898 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2899
8e5e9ebe
RGS
2900 {
2901 SV **before = SP;
2902 CALLRUNOPS(aTHX); /* Scalar context. */
2903 SPAGAIN;
2904 if (SP == before)
075aa684 2905 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2906 else {
2907 ret = POPs;
2908 PUTBACK;
2909 }
2910 }
2911
0f5d15d6 2912 PL_op = oop;
d7afa7f5 2913 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2914 PL_curcop = ocurcop;
c277df42 2915 if (logical) {
0f5d15d6
IZ
2916 if (logical == 2) { /* Postponed subexpression. */
2917 regexp *re;
22c35a8c 2918 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2919 re_cc_state state;
0f5d15d6 2920 CHECKPOINT cp, lastcp;
5b7ea690 2921 int toggleutf;
5835a535 2922 register SV *sv;
0f5d15d6 2923
5835a535
JH
2924 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2925 mg = mg_find(sv, PERL_MAGIC_qr);
2926 else if (SvSMAGICAL(ret)) {
2927 if (SvGMAGICAL(ret))
2928 sv_unmagic(ret, PERL_MAGIC_qr);
2929 else
2930 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2931 }
5835a535 2932
0f5d15d6
IZ
2933 if (mg) {
2934 re = (regexp *)mg->mg_obj;
df0003d4 2935 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2936 }
2937 else {
2938 STRLEN len;
c06c673c 2939 const char *t = SvPV_const(ret, len);
0f5d15d6 2940 PMOP pm;
8c18bf38
AL
2941 char * const oprecomp = PL_regprecomp;
2942 const I32 osize = PL_regsize;
2943 const I32 onpar = PL_regnpar;
0f5d15d6 2944
5fcd1c1b 2945 Zero(&pm, 1, PMOP);
5b7ea690 2946 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
c06c673c 2947 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 2948 if (!(SvFLAGS(ret)
5835a535
JH
2949 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2950 | SVs_GMG)))
14befaf4
DM
2951 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2952 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2953 PL_regprecomp = oprecomp;
2954 PL_regsize = osize;
2955 PL_regnpar = onpar;
2956 }
2957 DEBUG_r(
9041c2e3 2958 PerlIO_printf(Perl_debug_log,
5332c881 2959 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
2960 PL_colors[0],
2961 re->precomp,
2962 PL_colors[1],
2963 (strlen(re->precomp) > 60 ? "..." : ""))
2964 );
2965 state.node = next;
2966 state.prev = PL_reg_call_cc;
2967 state.cc = PL_regcc;
2968 state.re = PL_reg_re;
2969
2ab05381 2970 PL_regcc = 0;
9041c2e3 2971
0f5d15d6 2972 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2973 REGCP_SET(lastcp);
0f5d15d6
IZ
2974 cache_re(re);
2975 state.ss = PL_savestack_ix;
2976 *PL_reglastparen = 0;
a01268b5 2977 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2978 PL_reg_call_cc = &state;
2979 PL_reginput = locinput;
5b7ea690
JH
2980 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2981 ((re->reganch & ROPT_UTF8) != 0);
2982 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2983
2984 /* XXXX This is too dramatic a measure... */
2985 PL_reg_maxiter = 0;
2986
0f5d15d6 2987 if (regmatch(re->program + 1)) {
2c914db6
IZ
2988 /* Even though we succeeded, we need to restore
2989 global variables, since we may be wrapped inside
2990 SUSPEND, thus the match may be not finished yet. */
2991
2992 /* XXXX Do this only if SUSPENDed? */
2993 PL_reg_call_cc = state.prev;
2994 PL_regcc = state.cc;
2995 PL_reg_re = state.re;
2996 cache_re(PL_reg_re);
5b7ea690 2997 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2998
2999 /* XXXX This is too dramatic a measure... */
3000 PL_reg_maxiter = 0;
3001
3002 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3003 ReREFCNT_dec(re);
3004 regcpblow(cp);
3005 sayYES;
3006 }
0f5d15d6 3007 ReREFCNT_dec(re);
02db2b7b 3008 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3009 regcppop();
3010 PL_reg_call_cc = state.prev;
3011 PL_regcc = state.cc;
3012 PL_reg_re = state.re;
d3790889 3013 cache_re(PL_reg_re);
5b7ea690 3014 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3015
3016 /* XXXX This is too dramatic a measure... */
3017 PL_reg_maxiter = 0;
3018
8e514ae6 3019 logical = 0;
0f5d15d6
IZ
3020 sayNO;
3021 }
c277df42 3022 sw = SvTRUE(ret);
0f5d15d6 3023 logical = 0;
a0ed51b3 3024 }
9b8ea7f3 3025 else {
3280af22 3026 sv_setsv(save_scalar(PL_replgv), ret);
9b8ea7f3
JH
3027 cache_re(oreg);
3028 }
c277df42
IZ
3029 break;
3030 }
a0d0e21e 3031 case OPEN:
c277df42 3032 n = ARG(scan); /* which paren pair */
3280af22
NIS
3033 PL_reg_start_tmp[n] = locinput;
3034 if (n > PL_regsize)
3035 PL_regsize = n;
a0d0e21e
LW
3036 break;
3037 case CLOSE:
c277df42 3038 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3039 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3040 PL_regendp[n] = locinput - PL_bostr;
eb160463 3041 if (n > (I32)*PL_reglastparen)
3280af22 3042 *PL_reglastparen = n;
a01268b5 3043 *PL_reglastcloseparen = n;
a0d0e21e 3044 break;
c277df42
IZ
3045 case GROUPP:
3046 n = ARG(scan); /* which paren pair */
eb160463 3047 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3048 break;
3049 case IFTHEN:
2c2d71f5 3050 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3051 if (sw)
3052 next = NEXTOPER(NEXTOPER(scan));
3053 else {
3054 next = scan + ARG(scan);
3055 if (OP(next) == IFTHEN) /* Fake one. */
3056 next = NEXTOPER(NEXTOPER(next));
3057 }
3058 break;
3059 case LOGICAL:
0f5d15d6 3060 logical = scan->flags;
c277df42 3061 break;
2ab05381
IZ
3062/*******************************************************************
3063 PL_regcc contains infoblock about the innermost (...)* loop, and
3064 a pointer to the next outer infoblock.
3065
3066 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3067
3068 1) After matching X, regnode for CURLYX is processed;
3069
9041c2e3 3070 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3071 regmatch() recursively with the starting point at WHILEM node;
3072
3073 3) Each hit of WHILEM node tries to match A and Z (in the order
3074 depending on the current iteration, min/max of {min,max} and
3075 greediness). The information about where are nodes for "A"
3076 and "Z" is read from the infoblock, as is info on how many times "A"
3077 was already matched, and greediness.
3078
3079 4) After A matches, the same WHILEM node is hit again.
3080
3081 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3082 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3083 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3084 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3085 of the external loop.
3086
3087 Currently present infoblocks form a tree with a stem formed by PL_curcc
3088 and whatever it mentions via ->next, and additional attached trees
3089 corresponding to temporarily unset infoblocks as in "5" above.
3090
9041c2e3 3091 In the following picture infoblocks for outer loop of
2ab05381
IZ
3092 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3093 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3094 infoblocks are drawn below the "reset" infoblock.
3095
3096 In fact in the picture below we do not show failed matches for Z and T
3097 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3098 more obvious *why* one needs to *temporary* unset infoblocks.]
3099
3100 Matched REx position InfoBlocks Comment
3101 (Y(A)*?Z)*?T x
3102 Y(A)*?Z)*?T x <- O
3103 Y (A)*?Z)*?T x <- O
3104 Y A)*?Z)*?T x <- O <- I
3105 YA )*?Z)*?T x <- O <- I
3106 YA A)*?Z)*?T x <- O <- I
3107 YAA )*?Z)*?T x <- O <- I
3108 YAA Z)*?T x <- O # Temporary unset I
3109 I
3110
3111 YAAZ Y(A)*?Z)*?T x <- O
3112 I
3113
3114 YAAZY (A)*?Z)*?T x <- O
3115 I
3116
3117 YAAZY A)*?Z)*?T x <- O <- I
3118 I
3119
3120 YAAZYA )*?Z)*?T x <- O <- I
3121 I
3122
3123 YAAZYA Z)*?T x <- O # Temporary unset I
3124 I,I
3125
3126 YAAZYAZ )*?T x <- O
3127 I,I
3128
3129 YAAZYAZ T x # Temporary unset O
3130 O
3131 I,I
3132
3133 YAAZYAZT x
3134 O
3135 I,I
3136 *******************************************************************/
a0d0e21e
LW
3137 case CURLYX: {
3138 CURCUR cc;
3280af22 3139 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3140 /* No need to save/restore up to this paren */
3141 I32 parenfloor = scan->flags;
c277df42
IZ
3142
3143 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3144 next += ARG(next);
3280af22
NIS
3145 cc.oldcc = PL_regcc;
3146 PL_regcc = &cc;
cb434fcc
IZ
3147 /* XXXX Probably it is better to teach regpush to support
3148 parenfloor > PL_regsize... */
eb160463 3149 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3150 parenfloor = *PL_reglastparen; /* Pessimization... */
3151 cc.parenfloor = parenfloor;
a0d0e21e
LW
3152 cc.cur = -1;
3153 cc.min = ARG1(scan);
3154 cc.max = ARG2(scan);
c277df42 3155 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3156 cc.next = next;
3157 cc.minmod = minmod;
3158 cc.lastloc = 0;
3280af22 3159 PL_reginput = locinput;
a0d0e21e
LW
3160 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3161 regcpblow(cp);
3280af22 3162 PL_regcc = cc.oldcc;
4633a7c4 3163 saySAME(n);
a0d0e21e
LW
3164 }
3165 /* NOT REACHED */
3166 case WHILEM: {
3167 /*
3168 * This is really hard to understand, because after we match
3169 * what we're trying to match, we must make sure the rest of
2c2d71f5 3170 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3171 * to go back UP the parse tree by recursing ever deeper. And
3172 * if it fails, we have to reset our parent's current state
3173 * that we can try again after backing off.
3174 */
3175
c277df42 3176 CHECKPOINT cp, lastcp;
3280af22 3177 CURCUR* cc = PL_regcc;
c277df42 3178 char *lastloc = cc->lastloc; /* Detection of 0-len. */
6e00f9c6 3179 I32 cache_offset = 0, cache_bit = 0;
c277df42 3180
4633a7c4 3181 n = cc->cur + 1; /* how many we know we matched */
3280af22 3182 PL_reginput = locinput;
a0d0e21e 3183
c277df42 3184 DEBUG_r(
9041c2e3 3185 PerlIO_printf(Perl_debug_log,
91f3b821 3186 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3187 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3188 (long)n, (long)cc->min,
2797576d 3189 (long)cc->max, PTR2UV(cc))
c277df42 3190 );
4633a7c4 3191
a0d0e21e
LW
3192 /* If degenerate scan matches "", assume scan done. */
3193
579cf2c3 3194 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3195 PL_regcc = cc->oldcc;
2ab05381
IZ
3196 if (PL_regcc)
3197 ln = PL_regcc->cur;
c277df42 3198 DEBUG_r(
c3464db5
DD
3199 PerlIO_printf(Perl_debug_log,
3200 "%*s empty match detected, try continuation...\n",
3280af22 3201 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3202 );
a0d0e21e 3203 if (regmatch(cc->next))
4633a7c4 3204 sayYES;
2ab05381
IZ
3205 if (PL_regcc)
3206 PL_regcc->cur = ln;
3280af22 3207 PL_regcc = cc;
4633a7c4 3208 sayNO;
a0d0e21e
LW
3209 }
3210
3211 /* First just match a string of min scans. */
3212
3213 if (n < cc->min) {
3214 cc->cur = n;
3215 cc->lastloc = locinput;
4633a7c4
LW
3216 if (regmatch(cc->scan))
3217 sayYES;
3218 cc->cur = n - 1;
c277df42 3219 cc->lastloc = lastloc;
4633a7c4 3220 sayNO;
a0d0e21e
LW
3221 }
3222
2c2d71f5
JH
3223 if (scan->flags) {
3224 /* Check whether we already were at this position.
3225 Postpone detection until we know the match is not
3226 *that* much linear. */
3227 if (!PL_reg_maxiter) {
3228 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3229 PL_reg_leftiter = PL_reg_maxiter;
3230 }
3231 if (PL_reg_leftiter-- == 0) {
8c18bf38 3232 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3233 if (PL_reg_poscache) {
eb160463 3234 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3235 Renew(PL_reg_poscache, size, char);
3236 PL_reg_poscache_size = size;
3237 }
3238 Zero(PL_reg_poscache, size, char);
3239 }
3240 else {
3241 PL_reg_poscache_size = size;
cd7a8267 3242 Newxz(PL_reg_poscache, size, char);
2c2d71f5
JH
3243 }
3244 DEBUG_r(
3245 PerlIO_printf(Perl_debug_log,
3246 "%sDetected a super-linear match, switching on caching%s...\n",
3247 PL_colors[4], PL_colors[5])
3248 );
3249 }
3250 if (PL_reg_leftiter < 0) {
6e00f9c6 3251 cache_offset = locinput - PL_bostr;
2c2d71f5 3252
6e00f9c6
HS
3253 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3254 + cache_offset * (scan->flags>>4);
3255 cache_bit = cache_offset % 8;
3256 cache_offset /= 8;
3257 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
2c2d71f5
JH
3258 DEBUG_r(
3259 PerlIO_printf(Perl_debug_log,
3260 "%*s already tried at this position...\n",
3261 REPORT_CODE_OFF+PL_regindent*2, "")
3262 );
6e00f9c6
HS
3263 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3264 /* cache records success */
91513146
JH
3265 sayYES;
3266 else
6e00f9c6 3267 /* cache records failure */
91513146 3268 sayNO_SILENT;
2c2d71f5 3269 }
6e00f9c6 3270 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
2c2d71f5
JH
3271 }
3272 }
3273
a0d0e21e
LW
3274 /* Prefer next over scan for minimal matching. */
3275
3276 if (cc->minmod) {
3280af22 3277 PL_regcc = cc->oldcc;
2ab05381
IZ
3278 if (PL_regcc)
3279 ln = PL_regcc->cur;
5f05dabc 3280 cp = regcppush(cc->parenfloor);
02db2b7b 3281 REGCP_SET(lastcp);
5f05dabc 3282 if (regmatch(cc->next)) {
c277df42 3283 regcpblow(cp);
6e00f9c6 3284 CACHEsayYES; /* All done. */
5f05dabc 3285 }
02db2b7b 3286 REGCP_UNWIND(lastcp);
5f05dabc 3287 regcppop();
2ab05381
IZ
3288 if (PL_regcc)
3289 PL_regcc->cur = ln;
3280af22 3290 PL_regcc = cc;
a0d0e21e 3291
c277df42 3292 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3293 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3294 && !(PL_reg_flags & RF_warned)) {
3295 PL_reg_flags |= RF_warned;
9014280d 3296 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3297 "Complex regular subexpression recursion",
3298 REG_INFTY - 1);
c277df42 3299 }
6e00f9c6 3300 CACHEsayNO;
c277df42 3301 }
a687059c 3302
c277df42 3303 DEBUG_r(
c3464db5
DD
3304 PerlIO_printf(Perl_debug_log,
3305 "%*s trying longer...\n",
3280af22 3306 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3307 );
a0d0e21e 3308 /* Try scanning more and see if it helps. */
3280af22 3309 PL_reginput = locinput;
a0d0e21e
LW
3310 cc->cur = n;
3311 cc->lastloc = locinput;
5f05dabc 3312 cp = regcppush(cc->parenfloor);
02db2b7b 3313 REGCP_SET(lastcp);
5f05dabc 3314 if (regmatch(cc->scan)) {
c277df42 3315 regcpblow(cp);
6e00f9c6 3316 CACHEsayYES;
5f05dabc 3317 }
02db2b7b 3318 REGCP_UNWIND(lastcp);
5f05dabc 3319 regcppop();
4633a7c4 3320 cc->cur = n - 1;
c277df42 3321 cc->lastloc = lastloc;
6e00f9c6 3322 CACHEsayNO;
a0d0e21e
LW
3323 }
3324
3325 /* Prefer scan over next for maximal matching. */
3326
3327 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3328 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3329 cc->cur = n;
3330 cc->lastloc = locinput;
02db2b7b 3331 REGCP_SET(lastcp);
5f05dabc 3332 if (regmatch(cc->scan)) {
c277df42 3333 regcpblow(cp);
6e00f9c6 3334 CACHEsayYES;
5f05dabc 3335 }
02db2b7b 3336 REGCP_UNWIND(lastcp);
a0d0e21e 3337 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3338 PL_reginput = locinput;
c277df42 3339 DEBUG_r(
c3464db5
DD
3340 PerlIO_printf(Perl_debug_log,
3341 "%*s failed, try continuation...\n",
3280af22 3342 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3343 );
3344 }
9041c2e3 3345 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3346 && !(PL_reg_flags & RF_warned)) {
3280af22 3347 PL_reg_flags |= RF_warned;
9014280d 3348 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3349 "Complex regular subexpression recursion",
3350 REG_INFTY - 1);
a0d0e21e
LW
3351 }
3352
3353 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3354 PL_regcc = cc->oldcc;
2ab05381
IZ
3355 if (PL_regcc)
3356 ln = PL_regcc->cur;
a0d0e21e 3357 if (regmatch(cc->next))
6e00f9c6 3358 CACHEsayYES;
2ab05381
IZ
3359 if (PL_regcc)
3360 PL_regcc->cur = ln;
3280af22 3361 PL_regcc = cc;
4633a7c4 3362 cc->cur = n - 1;
c277df42 3363 cc->lastloc = lastloc;
6e00f9c6 3364 CACHEsayNO;
a0d0e21e
LW
3365 }
3366 /* NOT REACHED */
9041c2e3 3367 case BRANCHJ:
c277df42
IZ
3368 next = scan + ARG(scan);
3369 if (next == scan)
3370 next = NULL;
3371 inner = NEXTOPER(NEXTOPER(scan));
3372 goto do_branch;
9041c2e3 3373 case BRANCH:
c277df42
IZ
3374 inner = NEXTOPER(scan);
3375 do_branch:
3376 {
c277df42
IZ
3377 c1 = OP(scan);
3378 if (OP(next) != c1) /* No choice. */
3379 next = inner; /* Avoid recursion. */
a0d0e21e 3380 else {
8c18bf38 3381 const I32 lastparen = *PL_reglastparen;
02db2b7b
IZ
3382 I32 unwind1;
3383 re_unwind_branch_t *uw;
3384
3385 /* Put unwinding data on stack */
3386 unwind1 = SSNEWt(1,re_unwind_branch_t);
3387 uw = SSPTRt(unwind1,re_unwind_branch_t);
3388 uw->prev = unwind;
3389 unwind = unwind1;
3390 uw->type = ((c1 == BRANCH)
3391 ? RE_UNWIND_BRANCH
3392 : RE_UNWIND_BRANCHJ);
3393 uw->lastparen = lastparen;
3394 uw->next = next;
3395 uw->locinput = locinput;
3396 uw->nextchr = nextchr;
3397#ifdef DEBUGGING
3398 uw->regindent = ++PL_regindent;
3399#endif
c277df42 3400
02db2b7b
IZ
3401 REGCP_SET(uw->lastcp);
3402
3403 /* Now go into the first branch */
3404 next = inner;
a687059c 3405 }
a0d0e21e
LW
3406 }
3407 break;
3408 case MINMOD:
3409 minmod = 1;
3410 break;
c277df42
IZ
3411 case CURLYM:
3412 {
00db4c45 3413 I32 l = 0;
c277df42 3414 CHECKPOINT lastcp;
9041c2e3 3415
c277df42 3416 /* We suppose that the next guy does not need
b035a42e 3417 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3418 and has no parenths to influence future backrefs. */
3419 ln = ARG1(scan); /* min to match */
3420 n = ARG2(scan); /* max to match */
c277df42
IZ
3421 paren = scan->flags;
3422 if (paren) {
3280af22
NIS
3423 if (paren > PL_regsize)
3424 PL_regsize = paren;
eb160463 3425 if (paren > (I32)*PL_reglastparen)
3280af22 3426 *PL_reglastparen = paren;
c277df42 3427 }
dc45a647 3428 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3429 if (paren)
3430 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3431 PL_reginput = locinput;
c277df42
IZ
3432 if (minmod) {
3433 minmod = 0;
3434 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3435 sayNO;
3280af22 3436 locinput = PL_reginput;
cca55fe3 3437 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3438 regnode *text_node = next;
3439
cca55fe3 3440 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3441
cca55fe3 3442 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3443 else {
cca55fe3 3444 if (PL_regkind[(U8)OP(text_node)] == REF) {
c061b110
JH
3445 c1 = c2 = -1000;
3446 goto assume_ok_MM;
cca55fe3
JP
3447 }
3448 else { c1 = (U8)*STRING(text_node); }
af5decee 3449 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3450 c2 = PL_fold[c1];
af5decee 3451 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3452 c2 = PL_fold_locale[c1];
3453 else
3454 c2 = c1;
3455 }
a0ed51b3
LW
3456 }
3457 else
c277df42 3458 c1 = c2 = -1000;
cca55fe3 3459 assume_ok_MM:
02db2b7b 3460 REGCP_SET(lastcp);
b035a42e 3461 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3462 /* If it could work, try it. */
3463 if (c1 == -1000 ||
3280af22
NIS
3464 UCHARAT(PL_reginput) == c1 ||
3465 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3466 {
3467 if (paren) {
f31a99c8 3468 if (ln) {
cf93c79d
IZ
3469 PL_regstartp[paren] =
3470 HOPc(PL_reginput, -l) - PL_bostr;
3471 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3472 }
3473 else
cf93c79d 3474 PL_regendp[paren] = -1;
c277df42
IZ
3475 }
3476 if (regmatch(next))
3477 sayYES;
02db2b7b 3478 REGCP_UNWIND(lastcp);
c277df42
IZ
3479 }
3480 /* Couldn't or didn't -- move forward. */
3280af22 3481 PL_reginput = locinput;
c277df42
IZ
3482 if (regrepeat_hard(scan, 1, &l)) {
3483 ln++;
3280af22 3484 locinput = PL_reginput;
c277df42
IZ
3485 }
3486 else
3487 sayNO;
3488 }
a0ed51b3
LW
3489 }
3490 else {
c277df42 3491 n = regrepeat_hard(scan, n, &l);
3280af22 3492 locinput = PL_reginput;
c277df42 3493 DEBUG_r(
5c0ca799 3494 PerlIO_printf(Perl_debug_log,
faccc32b 3495 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3496 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3497 (IV) n, (IV)l)
c277df42
IZ
3498 );
3499 if (n >= ln) {
cca55fe3 3500 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3501 regnode *text_node = next;
3502
cca55fe3 3503 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3504
cca55fe3 3505 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3506 else {
cca55fe3 3507 if (PL_regkind[(U8)OP(text_node)] == REF) {
c061b110
JH
3508 c1 = c2 = -1000;
3509 goto assume_ok_REG;
cca55fe3
JP
3510 }
3511 else { c1 = (U8)*STRING(text_node); }
3512
af5decee 3513 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3514 c2 = PL_fold[c1];
af5decee 3515 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3516 c2 = PL_fold_locale[c1];
3517 else
3518 c2 = c1;
3519 }
a0ed51b3
LW
3520 }
3521 else
c277df42
IZ
3522 c1 = c2 = -1000;
3523 }
cca55fe3 3524 assume_ok_REG:
02db2b7b 3525 REGCP_SET(lastcp);
c277df42
IZ
3526 while (n >= ln) {
3527 /* If it could work, try it. */
3528 if (c1 == -1000 ||
3280af22
NIS
3529 UCHARAT(PL_reginput) == c1 ||
3530 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3531 {
3532 DEBUG_r(
c3464db5 3533 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3534 "%*s trying tail with n=%"IVdf"...\n",
3535 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3536 );
3537 if (paren) {
3538 if (n) {
cf93c79d
IZ
3539 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3540 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3541 }
a0ed51b3 3542 else
cf93c79d 3543 PL_regendp[paren] = -1;
c277df42 3544 }
a0ed51b3
LW
3545 if (regmatch(next))
3546 sayYES;
02db2b7b 3547 REGCP_UNWIND(lastcp);
a0ed51b3 3548 }
c277df42
IZ
3549 /* Couldn't or didn't -- back up. */
3550 n--;
dfe13c55 3551 locinput = HOPc(locinput, -l);
3280af22 3552 PL_reginput = locinput;
c277df42
IZ
3553 }
3554 }
3555 sayNO;
3556 break;
3557 }
3558 case CURLYN:
3559 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3560 if (paren > PL_regsize)
3561 PL_regsize = paren;
eb160463 3562 if (paren > (I32)*PL_reglastparen)
3280af22 3563 *PL_reglastparen = paren;
c277df42
IZ
3564 ln = ARG1(scan); /* min to match */
3565 n = ARG2(scan); /* max to match */
dc45a647 3566 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3567 goto repeat;
a0d0e21e 3568 case CURLY:
c277df42 3569 paren = 0;
a0d0e21e
LW
3570 ln = ARG1(scan); /* min to match */
3571 n = ARG2(scan); /* max to match */
dc45a647 3572 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3573 goto repeat;
3574 case STAR:
3575 ln = 0;
c277df42 3576 n = REG_INFTY;
a0d0e21e 3577 scan = NEXTOPER(scan);
c277df42 3578 paren = 0;
a0d0e21e
LW
3579 goto repeat;
3580 case PLUS:
c277df42
IZ
3581 ln = 1;
3582 n = REG_INFTY;
3583 scan = NEXTOPER(scan);
3584 paren = 0;
3585 repeat:
a0d0e21e
LW
3586 /*
3587 * Lookahead to avoid useless match attempts
3588 * when we know what character comes next.
3589 */
5f80c4cf
JP
3590
3591 /*
3592 * Used to only do .*x and .*?x, but now it allows
3593 * for )'s, ('s and (?{ ... })'s to be in the way
3594 * of the quantifier and the EXACT-like node. -- japhy
3595 */
3596
cca55fe3 3597 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3598 U8 *s;
3599 regnode *text_node = next;
3600
cca55fe3 3601 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3602
cca55fe3 3603 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3604 else {
cca55fe3 3605 if (PL_regkind[(U8)OP(text_node)] == REF) {
c061b110
JH
3606 c1 = c2 = -1000;
3607 goto assume_ok_easy;
cca55fe3
JP
3608 }
3609 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3610
3611 if (!UTF) {
3612 c2 = c1 = *s;
f65d3ee7 3613 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3614 c2 = PL_fold[c1];
f65d3ee7 3615 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3616 c2 = PL_fold_locale[c1];
1aa99e6b 3617 }
5f80c4cf 3618 else { /* UTF */
f65d3ee7 3619 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3620 STRLEN ulen1, ulen2;
a2a469f9
JH
3621 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3622 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3623
3624 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3625 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3626
a2a469f9 3627 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
f5e9f069 3628 uniflags);
a2a469f9 3629 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
f5e9f069 3630 uniflags);
5f80c4cf
JP
3631 }
3632 else {
a2a469f9 3633 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
f5e9f069 3634 uniflags);
5f80c4cf 3635 }
1aa99e6b
IH
3636 }
3637 }
bbce6d69 3638 }
a0d0e21e 3639 else
bbce6d69 3640 c1 = c2 = -1000;
cca55fe3 3641 assume_ok_easy:
3280af22 3642 PL_reginput = locinput;
a0d0e21e 3643 if (minmod) {
c277df42 3644 CHECKPOINT lastcp;
a0d0e21e
LW
3645 minmod = 0;
3646 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3647 sayNO;
a0ed51b3 3648 locinput = PL_reginput;
02db2b7b 3649 REGCP_SET(lastcp);
0fe9bf95 3650 if (c1 != -1000) {
1aa99e6b 3651 char *e; /* Should not check after this */
0fe9bf95 3652 char *old = locinput;
b2f2f093 3653 int count = 0;
0fe9bf95 3654
1aa99e6b 3655 if (n == REG_INFTY) {
0fe9bf95 3656 e = PL_regeol - 1;
1aa99e6b
IH
3657 if (do_utf8)
3658 while (UTF8_IS_CONTINUATION(*(U8*)e))
3659 e--;
3660 }
3661 else if (do_utf8) {
3662 int m = n - ln;
3663 for (e = locinput;
3664 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3665 e += UTF8SKIP(e);
3666 }
3667 else {
3668 e = locinput + n - ln;
3669 if (e >= PL_regeol)
3670 e = PL_regeol - 1;
3671 }
0fe9bf95
IZ
3672 while (1) {
3673 /* Find place 'next' could work */
1aa99e6b
IH
3674 if (!do_utf8) {
3675 if (c1 == c2) {
a8e8ab15
JH
3676 while (locinput <= e &&
3677 UCHARAT(locinput) != c1)
1aa99e6b
IH
3678 locinput++;
3679 } else {
9041c2e3 3680 while (locinput <= e
a8e8ab15
JH
3681 && UCHARAT(locinput) != c1
3682 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3683 locinput++;
3684 }
3685 count = locinput - old;
3686 }
3687 else {
1aa99e6b 3688 if (c1 == c2) {
8c18bf38 3689 STRLEN len;
872c91ae
JH
3690 /* count initialised to
3691 * utf8_distance(old, locinput) */
b2f2f093 3692 while (locinput <= e &&
872c91ae 3693 utf8n_to_uvchr((U8*)locinput,
a2a469f9 3694 UTF8_MAXBYTES, &len,
f5e9f069 3695 uniflags) != (UV)c1) {
1aa99e6b 3696 locinput += len;
b2f2f093
JH
3697 count++;
3698 }
1aa99e6b 3699 } else {
8c18bf38 3700 STRLEN len;
872c91ae
JH
3701 /* count initialised to
3702 * utf8_distance(old, locinput) */
b2f2f093 3703 while (locinput <= e) {
872c91ae 3704 UV c = utf8n_to_uvchr((U8*)locinput,
a2a469f9 3705 UTF8_MAXBYTES, &len,
f5e9f069 3706 uniflags);
eb160463 3707 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3708 break;
b2f2f093
JH
3709 locinput += len;
3710 count++;
1aa99e6b
IH
3711 }
3712 }
0fe9bf95 3713 }
9041c2e3 3714 if (locinput > e)
0fe9bf95
IZ
3715 sayNO;
3716 /* PL_reginput == old now */
3717 if (locinput != old) {
3718 ln = 1; /* Did some */
1aa99e6b 3719 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3720 sayNO;
3721 }
3722 /* PL_reginput == locinput now */
29d1e993 3723 TRYPAREN(paren, ln, locinput);
0fe9bf95 3724 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3725 REGCP_UNWIND(lastcp);
0fe9bf95 3726 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3727 old = locinput;
3728 if (do_utf8)
3729 locinput += UTF8SKIP(locinput);
3730 else
3731 locinput++;
b2f2f093 3732 count = 1;
0fe9bf95
IZ
3733 }
3734 }
3735 else
c277df42 3736 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3737 UV c;
3738 if (c1 != -1000) {
3739 if (do_utf8)
872c91ae 3740 c = utf8n_to_uvchr((U8*)PL_reginput,
a2a469f9 3741 UTF8_MAXBYTES, 0,
f5e9f069 3742 uniflags);
1aa99e6b 3743 else
9041c2e3 3744 c = UCHARAT(PL_reginput);
2390ecbc 3745 /* If it could work, try it. */
eb160463 3746 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3747 {
5b7ea690 3748 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3749 REGCP_UNWIND(lastcp);
3750 }
1aa99e6b 3751 }
a0d0e21e 3752 /* If it could work, try it. */
2390ecbc 3753 else if (c1 == -1000)
bbce6d69 3754 {
5b7ea690 3755 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3756 REGCP_UNWIND(lastcp);
bbce6d69 3757 }
c277df42 3758 /* Couldn't or didn't -- move forward. */
a0ed51b3 3759 PL_reginput = locinput;
a0d0e21e
LW
3760 if (regrepeat(scan, 1)) {
3761 ln++;
a0ed51b3
LW
3762 locinput = PL_reginput;
3763 }
3764 else
4633a7c4 3765 sayNO;
a0d0e21e
LW
3766 }
3767 }
3768 else {
c277df42 3769 CHECKPOINT lastcp;
a0d0e21e 3770 n = regrepeat(scan, n);
a0ed51b3 3771 locinput = PL_reginput;
22c35a8c 3772 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3773 ((!PL_multiline && OP(next) != MEOL) ||
3774 OP(next) == SEOL || OP(next) == EOS))
3775 {
a0d0e21e 3776 ln = n; /* why back off? */
1aeab75a
GS
3777 /* ...because $ and \Z can match before *and* after
3778 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3779 We should back off by one in this case. */
3780 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3781 ln--;
3782 }
02db2b7b 3783 REGCP_SET(lastcp);
c277df42 3784 if (paren) {
8fa7f367 3785 UV c = 0;
c277df42 3786 while (n >= ln) {
1aa99e6b
IH
3787 if (c1 != -1000) {
3788 if (do_utf8)
872c91ae 3789 c = utf8n_to_uvchr((U8*)PL_reginput,
a2a469f9 3790 UTF8_MAXBYTES, 0,
f5e9f069 3791 uniflags);
1aa99e6b 3792 else
9041c2e3 3793 c = UCHARAT(PL_reginput);
1aa99e6b 3794 }
c277df42 3795 /* If it could work, try it. */
eb160463 3796 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3797 {
29d1e993 3798 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3799 REGCP_UNWIND(lastcp);
c277df42
IZ
3800 }
3801 /* Couldn't or didn't -- back up. */
3802 n--;
dfe13c55 3803 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3804 }
a0ed51b3
LW
3805 }
3806 else {
8fa7f367 3807 UV c = 0;
c277df42 3808 while (n >= ln) {
1aa99e6b
IH
3809 if (c1 != -1000) {
3810 if (do_utf8)
872c91ae 3811 c = utf8n_to_uvchr((U8*)PL_reginput,
a2a469f9 3812 UTF8_MAXBYTES, 0,
f5e9f069 3813 uniflags);
1aa99e6b 3814 else
9041c2e3 3815 c = UCHARAT(PL_reginput);
1aa99e6b 3816 }
c277df42 3817 /* If it could work, try it. */
eb160463 3818 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3819 {
29d1e993 3820 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3821 REGCP_UNWIND(lastcp);
c277df42
IZ
3822 }
3823 /* Couldn't or didn't -- back up. */
3824 n--;
dfe13c55 3825 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3826 }
a0d0e21e
LW
3827 }
3828 }
4633a7c4 3829 sayNO;
c277df42 3830 break;
a0d0e21e 3831 case END:
0f5d15d6
IZ
3832 if (PL_reg_call_cc) {
3833 re_cc_state *cur_call_cc = PL_reg_call_cc;
3834 CURCUR *cctmp = PL_regcc;
3835 regexp *re = PL_reg_re;
3836 CHECKPOINT cp, lastcp;
3837
3838 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3839 REGCP_SET(lastcp);
0f5d15d6
IZ
3840 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3841 the caller. */
3842 PL_reginput = locinput; /* Make position available to
3843 the callcc. */
3844 cache_re(PL_reg_call_cc->re);
3845 PL_regcc = PL_reg_call_cc->cc;
3846 PL_reg_call_cc = PL_reg_call_cc->prev;
3847 if (regmatch(cur_call_cc->node)) {
3848 PL_reg_call_cc = cur_call_cc;
3849 regcpblow(cp);
3850 sayYES;
3851 }
02db2b7b 3852 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3853 regcppop();
3854 PL_reg_call_cc = cur_call_cc;
3855 PL_regcc = cctmp;
3856 PL_reg_re = re;
3857 cache_re(re);
3858
3859 DEBUG_r(
3860 PerlIO_printf(Perl_debug_log,
3861 "%*s continuation failed...\n",
3862 REPORT_CODE_OFF+PL_regindent*2, "")
3863 );
7821416a 3864 sayNO_SILENT;
0f5d15d6 3865 }
7821416a
IZ
3866 if (locinput < PL_regtill) {
3867 DEBUG_r(PerlIO_printf(Perl_debug_log,
3868 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3869 PL_colors[4],
3870 (long)(locinput - PL_reg_starttry),
3871 (long)(PL_regtill - PL_reg_starttry),
3872 PL_colors[5]));
3873 sayNO_FINAL; /* Cannot match: too short. */
3874 }
3875 PL_reginput = locinput; /* put where regtry can find it */
3876 sayYES_FINAL; /* Success! */
7e5428c5 3877 case SUCCEED:
3280af22 3878 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3879 sayYES_LOUD; /* Success! */
c277df42
IZ
3880 case SUSPEND:
3881 n = 1;
9fe1d20c 3882 PL_reginput = locinput;
9041c2e3 3883 goto do_ifmatch;
a0d0e21e 3884 case UNLESSM:
c277df42 3885 n = 0;
a0ed51b3 3886 if (scan->flags) {
efb30f32
HS
3887 s = HOPBACKc(locinput, scan->flags);
3888 if (!s)
3889 goto say_yes;
3890 PL_reginput = s;
a0ed51b3
LW
3891 }
3892 else
3893 PL_reginput = locinput;
c277df42
IZ
3894 goto do_ifmatch;
3895 case IFMATCH:
3896 n = 1;
a0ed51b3 3897 if (scan->flags) {
efb30f32
HS
3898 s = HOPBACKc(locinput, scan->flags);
3899 if (!s)
3900 goto say_no;
3901 PL_reginput = s;
a0ed51b3
LW
3902 }
3903 else
3904 PL_reginput = locinput;
3905
c277df42 3906 do_ifmatch:
c277df42
IZ
3907 inner = NEXTOPER(NEXTOPER(scan));
3908 if (regmatch(inner) != n) {
3909 say_no:
3910 if (logical) {
3911 logical = 0;
3912 sw = 0;
3913 goto do_longjump;
a0ed51b3
LW
3914 }
3915 else
c277df42
IZ
3916 sayNO;
3917 }
3918 say_yes:
3919 if (logical) {
3920 logical = 0;
3921 sw = 1;
3922 }
fe44a5e8 3923 if (OP(scan) == SUSPEND) {
3280af22 3924 locinput = PL_reginput;
565764a8 3925 nextchr = UCHARAT(locinput);
fe44a5e8 3926 }
c277df42
IZ
3927 /* FALL THROUGH. */
3928 case LONGJMP:
3929 do_longjump:
3930 next = scan + ARG(scan);
3931 if (next == scan)
3932 next = NULL;
a0d0e21e
LW
3933 break;
3934 default:
b900a521 3935 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3936 PTR2UV(scan), OP(scan));
cea2e8a9 3937 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3938 }
02db2b7b 3939 reenter:
a0d0e21e
LW
3940 scan = next;
3941 }
a687059c 3942
a0d0e21e
LW
3943 /*
3944 * We get here only if there's trouble -- normally "case END" is
3945 * the terminating point.
3946 */
cea2e8a9 3947 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3948 /*NOTREACHED*/
4633a7c4
LW
3949 sayNO;
3950
7821416a
IZ
3951yes_loud:
3952 DEBUG_r(
3953 PerlIO_printf(Perl_debug_log,
3954 "%*s %scould match...%s\n",
3955 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3956 );
3957 goto yes;
3958yes_final:
3959 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3960 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3961yes:
3962#ifdef DEBUGGING
3280af22 3963 PL_regindent--;
4633a7c4 3964#endif
02db2b7b
IZ
3965
3966#if 0 /* Breaks $^R */
3967 if (unwind)
3968 regcpblow(firstcp);
3969#endif
4633a7c4
LW
3970 return 1;
3971
3972no:
7821416a
IZ
3973 DEBUG_r(
3974 PerlIO_printf(Perl_debug_log,
3975 "%*s %sfailed...%s\n",
3976 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3977 );
3978 goto do_no;
3979no_final:
3980do_no:
02db2b7b
IZ
3981 if (unwind) {
3982 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3983
3984 switch (uw->type) {
3985 case RE_UNWIND_BRANCH:
3986 case RE_UNWIND_BRANCHJ:
3987 {
3988 re_unwind_branch_t *uwb = &(uw->branch);
8c18bf38 3989 const I32 lastparen = uwb->lastparen;
9041c2e3 3990
02db2b7b
IZ
3991 REGCP_UNWIND(uwb->lastcp);
3992 for (n = *PL_reglastparen; n > lastparen; n--)
3993 PL_regendp[n] = -1;
3994 *PL_reglastparen = n;
3995 scan = next = uwb->next;
9041c2e3
NIS
3996 if ( !scan ||
3997 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3998 ? BRANCH : BRANCHJ) ) { /* Failure */
3999 unwind = uwb->prev;
4000#ifdef DEBUGGING
4001 PL_regindent--;
4002#endif
4003 goto do_no;
4004 }
4005 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4006 if ((n = (uwb->type == RE_UNWIND_BRANCH
4007 ? NEXT_OFF(next) : ARG(next))))
4008 next += n;
4009 else
4010 next = NULL; /* XXXX Needn't unwinding in this case... */
4011 uwb->next = next;
4012 next = NEXTOPER(scan);
4013 if (uwb->type == RE_UNWIND_BRANCHJ)
4014 next = NEXTOPER(next);
4015 locinput = uwb->locinput;
4016 nextchr = uwb->nextchr;
4017#ifdef DEBUGGING
4018 PL_regindent = uwb->regindent;
4019#endif
4020
4021 goto reenter;
4022 }
4023 /* NOT REACHED */
4024 default:
4025 Perl_croak(aTHX_ "regexp unwind memory corruption");
4026 }
4027 /* NOT REACHED */
4028 }
4633a7c4 4029#ifdef DEBUGGING
3280af22 4030 PL_regindent--;
4633a7c4 4031#endif
a0d0e21e 4032 return 0;
a687059c
LW
4033}
4034
4035/*
4036 - regrepeat - repeatedly match something simple, report how many
4037 */
4038/*
4039 * [This routine now assumes that it will only match on things of length 1.
4040 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4041 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4042 */
76e3520e 4043STATIC I32
8c18bf38 4044S_regrepeat(pTHX_ const regnode *p, I32 max)
a687059c 4045{
a0d0e21e 4046 register char *scan;
a0d0e21e 4047 register I32 c;
3280af22 4048 register char *loceol = PL_regeol;
a0ed51b3 4049 register I32 hardcount = 0;
53c4c00c 4050 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4051
3280af22 4052 scan = PL_reginput;
faf11cac
HS
4053 if (max == REG_INFTY)
4054 max = I32_MAX;
4055 else if (max < loceol - scan)
a0d0e21e 4056 loceol = scan + max;
a0d0e21e 4057 switch (OP(p)) {
22c35a8c 4058 case REG_ANY:
1aa99e6b 4059 if (do_utf8) {
ffc61ed2 4060 loceol = PL_regeol;
1aa99e6b 4061 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4062 scan += UTF8SKIP(scan);
4063 hardcount++;
4064 }
4065 } else {
4066 while (scan < loceol && *scan != '\n')
4067 scan++;
a0ed51b3
LW
4068 }
4069 break;
ffc61ed2 4070 case SANY:
def8e4ea
JH
4071 if (do_utf8) {
4072 loceol = PL_regeol;
a0804c9e 4073 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4074 scan += UTF8SKIP(scan);
4075 hardcount++;
4076 }
4077 }
4078 else
4079 scan = loceol;
a0ed51b3 4080 break;
f33976b4
DB
4081 case CANY:
4082 scan = loceol;
4083 break;
bbce6d69 4084 case EXACT: /* length of string is 1 */
cd439c50 4085 c = (U8)*STRING(p);
bbce6d69 4086 while (scan < loceol && UCHARAT(scan) == c)
4087 scan++;
4088 break;
4089 case EXACTF: /* length of string is 1 */
cd439c50 4090 c = (U8)*STRING(p);
bbce6d69 4091 while (scan < loceol &&
22c35a8c 4092 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4093 scan++;
4094 break;
4095 case EXACTFL: /* length of string is 1 */
3280af22 4096 PL_reg_flags |= RF_tainted;
cd439c50 4097 c = (U8)*STRING(p);
bbce6d69 4098 while (scan < loceol &&
22c35a8c 4099 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4100 scan++;
4101 break;
4102 case ANYOF:
ffc61ed2
JH
4103 if (do_utf8) {
4104 loceol = PL_regeol;
338501c1
JH
4105 while (hardcount < max && scan < loceol &&
4106 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4107 scan += UTF8SKIP(scan);
4108 hardcount++;
4109 }
4110 } else {
7d3e948e 4111 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4112 scan++;
4113 }
a0d0e21e
LW
4114 break;
4115 case ALNUM:
1aa99e6b 4116 if (do_utf8) {
ffc61ed2 4117 loceol = PL_regeol;
fab1f349 4118 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4119 while (hardcount < max && scan < loceol &&
3568d838 4120 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4121 scan += UTF8SKIP(scan);
4122 hardcount++;
4123 }
4124 } else {
4125 while (scan < loceol && isALNUM(*scan))
4126 scan++;
a0ed51b3
LW
4127 }
4128 break;
bbce6d69 4129 case ALNUML:
3280af22 4130 PL_reg_flags |= RF_tainted;
1aa99e6b 4131 if (do_utf8) {
ffc61ed2 4132 loceol = PL_regeol;
1aa99e6b
IH
4133 while (hardcount < max && scan < loceol &&
4134 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4135 scan += UTF8SKIP(scan);
4136 hardcount++;
4137 }
4138 } else {
4139 while (scan < loceol && isALNUM_LC(*scan))
4140 scan++;
a0ed51b3
LW
4141 }
4142 break;
a0d0e21e 4143 case NALNUM:
1aa99e6b 4144 if (do_utf8) {
ffc61ed2 4145 loceol = PL_regeol;
fab1f349 4146 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4147 while (hardcount < max && scan < loceol &&
3568d838 4148 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4149 scan += UTF8SKIP(scan);
4150 hardcount++;
4151 }
4152 } else {
4153 while (scan < loceol && !isALNUM(*scan))
4154 scan++;
a0ed51b3
LW
4155 }
4156 break;
bbce6d69 4157 case NALNUML:
3280af22 4158 PL_reg_flags |= RF_tainted;
1aa99e6b 4159 if (do_utf8) {
ffc61ed2 4160 loceol = PL_regeol;
1aa99e6b
IH
4161 while (hardcount < max && scan < loceol &&
4162 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4163 scan += UTF8SKIP(scan);
4164 hardcount++;
4165 }
4166 } else {
4167 while (scan < loceol && !isALNUM_LC(*scan))
4168 scan++;
a0ed51b3
LW
4169 }
4170 break;
a0d0e21e 4171 case SPACE:
1aa99e6b 4172 if (do_utf8) {
ffc61ed2 4173 loceol = PL_regeol;
fab1f349 4174 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4175 while (hardcount < max && scan < loceol &&
3568d838
JH
4176 (*scan == ' ' ||
4177 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4178 scan += UTF8SKIP(scan);
4179 hardcount++;
4180 }
4181 } else {
4182 while (scan < loceol && isSPACE(*scan))
4183 scan++;
a0ed51b3
LW
4184 }
4185 break;
bbce6d69 4186 case SPACEL:
3280af22 4187 PL_reg_flags |= RF_tainted;
1aa99e6b 4188 if (do_utf8) {
ffc61ed2 4189 loceol = PL_regeol;
1aa99e6b 4190 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4191 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4192 scan += UTF8SKIP(scan);
4193 hardcount++;
4194 }
4195 } else {
4196 while (scan < loceol && isSPACE_LC(*scan))
4197 scan++;
a0ed51b3
LW
4198 }
4199 break;
a0d0e21e 4200 case NSPACE:
1aa99e6b 4201 if (do_utf8) {
ffc61ed2 4202 loceol = PL_regeol;
fab1f349 4203 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4204 while (hardcount < max && scan < loceol &&
3568d838
JH
4205 !(*scan == ' ' ||
4206 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4207 scan += UTF8SKIP(scan);
4208 hardcount++;
4209 }
4210 } else {
4211 while (scan < loceol && !isSPACE(*scan))
4212 scan++;
4213 break;
a0ed51b3 4214 }
bbce6d69 4215 case NSPACEL:
3280af22 4216 PL_reg_flags |= RF_tainted;
1aa99e6b 4217 if (do_utf8) {
ffc61ed2 4218 loceol = PL_regeol;
1aa99e6b 4219 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4220 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4221 scan += UTF8SKIP(scan);
4222 hardcount++;
4223 }
4224 } else {
4225 while (scan < loceol && !isSPACE_LC(*scan))
4226 scan++;
a0ed51b3
LW
4227 }
4228 break;
a0d0e21e 4229 case DIGIT:
1aa99e6b 4230 if (do_utf8) {
ffc61ed2 4231 loceol = PL_regeol;
fab1f349 4232 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4233 while (hardcount < max && scan < loceol &&
3568d838 4234 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4235 scan += UTF8SKIP(scan);
4236 hardcount++;
4237 }
4238 } else {
4239 while (scan < loceol && isDIGIT(*scan))
4240 scan++;
a0ed51b3
LW
4241 }
4242 break;
a0d0e21e 4243 case NDIGIT:
1aa99e6b 4244 if (do_utf8) {
ffc61ed2 4245 loceol = PL_regeol;
fab1f349 4246 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4247 while (hardcount < max && scan < loceol &&
3568d838 4248 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4249 scan += UTF8SKIP(scan);
4250 hardcount++;
4251 }
4252 } else {
4253 while (scan < loceol && !isDIGIT(*scan))
4254 scan++;
a0ed51b3
LW
4255 }
4256 break;
a0d0e21e
LW
4257 default: /* Called on something of 0 width. */
4258 break; /* So match right here or not at all. */
4259 }
a687059c 4260
a0ed51b3
LW
4261 if (hardcount)
4262 c = hardcount;
4263 else
4264 c = scan - PL_reginput;
3280af22 4265 PL_reginput = scan;
a687059c 4266
9041c2e3 4267 DEBUG_r(
c277df42
IZ
4268 {
4269 SV *prop = sv_newmortal();
4270
47a91a97 4271 regprop(prop, (regnode *)p);
9041c2e3
NIS
4272 PerlIO_printf(Perl_debug_log,
4273 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
fdac8c4b 4274 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
c277df42 4275 });
9041c2e3 4276
a0d0e21e 4277 return(c);
a687059c
LW
4278}
4279
4280/*
c277df42 4281 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4282 *
b035a42e 4283 * The repeater is supposed to have constant non-zero length.
c277df42
IZ
4284 */
4285
76e3520e 4286STATIC I32
cea2e8a9 4287S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4288{
0e2d6244 4289 register char *scan = NULL;
c277df42 4290 register char *start;
3280af22 4291 register char *loceol = PL_regeol;
a0ed51b3 4292 I32 l = 0;
708e3b05 4293 I32 count = 0, res = 1;
a0ed51b3
LW
4294
4295 if (!max)
4296 return 0;
c277df42 4297
3280af22 4298 start = PL_reginput;
53c4c00c 4299 if (PL_reg_match_utf8) {
708e3b05 4300 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4301 if (!count++) {
4302 l = 0;
4303 while (start < PL_reginput) {
4304 l++;
4305 start += UTF8SKIP(start);
4306 }
4307 *lp = l;
4308 if (l == 0)
4309 return max;
4310 }
4311 if (count == max)
4312 return count;
4313 }
4314 }
4315 else {
708e3b05 4316 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4317 if (!count++) {
4318 *lp = l = PL_reginput - start;
4319 if (max != REG_INFTY && l*max < loceol - scan)
4320 loceol = scan + l*max;
4321 if (l == 0)
4322 return max;
c277df42
IZ
4323 }
4324 }
4325 }
708e3b05 4326 if (!res)
3280af22 4327 PL_reginput = scan;
9041c2e3 4328
a0ed51b3 4329 return count;
c277df42
IZ
4330}
4331
4332/*
ffc61ed2
JH
4333- regclass_swash - prepare the utf8 swash
4334*/
4335
4336SV *
9e55ce06 4337Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4338{
9e55ce06
JH
4339 SV *sw = NULL;
4340 SV *si = NULL;
4341 SV *alt = NULL;
ffc61ed2
JH
4342
4343 if (PL_regdata && PL_regdata->count) {
8c18bf38 4344 const U32 n = ARG(node);
ffc61ed2
JH
4345
4346 if (PL_regdata->what[n] == 's') {
c6d79d47
AL
4347 SV * const rv = (SV*)PL_regdata->data[n];
4348 AV * const av = (AV*)SvRV((SV*)rv);
339a2a6a 4349 SV **const ary = AvARRAY(av);
9e55ce06 4350 SV **a, **b;
9041c2e3 4351
ef049b7c 4352 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4353 * documentation of these array elements. */
4354
b11f357e 4355 si = *ary;
19c080ff 4356 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4357 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4358
ffc61ed2
JH
4359 if (a)
4360 sw = *a;
4361 else if (si && doinit) {
4362 sw = swash_init("utf8", "", si, 1, 0);
4363 (void)av_store(av, 1, sw);
4364 }
9e55ce06
JH
4365 if (b)
4366 alt = *b;
ffc61ed2
JH
4367 }
4368 }
4369
9e55ce06
JH
4370 if (listsvp)
4371 *listsvp = si;
4372 if (altsvp)
4373 *altsvp = alt;
ffc61ed2
JH
4374
4375 return sw;
4376}
4377
4378/*
ba7b4546 4379 - reginclass - determine if a character falls into a character class
832705d4
JH
4380
4381 The n is the ANYOF regnode, the p is the target string, lenp
4382 is pointer to the maximum length of how far to go in the p
4383 (if the lenp is zero, UTF8SKIP(p) is used),
4384 do_utf8 tells whether the target string is in UTF-8.
4385
bbce6d69 4386 */
4387
76e3520e 4388STATIC bool
8c18bf38 4389S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4390{
8c18bf38 4391 const char flags = ANYOF_FLAGS(n);
bbce6d69 4392 bool match = FALSE;
cc07378b 4393 UV c = *p;
ae9ddab8 4394 STRLEN len = 0;
9e55ce06 4395 STRLEN plen;
1aa99e6b 4396
19c080ff 4397 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
47a91a97 4398 c = utf8n_to_uvchr((U8 *)p, UTF8_MAXBYTES, &len,
19c080ff
NC
4399 ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY :
4400 UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY);
4401 if (len == (STRLEN)-1)
4402 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4403 }
bbce6d69 4404
0f0076b4 4405 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4406 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4407 if (lenp)
4408 *lenp = 0;
ffc61ed2 4409 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4410 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4411 match = TRUE;
bbce6d69 4412 }
3568d838 4413 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4414 match = TRUE;
ffc61ed2 4415 if (!match) {
9e55ce06 4416 AV *av;
47a91a97 4417 SV * const sw = regclass_swash((regnode *)n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4418
4419 if (sw) {
47a91a97 4420 if (swash_fetch(sw, (U8 *)p, do_utf8))
ffc61ed2
JH
4421 match = TRUE;
4422 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4423 if (!match && lenp && av) {
4424 I32 i;
9e55ce06 4425 for (i = 0; i <= av_len(av); i++) {
c6d79d47 4426 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4427 STRLEN len;
c6d79d47 4428 const char * const s = SvPV_const(sv, len);
9e55ce06 4429
061b10df 4430 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4431 *lenp = len;
4432 match = TRUE;
4433 break;
4434 }
4435 }
4436 }
4437 if (!match) {
a2a469f9 4438 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4439 STRLEN tmplen;
4440
47a91a97 4441 to_utf8_fold((U8 *)p, tmpbuf, &tmplen);
9e55ce06
JH
4442 if (swash_fetch(sw, tmpbuf, do_utf8))
4443 match = TRUE;
4444 }
ffc61ed2
JH
4445 }
4446 }
bbce6d69 4447 }
9e55ce06 4448 if (match && lenp && *lenp == 0)
0f0076b4 4449 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4450 }
1aa99e6b 4451 if (!match && c < 256) {
ffc61ed2
JH
4452 if (ANYOF_BITMAP_TEST(n, c))
4453 match = TRUE;
4454 else if (flags & ANYOF_FOLD) {
eb160463 4455 U8 f;
a0ed51b3 4456
ffc61ed2
JH
4457 if (flags & ANYOF_LOCALE) {
4458 PL_reg_flags |= RF_tainted;
4459 f = PL_fold_locale[c];
4460 }
4461 else
4462 f = PL_fold[c];
4463 if (f != c && ANYOF_BITMAP_TEST(n, f))
4464 match = TRUE;
4465 }
4466
4467 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4468 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4469 if (
4470 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4481 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4482 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4483 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4484 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4485 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4486 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4487 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4488 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4489 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4490 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4491 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4492 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4493 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4494 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4495 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4496 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4497 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4498 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4499 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4500 ) /* How's that for a conditional? */
4501 {
4502 match = TRUE;
4503 }
a0ed51b3 4504 }
a0ed51b3
LW
4505 }
4506
a0ed51b3
LW
4507 return (flags & ANYOF_INVERT) ? !match : match;
4508}
161b471a 4509
dfe13c55 4510STATIC U8 *
cea2e8a9 4511S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4512{
1aa99e6b
IH
4513 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4514}
4515
4516STATIC U8 *
4517S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4518{
a0ed51b3 4519 if (off >= 0) {
1aa99e6b 4520 while (off-- && s < lim) {
ffc61ed2 4521 /* XXX could check well-formedness here */
a0ed51b3 4522 s += UTF8SKIP(s);
ffc61ed2 4523 }
a0ed51b3
LW
4524 }
4525 else {
4526 while (off++) {
1aa99e6b 4527 if (s > lim) {
a0ed51b3 4528 s--;
ffc61ed2 4529 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4530 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4531 s--;
ffc61ed2
JH
4532 }
4533 /* XXX could check well-formedness here */
a0ed51b3
LW
4534 }
4535 }
4536 }
4537 return s;
4538}
161b471a 4539
dfe13c55 4540STATIC U8 *
1aa99e6b 4541S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4542{
1aa99e6b
IH
4543 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4544}
4545
4546STATIC U8 *
4547S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4548{
4549 if (off >= 0) {
1aa99e6b 4550 while (off-- && s < lim) {
ffc61ed2 4551 /* XXX could check well-formedness here */
a0ed51b3 4552 s += UTF8SKIP(s);
ffc61ed2 4553 }
a0ed51b3
LW
4554 if (off >= 0)
4555 return 0;
4556 }
4557 else {
4558 while (off++) {
1aa99e6b 4559 if (s > lim) {
a0ed51b3 4560 s--;
ffc61ed2 4561 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4562 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4563 s--;
ffc61ed2
JH
4564 }
4565 /* XXX could check well-formedness here */
a0ed51b3
LW
4566 }
4567 else
4568 break;
4569 }
4570 if (off <= 0)
4571 return 0;
4572 }
4573 return s;
4574}
51371543 4575
51371543 4576static void
acfe0abc 4577restore_pos(pTHX_ void *arg)
51371543 4578{
a2592645 4579 PERL_UNUSED_ARG(arg);
51371543
GS
4580 if (PL_reg_eval_set) {
4581 if (PL_reg_oldsaved) {
4582 PL_reg_re->subbeg = PL_reg_oldsaved;
4583 PL_reg_re->sublen = PL_reg_oldsavedlen;
4584 RX_MATCH_COPIED_on(PL_reg_re);
4585 }
4586 PL_reg_magic->mg_len = PL_reg_oldpos;
4587 PL_reg_eval_set = 0;
4588 PL_curpm = PL_reg_oldcurpm;
4589 }
4590}
33b8afdf
JH
4591
4592STATIC void
4593S_to_utf8_substr(pTHX_ register regexp *prog)
4594{
33b8afdf 4595 if (prog->float_substr && !prog->float_utf8) {
a2592645 4596 SV* sv;
04851bb3 4597 prog->float_utf8 = sv = newSVsv(prog->float_substr);
33b8afdf
JH
4598 sv_utf8_upgrade(sv);
4599 if (SvTAIL(prog->float_substr))
4600 SvTAIL_on(sv);
4601 if (prog->float_substr == prog->check_substr)
4602 prog->check_utf8 = sv;
4603 }
4604 if (prog->anchored_substr && !prog->anchored_utf8) {
a2592645 4605 SV* sv;
04851bb3 4606 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
33b8afdf
JH
4607 sv_utf8_upgrade(sv);
4608 if (SvTAIL(prog->anchored_substr))
4609 SvTAIL_on(sv);
4610 if (prog->anchored_substr == prog->check_substr)
4611 prog->check_utf8 = sv;
4612 }
4613}
4614
4615STATIC void
4616S_to_byte_substr(pTHX_ register regexp *prog)
4617{
33b8afdf 4618 if (prog->float_utf8 && !prog->float_substr) {
a2592645 4619 SV* sv;
04851bb3 4620 prog->float_substr = sv = newSVsv(prog->float_utf8);
33b8afdf
JH
4621 if (sv_utf8_downgrade(sv, TRUE)) {
4622 if (SvTAIL(prog->float_utf8))
4623 SvTAIL_on(sv);
4624 } else {
4625 SvREFCNT_dec(sv);
4626 prog->float_substr = sv = &PL_sv_undef;
4627 }
4628 if (prog->float_utf8 == prog->check_utf8)
4629 prog->check_substr = sv;
4630 }
4631 if (prog->anchored_utf8 && !prog->anchored_substr) {
a2592645 4632 SV* sv;
04851bb3 4633 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
33b8afdf
JH
4634 if (sv_utf8_downgrade(sv, TRUE)) {
4635 if (SvTAIL(prog->anchored_utf8))
4636 SvTAIL_on(sv);
4637 } else {
4638 SvREFCNT_dec(sv);
4639 prog->anchored_substr = sv = &PL_sv_undef;
4640 }
4641 if (prog->anchored_utf8 == prog->check_utf8)
4642 prog->check_substr = sv;
4643 }
4644}
d8294a4d
NC
4645
4646/*
4647 * Local variables:
4648 * c-indentation-style: bsd
4649 * c-basic-offset: 4
4650 * indent-tabs-mode: t
4651 * End:
4652 *
4653 * ex: set ts=8 sts=4 sw=4 noet:
4654 */