This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH perl-current] Re: [perl #31793] Data::Dumper: Useqq interacts badly with...
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29
DM
10 * a regular expression.
11 */
12
a687059c
LW
13/* NOTE: this is derived from Henry Spencer's regexp code, and should not
14 * confused with the original package (see point 3 below). Thanks, Henry!
15 */
16
17/* Additional note: this code is very heavily munged from Henry's version
18 * in places. In some spots I've traded clarity for efficiency, so don't
19 * blame Henry for some of the lack of readability.
20 */
21
e50aee73
AD
22/* The names of the functions have been changed from regcomp and
23 * regexec to pregcomp and pregexec in order to avoid conflicts
24 * with the POSIX routines of the same names.
25*/
26
b9d5759e
AD
27#ifdef PERL_EXT_RE_BUILD
28/* need to replace pregcomp et al, so enable that */
29# ifndef PERL_IN_XSUB_RE
30# define PERL_IN_XSUB_RE
31# endif
32/* need access to debugger hooks */
cad2e5aa 33# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
34# define DEBUGGING
35# endif
36#endif
37
38#ifdef PERL_IN_XSUB_RE
d06ea78c 39/* We *really* need to overwrite these symbols: */
56953603
IZ
40# define Perl_regexec_flags my_regexec
41# define Perl_regdump my_regdump
42# define Perl_regprop my_regprop
cad2e5aa 43# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
44/* *These* symbols are masked to allow static link. */
45# define Perl_pregexec my_pregexec
9041c2e3 46# define Perl_reginitcolors my_reginitcolors
490a3f88 47# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
48
49# define PERL_NO_GET_CONTEXT
9041c2e3 50#endif
56953603 51
f0fcb552 52/*SUPPRESS 112*/
a687059c 53/*
e50aee73 54 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
55 *
56 * Copyright (c) 1986 by University of Toronto.
57 * Written by Henry Spencer. Not derived from licensed software.
58 *
59 * Permission is granted to anyone to use this software for any
60 * purpose on any computer system, and to redistribute it freely,
61 * subject to the following restrictions:
62 *
63 * 1. The author is not responsible for the consequences of use of
64 * this software, no matter how awful, even if they arise
65 * from defects in it.
66 *
67 * 2. The origin of this software must not be misrepresented, either
68 * by explicit claim or by omission.
69 *
70 * 3. Altered versions must be plainly marked as such, and must not
71 * be misrepresented as being the original software.
72 *
73 **** Alterations to Henry's code are...
74 ****
4bb101f2 75 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b5f8cc5c 76 **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
a687059c 77 ****
9ef589d8
LW
78 **** You may distribute under the terms of either the GNU General Public
79 **** License or the Artistic License, as specified in the README file.
a687059c
LW
80 *
81 * Beware that some of this code is subtly aware of the way operator
82 * precedence is structured in regular expressions. Serious changes in
83 * regular-expression syntax might require a total rethink.
84 */
85#include "EXTERN.h"
864dbfa3 86#define PERL_IN_REGEXEC_C
a687059c 87#include "perl.h"
0f5d15d6 88
a687059c
LW
89#include "regcomp.h"
90
c277df42
IZ
91#define RF_tainted 1 /* tainted information used? */
92#define RF_warned 2 /* warned about big count? */
ce862d02 93#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3 94#define RF_utf8 8 /* String contains multibyte chars? */
c2b0868c 95#define RF_false 16 /* odd number of nested negatives */
a0ed51b3 96
eb160463 97#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
98
99#define RS_init 1 /* eval environment created */
100#define RS_set 2 /* replsv value is set */
c277df42 101
a687059c
LW
102#ifndef STATIC
103#define STATIC static
104#endif
105
ba7b4546 106#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
33b8afdf 112#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 113#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 114
dfe13c55
GS
115#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
116#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
117#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
118#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
119#define HOPc(pos,off) ((char*)HOP(pos,off))
120#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 121
efb30f32 122#define HOPBACK(pos, off) ( \
e54858b0 123 (PL_reg_match_utf8) \
efb30f32
HS
124 ? reghopmaybe((U8*)pos, -off) \
125 : (pos - off >= PL_bostr) \
126 ? (U8*)(pos - off) \
127 : (U8*)NULL \
128)
129#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
130
1aa99e6b
IH
131#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
132#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
133#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
134#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
135#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
136#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
137
977e4b84 138#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 139
5f80c4cf 140/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
141#define JUMPABLE(rn) ( \
142 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
143 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
144 OP(rn) == PLUS || OP(rn) == MINMOD || \
145 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
146)
147
cca55fe3
JP
148#define HAS_TEXT(rn) ( \
149 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
150)
e2d8ce26 151
a84d97b6
HS
152/*
153 Search for mandatory following text node; for lookahead, the text must
154 follow but for lookbehind (rn->flags != 0) we skip to the next step.
155*/
cca55fe3 156#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 157 while (JUMPABLE(rn)) \
a84d97b6 158 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 159 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
160 else if (OP(rn) == PLUS) \
161 rn = NEXTOPER(rn); \
a84d97b6
HS
162 else if (OP(rn) == IFMATCH) \
163 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 164 else rn += NEXT_OFF(rn); \
5f80c4cf 165} STMT_END
74750237 166
acfe0abc 167static void restore_pos(pTHX_ void *arg);
51371543 168
76e3520e 169STATIC CHECKPOINT
cea2e8a9 170S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 171{
3280af22 172 int retval = PL_savestack_ix;
b1ce53c5
JH
173#define REGCP_PAREN_ELEMS 4
174 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
175 int p;
176
e49a9654
IH
177 if (paren_elems_to_push < 0)
178 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
179
a01268b5 180#define REGCP_OTHER_ELEMS 6
4b3c1a47 181 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 182 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 183/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
184 SSPUSHINT(PL_regendp[p]);
185 SSPUSHINT(PL_regstartp[p]);
3280af22 186 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
187 SSPUSHINT(p);
188 }
b1ce53c5 189/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
190 SSPUSHINT(PL_regsize);
191 SSPUSHINT(*PL_reglastparen);
a01268b5 192 SSPUSHINT(*PL_reglastcloseparen);
3280af22 193 SSPUSHPTR(PL_reginput);
41123dfd
JH
194#define REGCP_FRAME_ELEMS 2
195/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
196 * are needed for the regexp context stack bookkeeping. */
197 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 198 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 199
a0d0e21e
LW
200 return retval;
201}
202
c277df42 203/* These are needed since we do not localize EVAL nodes: */
02db2b7b 204# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 206 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 207
02db2b7b 208# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 209 PerlIO_printf(Perl_debug_log, \
faccc32b 210 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 211 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 212
76e3520e 213STATIC char *
cea2e8a9 214S_regcppop(pTHX)
a0d0e21e 215{
b1ce53c5 216 I32 i;
a0d0e21e
LW
217 U32 paren = 0;
218 char *input;
cf93c79d 219 I32 tmps;
b1ce53c5
JH
220
221 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 222 i = SSPOPINT;
b1ce53c5
JH
223 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
224 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 225 input = (char *) SSPOPPTR;
a01268b5 226 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
227 *PL_reglastparen = SSPOPINT;
228 PL_regsize = SSPOPINT;
b1ce53c5
JH
229
230 /* Now restore the parentheses context. */
41123dfd
JH
231 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
232 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 233 paren = (U32)SSPOPINT;
3280af22 234 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
235 PL_regstartp[paren] = SSPOPINT;
236 tmps = SSPOPINT;
3280af22
NIS
237 if (paren <= *PL_reglastparen)
238 PL_regendp[paren] = tmps;
c277df42 239 DEBUG_r(
c3464db5 240 PerlIO_printf(Perl_debug_log,
b900a521 241 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 242 (UV)paren, (IV)PL_regstartp[paren],
b900a521 243 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 244 (IV)PL_regendp[paren],
3280af22 245 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 246 );
a0d0e21e 247 }
c277df42 248 DEBUG_r(
eb160463 249 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 250 PerlIO_printf(Perl_debug_log,
faccc32b
JH
251 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
252 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
253 }
254 );
daf18116 255#if 1
dafc8851
JH
256 /* It would seem that the similar code in regtry()
257 * already takes care of this, and in fact it is in
258 * a better location to since this code can #if 0-ed out
259 * but the code in regtry() is needed or otherwise tests
260 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
261 * (as of patchlevel 7877) will fail. Then again,
262 * this code seems to be necessary or otherwise
263 * building DynaLoader will fail:
264 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
265 * --jhi */
eb160463
GS
266 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
267 if ((I32)paren > PL_regsize)
cf93c79d
IZ
268 PL_regstartp[paren] = -1;
269 PL_regendp[paren] = -1;
a0d0e21e 270 }
dafc8851 271#endif
a0d0e21e
LW
272 return input;
273}
274
0f5d15d6 275STATIC char *
cea2e8a9 276S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
277{
278 I32 tmp = PL_savestack_ix;
279
280 PL_savestack_ix = ss;
281 regcppop();
282 PL_savestack_ix = tmp;
942e002e 283 return Nullch;
0f5d15d6
IZ
284}
285
286typedef struct re_cc_state
287{
288 I32 ss;
289 regnode *node;
290 struct re_cc_state *prev;
291 CURCUR *cc;
292 regexp *re;
293} re_cc_state;
294
02db2b7b 295#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 296
29d1e993
HS
297#define TRYPAREN(paren, n, input) { \
298 if (paren) { \
299 if (n) { \
300 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
301 PL_regendp[paren] = input - PL_bostr; \
302 } \
303 else \
304 PL_regendp[paren] = -1; \
305 } \
306 if (regmatch(next)) \
307 sayYES; \
308 if (paren && n) \
309 PL_regendp[paren] = -1; \
310}
311
312
a687059c 313/*
e50aee73 314 * pregexec and friends
a687059c
LW
315 */
316
317/*
c277df42 318 - pregexec - match a regexp against a string
a687059c 319 */
c277df42 320I32
864dbfa3 321Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 322 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
323/* strend: pointer to null at end of string */
324/* strbeg: real beginning of string */
325/* minend: end of match must be >=minend after stringarg. */
326/* nosave: For optimizations. */
327{
328 return
9041c2e3 329 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
330 nosave ? 0 : REXEC_COPY_STR);
331}
0f5d15d6
IZ
332
333STATIC void
cea2e8a9 334S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
335{
336 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
337#ifdef DEBUGGING
338 PL_regprogram = prog->program;
339#endif
340 PL_regnpar = prog->nparens;
9041c2e3
NIS
341 PL_regdata = prog->data;
342 PL_reg_re = prog;
0f5d15d6 343}
22e551b9 344
9041c2e3 345/*
cad2e5aa
JH
346 * Need to implement the following flags for reg_anch:
347 *
348 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
349 * USE_INTUIT_ML
350 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
351 * INTUIT_AUTORITATIVE_ML
352 * INTUIT_ONCE_NOML - Intuit can match in one location only.
353 * INTUIT_ONCE_ML
354 *
355 * Another flag for this function: SECOND_TIME (so that float substrs
356 * with giant delta may be not rechecked).
357 */
358
359/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
360
2c2d71f5 361/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
362 Otherwise, only SvCUR(sv) is used to get strbeg. */
363
364/* XXXX We assume that strpos is strbeg unless sv. */
365
6eb5f6b9
JH
366/* XXXX Some places assume that there is a fixed substring.
367 An update may be needed if optimizer marks as "INTUITable"
368 RExen without fixed substrings. Similarly, it is assumed that
369 lengths of all the strings are no more than minlen, thus they
370 cannot come from lookahead.
371 (Or minlen should take into account lookahead.) */
372
2c2d71f5
JH
373/* A failure to find a constant substring means that there is no need to make
374 an expensive call to REx engine, thus we celebrate a failure. Similarly,
375 finding a substring too deep into the string means that less calls to
30944b6d
IZ
376 regtry() should be needed.
377
378 REx compiler's optimizer found 4 possible hints:
379 a) Anchored substring;
380 b) Fixed substring;
381 c) Whether we are anchored (beginning-of-line or \G);
382 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 383 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
384 string which does not contradict any of them.
385 */
2c2d71f5 386
6eb5f6b9
JH
387/* Most of decisions we do here should have been done at compile time.
388 The nodes of the REx which we used for the search should have been
389 deleted from the finite automaton. */
390
cad2e5aa
JH
391char *
392Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
393 char *strend, U32 flags, re_scream_pos_data *data)
394{
b7953727 395 register I32 start_shift = 0;
cad2e5aa 396 /* Should be nonnegative! */
b7953727 397 register I32 end_shift = 0;
2c2d71f5
JH
398 register char *s;
399 register SV *check;
a1933d95 400 char *strbeg;
cad2e5aa 401 char *t;
33b8afdf 402 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 403 I32 ml_anch;
6eb5f6b9 404 register char *other_last = Nullch; /* other substr checked before this */
b7953727 405 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
406#ifdef DEBUGGING
407 char *i_strpos = strpos;
ce333219 408 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 409#endif
a30b2f1f 410 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 411
b8d68ded
JH
412 if (prog->reganch & ROPT_UTF8) {
413 DEBUG_r(PerlIO_printf(Perl_debug_log,
414 "UTF-8 regex...\n"));
415 PL_reg_flags |= RF_utf8;
416 }
417
2a782b5b 418 DEBUG_r({
b8d68ded 419 char *s = PL_reg_match_utf8 ?
c728cb41
JH
420 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
421 strpos;
b8d68ded
JH
422 int len = PL_reg_match_utf8 ?
423 strlen(s) : strend - strpos;
2a782b5b
JH
424 if (!PL_colorset)
425 reginitcolors();
b8d68ded
JH
426 if (PL_reg_match_utf8)
427 DEBUG_r(PerlIO_printf(Perl_debug_log,
428 "UTF-8 target...\n"));
2a782b5b
JH
429 PerlIO_printf(Perl_debug_log,
430 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
431 PL_colors[4],PL_colors[5],PL_colors[0],
432 prog->precomp,
433 PL_colors[1],
434 (strlen(prog->precomp) > 60 ? "..." : ""),
435 PL_colors[0],
436 (int)(len > 60 ? 60 : len),
437 s, PL_colors[1],
438 (len > 60 ? "..." : "")
439 );
440 });
cad2e5aa 441
c344f387
JH
442 /* CHR_DIST() would be more correct here but it makes things slow. */
443 if (prog->minlen > strend - strpos) {
a72c7584
JH
444 DEBUG_r(PerlIO_printf(Perl_debug_log,
445 "String too short... [re_intuit_start]\n"));
cad2e5aa 446 goto fail;
2c2d71f5 447 }
a1933d95 448 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 449 PL_regeol = strend;
33b8afdf
JH
450 if (do_utf8) {
451 if (!prog->check_utf8 && prog->check_substr)
452 to_utf8_substr(prog);
453 check = prog->check_utf8;
454 } else {
455 if (!prog->check_substr && prog->check_utf8)
456 to_byte_substr(prog);
457 check = prog->check_substr;
458 }
459 if (check == &PL_sv_undef) {
460 DEBUG_r(PerlIO_printf(Perl_debug_log,
461 "Non-utf string cannot match utf check string\n"));
462 goto fail;
463 }
2c2d71f5 464 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
465 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
466 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 467 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 468
7e25d62c
JH
469 if (!ml_anch) {
470 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
471 | ROPT_IMPLICIT)) /* not a real BOL */
472 /* SvCUR is not set on references: SvRV and SvPVX overlap */
473 && sv && !SvROK(sv)
474 && (strpos != strbeg)) {
475 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
476 goto fail;
477 }
478 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 479 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 480 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
481 I32 slen;
482
1aa99e6b 483 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
484 if (SvTAIL(check)) {
485 slen = SvCUR(check); /* >= 1 */
cad2e5aa 486
9041c2e3 487 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
488 || (strend - s == slen && strend[-1] != '\n')) {
489 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
490 goto fail_finish;
cad2e5aa
JH
491 }
492 /* Now should match s[0..slen-2] */
493 slen--;
653099ff 494 if (slen && (*SvPVX(check) != *s
cad2e5aa 495 || (slen > 1
653099ff 496 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
497 report_neq:
498 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
499 goto fail_finish;
500 }
cad2e5aa 501 }
653099ff
GS
502 else if (*SvPVX(check) != *s
503 || ((slen = SvCUR(check)) > 1
504 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
505 goto report_neq;
506 goto success_at_start;
7e25d62c 507 }
cad2e5aa 508 }
2c2d71f5 509 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 510 s = strpos;
2c2d71f5 511 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 512 end_shift = prog->minlen - start_shift -
653099ff 513 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 514 if (!ml_anch) {
653099ff
GS
515 I32 end = prog->check_offset_max + CHR_SVLEN(check)
516 - (SvTAIL(check) != 0);
1aa99e6b 517 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
518
519 if (end_shift < eshift)
520 end_shift = eshift;
521 }
cad2e5aa 522 }
2c2d71f5 523 else { /* Can match at random position */
cad2e5aa
JH
524 ml_anch = 0;
525 s = strpos;
2c2d71f5
JH
526 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
527 /* Should be nonnegative! */
528 end_shift = prog->minlen - start_shift -
653099ff 529 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
530 }
531
2c2d71f5 532#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 533 if (end_shift < 0)
6bbae5e6 534 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
535#endif
536
2c2d71f5
JH
537 restart:
538 /* Find a possible match in the region s..strend by looking for
539 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 540 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
541 I32 p = -1; /* Internal iterator of scream. */
542 I32 *pp = data ? data->scream_pos : &p;
543
2c2d71f5
JH
544 if (PL_screamfirst[BmRARE(check)] >= 0
545 || ( BmRARE(check) == '\n'
546 && (BmPREVIOUS(check) == SvCUR(check) - 1)
547 && SvTAIL(check) ))
9041c2e3 548 s = screaminstr(sv, check,
2c2d71f5 549 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 550 else
2c2d71f5 551 goto fail_finish;
4addbd3b
HS
552 /* we may be pointing at the wrong string */
553 if (s && RX_MATCH_COPIED(prog))
7ef91622 554 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
555 if (data)
556 *data->scream_olds = s;
557 }
f33976b4 558 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
559 s = fbm_instr((U8*)(s + start_shift),
560 (U8*)(strend - end_shift),
561 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 562 else
1aa99e6b
IH
563 s = fbm_instr(HOP3(s, start_shift, strend),
564 HOP3(strend, -end_shift, strbeg),
2c2d71f5 565 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
566
567 /* Update the count-of-usability, remove useless subpatterns,
568 unshift s. */
2c2d71f5
JH
569
570 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
571 (s ? "Found" : "Did not find"),
33b8afdf 572 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 573 PL_colors[0],
7b0972df
JH
574 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
575 SvPVX(check),
2c2d71f5
JH
576 PL_colors[1], (SvTAIL(check) ? "$" : ""),
577 (s ? " at offset " : "...\n") ) );
578
579 if (!s)
580 goto fail_finish;
581
6eb5f6b9
JH
582 check_at = s;
583
2c2d71f5 584 /* Finish the diagnostic message */
30944b6d 585 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
586
587 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
588 Start with the other substr.
589 XXXX no SCREAM optimization yet - and a very coarse implementation
590 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
591 *always* match. Probably should be marked during compile...
592 Probably it is right to do no SCREAM here...
593 */
594
33b8afdf 595 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 596 /* Take into account the "other" substring. */
2c2d71f5
JH
597 /* XXXX May be hopelessly wrong for UTF... */
598 if (!other_last)
6eb5f6b9 599 other_last = strpos;
33b8afdf 600 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
601 do_other_anchored:
602 {
1aa99e6b 603 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 604 char *s1 = s;
33b8afdf 605 SV* must;
2c2d71f5 606
2c2d71f5
JH
607 t = s - prog->check_offset_max;
608 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 609 && (!do_utf8
1aa99e6b 610 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 611 && t > strpos)))
30944b6d 612 /* EMPTY */;
2c2d71f5
JH
613 else
614 t = strpos;
1aa99e6b 615 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
616 if (t < other_last) /* These positions already checked */
617 t = other_last;
1aa99e6b 618 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
619 if (last < last1)
620 last1 = last;
621 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
622 /* On end-of-str: see comment below. */
33b8afdf
JH
623 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
624 if (must == &PL_sv_undef) {
625 s = (char*)NULL;
626 DEBUG_r(must = prog->anchored_utf8); /* for debug */
627 }
628 else
629 s = fbm_instr(
630 (unsigned char*)t,
631 HOP3(HOP3(last1, prog->anchored_offset, strend)
632 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
633 must,
634 PL_multiline ? FBMrf_MULTILINE : 0
635 );
1aa99e6b
IH
636 DEBUG_r(PerlIO_printf(Perl_debug_log,
637 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
638 (s ? "Found" : "Contradicts"),
639 PL_colors[0],
33b8afdf
JH
640 (int)(SvCUR(must)
641 - (SvTAIL(must)!=0)),
642 SvPVX(must),
643 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
644 if (!s) {
645 if (last1 >= last2) {
646 DEBUG_r(PerlIO_printf(Perl_debug_log,
647 ", giving up...\n"));
648 goto fail_finish;
649 }
650 DEBUG_r(PerlIO_printf(Perl_debug_log,
651 ", trying floating at offset %ld...\n",
1aa99e6b
IH
652 (long)(HOP3c(s1, 1, strend) - i_strpos)));
653 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
654 s = HOP3c(last, 1, strend);
2c2d71f5
JH
655 goto restart;
656 }
657 else {
658 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 659 (long)(s - i_strpos)));
1aa99e6b
IH
660 t = HOP3c(s, -prog->anchored_offset, strbeg);
661 other_last = HOP3c(s, 1, strend);
30944b6d 662 s = s1;
2c2d71f5
JH
663 if (t == strpos)
664 goto try_at_start;
2c2d71f5
JH
665 goto try_at_offset;
666 }
30944b6d 667 }
2c2d71f5
JH
668 }
669 else { /* Take into account the floating substring. */
33b8afdf
JH
670 char *last, *last1;
671 char *s1 = s;
672 SV* must;
673
674 t = HOP3c(s, -start_shift, strbeg);
675 last1 = last =
676 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
677 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
678 last = HOP3c(t, prog->float_max_offset, strend);
679 s = HOP3c(t, prog->float_min_offset, strend);
680 if (s < other_last)
681 s = other_last;
2c2d71f5 682 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
683 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
684 /* fbm_instr() takes into account exact value of end-of-str
685 if the check is SvTAIL(ed). Since false positives are OK,
686 and end-of-str is not later than strend we are OK. */
687 if (must == &PL_sv_undef) {
688 s = (char*)NULL;
689 DEBUG_r(must = prog->float_utf8); /* for debug message */
690 }
691 else
2c2d71f5 692 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
693 (unsigned char*)last + SvCUR(must)
694 - (SvTAIL(must)!=0),
695 must, PL_multiline ? FBMrf_MULTILINE : 0);
696 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
697 (s ? "Found" : "Contradicts"),
698 PL_colors[0],
699 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
700 SvPVX(must),
701 PL_colors[1], (SvTAIL(must) ? "$" : "")));
702 if (!s) {
703 if (last1 == last) {
2c2d71f5 704 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
705 ", giving up...\n"));
706 goto fail_finish;
2c2d71f5 707 }
33b8afdf
JH
708 DEBUG_r(PerlIO_printf(Perl_debug_log,
709 ", trying anchored starting at offset %ld...\n",
710 (long)(s1 + 1 - i_strpos)));
711 other_last = last;
712 s = HOP3c(t, 1, strend);
713 goto restart;
714 }
715 else {
716 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
717 (long)(s - i_strpos)));
718 other_last = s; /* Fix this later. --Hugo */
719 s = s1;
720 if (t == strpos)
721 goto try_at_start;
722 goto try_at_offset;
723 }
2c2d71f5 724 }
cad2e5aa 725 }
2c2d71f5
JH
726
727 t = s - prog->check_offset_max;
2c2d71f5 728 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 729 && (!do_utf8
1aa99e6b
IH
730 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
731 && t > strpos))) {
2c2d71f5
JH
732 /* Fixed substring is found far enough so that the match
733 cannot start at strpos. */
734 try_at_offset:
cad2e5aa 735 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
736 /* Eventually fbm_*() should handle this, but often
737 anchored_offset is not 0, so this check will not be wasted. */
738 /* XXXX In the code below we prefer to look for "^" even in
739 presence of anchored substrings. And we search even
740 beyond the found float position. These pessimizations
741 are historical artefacts only. */
742 find_anchor:
2c2d71f5 743 while (t < strend - prog->minlen) {
cad2e5aa 744 if (*t == '\n') {
4ee3650e 745 if (t < check_at - prog->check_offset_min) {
33b8afdf 746 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
747 /* Since we moved from the found position,
748 we definitely contradict the found anchored
30944b6d
IZ
749 substr. Due to the above check we do not
750 contradict "check" substr.
751 Thus we can arrive here only if check substr
752 is float. Redo checking for "other"=="fixed".
753 */
9041c2e3 754 strpos = t + 1;
30944b6d
IZ
755 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
756 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
757 goto do_other_anchored;
758 }
4ee3650e
GS
759 /* We don't contradict the found floating substring. */
760 /* XXXX Why not check for STCLASS? */
cad2e5aa 761 s = t + 1;
2c2d71f5 762 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 763 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
764 goto set_useful;
765 }
4ee3650e
GS
766 /* Position contradicts check-string */
767 /* XXXX probably better to look for check-string
768 than for "\n", so one should lower the limit for t? */
769 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 770 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 771 other_last = strpos = s = t + 1;
cad2e5aa
JH
772 goto restart;
773 }
774 t++;
775 }
2c2d71f5
JH
776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
777 PL_colors[0],PL_colors[1]));
778 goto fail_finish;
cad2e5aa 779 }
f5952150
GS
780 else {
781 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
782 PL_colors[0],PL_colors[1]));
783 }
cad2e5aa
JH
784 s = t;
785 set_useful:
33b8afdf 786 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
787 }
788 else {
f5952150 789 /* The found string does not prohibit matching at strpos,
2c2d71f5 790 - no optimization of calling REx engine can be performed,
f5952150
GS
791 unless it was an MBOL and we are not after MBOL,
792 or a future STCLASS check will fail this. */
2c2d71f5
JH
793 try_at_start:
794 /* Even in this situation we may use MBOL flag if strpos is offset
795 wrt the start of the string. */
05b4157f 796 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 797 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
798 /* May be due to an implicit anchor of m{.*foo} */
799 && !(prog->reganch & ROPT_IMPLICIT))
800 {
cad2e5aa
JH
801 t = strpos;
802 goto find_anchor;
803 }
30944b6d 804 DEBUG_r( if (ml_anch)
f5952150
GS
805 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
806 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 807 );
2c2d71f5 808 success_at_start:
30944b6d 809 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
810 && (do_utf8 ? (
811 prog->check_utf8 /* Could be deleted already */
812 && --BmUSEFUL(prog->check_utf8) < 0
813 && (prog->check_utf8 == prog->float_utf8)
814 ) : (
815 prog->check_substr /* Could be deleted already */
816 && --BmUSEFUL(prog->check_substr) < 0
817 && (prog->check_substr == prog->float_substr)
818 )))
66e933ab 819 {
cad2e5aa 820 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 821 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
822 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
823 if (do_utf8 ? prog->check_substr : prog->check_utf8)
824 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
825 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
826 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 827 check = Nullsv; /* abort */
cad2e5aa 828 s = strpos;
3cf5c195
IZ
829 /* XXXX This is a remnant of the old implementation. It
830 looks wasteful, since now INTUIT can use many
6eb5f6b9 831 other heuristics. */
cad2e5aa
JH
832 prog->reganch &= ~RE_USE_INTUIT;
833 }
834 else
835 s = strpos;
836 }
837
6eb5f6b9
JH
838 /* Last resort... */
839 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
840 if (prog->regstclass) {
841 /* minlen == 0 is possible if regstclass is \b or \B,
842 and the fixed substr is ''$.
843 Since minlen is already taken into account, s+1 is before strend;
844 accidentally, minlen >= 1 guaranties no false positives at s + 1
845 even for \b or \B. But (minlen? 1 : 0) below assumes that
846 regstclass does not come from lookahead... */
847 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
848 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 849 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 850 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 851 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 852 : 1);
33b8afdf 853 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 854 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 855 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
856 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
857 cl_l, strend)
858 : strend);
a1933d95 859 char *startpos = strbeg;
6eb5f6b9
JH
860
861 t = s;
9926ca43 862 cache_re(prog);
f33976b4 863 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
864 if (!s) {
865#ifdef DEBUGGING
b7953727 866 char *what = 0;
6eb5f6b9
JH
867#endif
868 if (endpos == strend) {
869 DEBUG_r( PerlIO_printf(Perl_debug_log,
870 "Could not match STCLASS...\n") );
871 goto fail;
872 }
66e933ab
GS
873 DEBUG_r( PerlIO_printf(Perl_debug_log,
874 "This position contradicts STCLASS...\n") );
653099ff
GS
875 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
876 goto fail;
6eb5f6b9 877 /* Contradict one of substrings */
33b8afdf
JH
878 if (prog->anchored_substr || prog->anchored_utf8) {
879 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
880 DEBUG_r( what = "anchored" );
881 hop_and_restart:
1aa99e6b 882 s = HOP3c(t, 1, strend);
66e933ab
GS
883 if (s + start_shift + end_shift > strend) {
884 /* XXXX Should be taken into account earlier? */
885 DEBUG_r( PerlIO_printf(Perl_debug_log,
886 "Could not match STCLASS...\n") );
887 goto fail;
888 }
5e39e1e5
HS
889 if (!check)
890 goto giveup;
6eb5f6b9 891 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 892 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
893 what, (long)(s + start_shift - i_strpos)) );
894 goto restart;
895 }
66e933ab 896 /* Have both, check_string is floating */
6eb5f6b9
JH
897 if (t + start_shift >= check_at) /* Contradicts floating=check */
898 goto retry_floating_check;
899 /* Recheck anchored substring, but not floating... */
9041c2e3 900 s = check_at;
5e39e1e5
HS
901 if (!check)
902 goto giveup;
6eb5f6b9 903 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 904 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
905 (long)(other_last - i_strpos)) );
906 goto do_other_anchored;
907 }
60e71179
GS
908 /* Another way we could have checked stclass at the
909 current position only: */
910 if (ml_anch) {
911 s = t = t + 1;
5e39e1e5
HS
912 if (!check)
913 goto giveup;
60e71179 914 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
915 "Looking for /%s^%s/m starting at offset %ld...\n",
916 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 917 goto try_at_offset;
66e933ab 918 }
33b8afdf 919 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 920 goto fail;
6eb5f6b9
JH
921 /* Check is floating subtring. */
922 retry_floating_check:
923 t = check_at - start_shift;
924 DEBUG_r( what = "floating" );
925 goto hop_and_restart;
926 }
b7953727
JH
927 if (t != s) {
928 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 929 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
930 (long)(t - i_strpos), (long)(s - i_strpos))
931 );
932 }
933 else {
934 DEBUG_r(PerlIO_printf(Perl_debug_log,
935 "Does not contradict STCLASS...\n");
936 );
937 }
6eb5f6b9 938 }
5e39e1e5
HS
939 giveup:
940 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
941 PL_colors[4], (check ? "Guessed" : "Giving up"),
942 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 943 return s;
2c2d71f5
JH
944
945 fail_finish: /* Substring not found */
33b8afdf
JH
946 if (prog->check_substr || prog->check_utf8) /* could be removed already */
947 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 948 fail:
2c2d71f5 949 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
950 PL_colors[4],PL_colors[5]));
951 return Nullch;
952}
9661b544 953
6eb5f6b9 954/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
955STATIC char *
956S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 957{
6eb5f6b9
JH
958 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
959 char *m;
d8093b23 960 STRLEN ln;
5dab1207 961 STRLEN lnc;
078c425b 962 register STRLEN uskip;
d8093b23
G
963 unsigned int c1;
964 unsigned int c2;
6eb5f6b9
JH
965 char *e;
966 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 967 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 968
6eb5f6b9
JH
969 /* We know what class it must start with. */
970 switch (OP(c)) {
6eb5f6b9 971 case ANYOF:
388cc4de 972 if (do_utf8) {
078c425b 973 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
974 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
975 !UTF8_IS_INVARIANT((U8)s[0]) ?
976 reginclass(c, (U8*)s, 0, do_utf8) :
977 REGINCLASS(c, (U8*)s)) {
978 if (tmp && (norun || regtry(prog, s)))
979 goto got_it;
980 else
981 tmp = doevery;
982 }
983 else
984 tmp = 1;
078c425b 985 s += uskip;
388cc4de
HS
986 }
987 }
988 else {
989 while (s < strend) {
990 STRLEN skip = 1;
991
992 if (REGINCLASS(c, (U8*)s) ||
993 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
994 /* The assignment of 2 is intentional:
995 * for the folded sharp s, the skip is 2. */
996 (skip = SHARP_S_SKIP))) {
997 if (tmp && (norun || regtry(prog, s)))
998 goto got_it;
999 else
1000 tmp = doevery;
1001 }
1002 else
1003 tmp = 1;
1004 s += skip;
1005 }
a0d0e21e 1006 }
6eb5f6b9 1007 break;
f33976b4
DB
1008 case CANY:
1009 while (s < strend) {
1010 if (tmp && (norun || regtry(prog, s)))
1011 goto got_it;
1012 else
1013 tmp = doevery;
1014 s++;
1015 }
1016 break;
6eb5f6b9 1017 case EXACTF:
5dab1207
NIS
1018 m = STRING(c);
1019 ln = STR_LEN(c); /* length to match in octets/bytes */
1020 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1021 if (UTF) {
a2a2844f 1022 STRLEN ulen1, ulen2;
5dab1207 1023 U8 *sm = (U8 *) m;
e7ae6809
JH
1024 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1025 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1026
1027 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1028 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1029
872c91ae
JH
1030 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1031 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1032 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1033 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1034 lnc = 0;
1035 while (sm < ((U8 *) m + ln)) {
1036 lnc++;
1037 sm += UTF8SKIP(sm);
1038 }
1aa99e6b
IH
1039 }
1040 else {
1041 c1 = *(U8*)m;
1042 c2 = PL_fold[c1];
1043 }
6eb5f6b9
JH
1044 goto do_exactf;
1045 case EXACTFL:
5dab1207
NIS
1046 m = STRING(c);
1047 ln = STR_LEN(c);
1048 lnc = (I32) ln;
d8093b23 1049 c1 = *(U8*)m;
6eb5f6b9
JH
1050 c2 = PL_fold_locale[c1];
1051 do_exactf:
db12adc6 1052 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1053
6eb5f6b9
JH
1054 if (norun && e < s)
1055 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1056
60a8b682
JH
1057 /* The idea in the EXACTF* cases is to first find the
1058 * first character of the EXACTF* node and then, if
1059 * necessary, case-insensitively compare the full
1060 * text of the node. The c1 and c2 are the first
1061 * characters (though in Unicode it gets a bit
1062 * more complicated because there are more cases
7f16dd3d
JH
1063 * than just upper and lower: one needs to use
1064 * the so-called folding case for case-insensitive
1065 * matching (called "loose matching" in Unicode).
1066 * ibcmp_utf8() will do just that. */
60a8b682 1067
1aa99e6b 1068 if (do_utf8) {
575cac57
JH
1069 UV c, f;
1070 U8 tmpbuf [UTF8_MAXLEN+1];
1071 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1072 STRLEN len, foldlen;
d7f013c8 1073
09091399 1074 if (c1 == c2) {
5dab1207
NIS
1075 /* Upper and lower of 1st char are equal -
1076 * probably not a "letter". */
1aa99e6b 1077 while (s <= e) {
872c91ae
JH
1078 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1079 ckWARN(WARN_UTF8) ?
1080 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1081 if ( c == c1
1082 && (ln == len ||
66423254 1083 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1084 m, (char **)0, ln, (bool)UTF))
55da9344 1085 && (norun || regtry(prog, s)) )
1aa99e6b 1086 goto got_it;
80aecb99
JH
1087 else {
1088 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1089 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1090 if ( f != c
1091 && (f == c1 || f == c2)
1092 && (ln == foldlen ||
66423254
JH
1093 !ibcmp_utf8((char *) foldbuf,
1094 (char **)0, foldlen, do_utf8,
d07ddd77 1095 m,
eb160463 1096 (char **)0, ln, (bool)UTF))
80aecb99
JH
1097 && (norun || regtry(prog, s)) )
1098 goto got_it;
1099 }
1aa99e6b
IH
1100 s += len;
1101 }
09091399
JH
1102 }
1103 else {
1aa99e6b 1104 while (s <= e) {
872c91ae
JH
1105 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1106 ckWARN(WARN_UTF8) ?
1107 0 : UTF8_ALLOW_ANY);
80aecb99 1108
60a8b682 1109 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1110 * Note that not all the possible combinations
1111 * are handled here: some of them are handled
1112 * by the standard folding rules, and some of
1113 * them (the character class or ANYOF cases)
1114 * are handled during compiletime in
1115 * regexec.c:S_regclass(). */
880bd946
JH
1116 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1117 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1118 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1119
1120 if ( (c == c1 || c == c2)
1121 && (ln == len ||
66423254 1122 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1123 m, (char **)0, ln, (bool)UTF))
55da9344 1124 && (norun || regtry(prog, s)) )
1aa99e6b 1125 goto got_it;
80aecb99
JH
1126 else {
1127 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1128 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1129 if ( f != c
1130 && (f == c1 || f == c2)
1131 && (ln == foldlen ||
a6872d42 1132 !ibcmp_utf8((char *) foldbuf,
66423254 1133 (char **)0, foldlen, do_utf8,
d07ddd77 1134 m,
eb160463 1135 (char **)0, ln, (bool)UTF))
80aecb99
JH
1136 && (norun || regtry(prog, s)) )
1137 goto got_it;
1138 }
1aa99e6b
IH
1139 s += len;
1140 }
09091399 1141 }
1aa99e6b
IH
1142 }
1143 else {
1144 if (c1 == c2)
1145 while (s <= e) {
1146 if ( *(U8*)s == c1
1147 && (ln == 1 || !(OP(c) == EXACTF
1148 ? ibcmp(s, m, ln)
1149 : ibcmp_locale(s, m, ln)))
1150 && (norun || regtry(prog, s)) )
1151 goto got_it;
1152 s++;
1153 }
1154 else
1155 while (s <= e) {
1156 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1157 && (ln == 1 || !(OP(c) == EXACTF
1158 ? ibcmp(s, m, ln)
1159 : ibcmp_locale(s, m, ln)))
1160 && (norun || regtry(prog, s)) )
1161 goto got_it;
1162 s++;
1163 }
b3c9acc1
IZ
1164 }
1165 break;
bbce6d69 1166 case BOUNDL:
3280af22 1167 PL_reg_flags |= RF_tainted;
bbce6d69 1168 /* FALL THROUGH */
a0d0e21e 1169 case BOUND:
ffc61ed2 1170 if (do_utf8) {
12d33761 1171 if (s == PL_bostr)
ffc61ed2
JH
1172 tmp = '\n';
1173 else {
b4f7163a 1174 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1175
b4f7163a 1176 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1177 }
1178 tmp = ((OP(c) == BOUND ?
9041c2e3 1179 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1180 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1181 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1182 if (tmp == !(OP(c) == BOUND ?
3568d838 1183 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1184 isALNUM_LC_utf8((U8*)s)))
1185 {
1186 tmp = !tmp;
1187 if ((norun || regtry(prog, s)))
1188 goto got_it;
1189 }
078c425b 1190 s += uskip;
a687059c 1191 }
a0d0e21e 1192 }
667bb95a 1193 else {
12d33761 1194 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1195 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1196 while (s < strend) {
1197 if (tmp ==
1198 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1199 tmp = !tmp;
1200 if ((norun || regtry(prog, s)))
1201 goto got_it;
1202 }
1203 s++;
a0ed51b3 1204 }
a0ed51b3 1205 }
6eb5f6b9 1206 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1207 goto got_it;
1208 break;
bbce6d69 1209 case NBOUNDL:
3280af22 1210 PL_reg_flags |= RF_tainted;
bbce6d69 1211 /* FALL THROUGH */
a0d0e21e 1212 case NBOUND:
ffc61ed2 1213 if (do_utf8) {
12d33761 1214 if (s == PL_bostr)
ffc61ed2
JH
1215 tmp = '\n';
1216 else {
b4f7163a 1217 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1218
b4f7163a 1219 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1220 }
1221 tmp = ((OP(c) == NBOUND ?
9041c2e3 1222 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1223 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1224 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1225 if (tmp == !(OP(c) == NBOUND ?
3568d838 1226 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1227 isALNUM_LC_utf8((U8*)s)))
1228 tmp = !tmp;
1229 else if ((norun || regtry(prog, s)))
1230 goto got_it;
078c425b 1231 s += uskip;
ffc61ed2 1232 }
a0d0e21e 1233 }
667bb95a 1234 else {
12d33761 1235 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1236 tmp = ((OP(c) == NBOUND ?
1237 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1238 while (s < strend) {
1239 if (tmp ==
1240 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1241 tmp = !tmp;
1242 else if ((norun || regtry(prog, s)))
1243 goto got_it;
1244 s++;
1245 }
a0ed51b3 1246 }
6eb5f6b9 1247 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1248 goto got_it;
1249 break;
a0d0e21e 1250 case ALNUM:
ffc61ed2 1251 if (do_utf8) {
8269fa76 1252 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1253 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1254 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1255 if (tmp && (norun || regtry(prog, s)))
1256 goto got_it;
1257 else
1258 tmp = doevery;
1259 }
bbce6d69 1260 else
ffc61ed2 1261 tmp = 1;
078c425b 1262 s += uskip;
bbce6d69 1263 }
bbce6d69 1264 }
ffc61ed2
JH
1265 else {
1266 while (s < strend) {
1267 if (isALNUM(*s)) {
1268 if (tmp && (norun || regtry(prog, s)))
1269 goto got_it;
1270 else
1271 tmp = doevery;
1272 }
a0ed51b3 1273 else
ffc61ed2
JH
1274 tmp = 1;
1275 s++;
a0ed51b3 1276 }
a0ed51b3
LW
1277 }
1278 break;
bbce6d69 1279 case ALNUML:
3280af22 1280 PL_reg_flags |= RF_tainted;
ffc61ed2 1281 if (do_utf8) {
078c425b 1282 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1283 if (isALNUM_LC_utf8((U8*)s)) {
1284 if (tmp && (norun || regtry(prog, s)))
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
a687059c 1289 else
ffc61ed2 1290 tmp = 1;
078c425b 1291 s += uskip;
a0d0e21e 1292 }
a0d0e21e 1293 }
ffc61ed2
JH
1294 else {
1295 while (s < strend) {
1296 if (isALNUM_LC(*s)) {
1297 if (tmp && (norun || regtry(prog, s)))
1298 goto got_it;
1299 else
1300 tmp = doevery;
1301 }
a0ed51b3 1302 else
ffc61ed2
JH
1303 tmp = 1;
1304 s++;
a0ed51b3 1305 }
a0ed51b3
LW
1306 }
1307 break;
a0d0e21e 1308 case NALNUM:
ffc61ed2 1309 if (do_utf8) {
8269fa76 1310 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1311 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1312 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1313 if (tmp && (norun || regtry(prog, s)))
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
bbce6d69 1318 else
ffc61ed2 1319 tmp = 1;
078c425b 1320 s += uskip;
bbce6d69 1321 }
bbce6d69 1322 }
ffc61ed2
JH
1323 else {
1324 while (s < strend) {
1325 if (!isALNUM(*s)) {
1326 if (tmp && (norun || regtry(prog, s)))
1327 goto got_it;
1328 else
1329 tmp = doevery;
1330 }
a0ed51b3 1331 else
ffc61ed2
JH
1332 tmp = 1;
1333 s++;
a0ed51b3 1334 }
a0ed51b3
LW
1335 }
1336 break;
bbce6d69 1337 case NALNUML:
3280af22 1338 PL_reg_flags |= RF_tainted;
ffc61ed2 1339 if (do_utf8) {
078c425b 1340 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1341 if (!isALNUM_LC_utf8((U8*)s)) {
1342 if (tmp && (norun || regtry(prog, s)))
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a687059c 1347 else
ffc61ed2 1348 tmp = 1;
078c425b 1349 s += uskip;
a687059c 1350 }
a0d0e21e 1351 }
ffc61ed2
JH
1352 else {
1353 while (s < strend) {
1354 if (!isALNUM_LC(*s)) {
1355 if (tmp && (norun || regtry(prog, s)))
1356 goto got_it;
1357 else
1358 tmp = doevery;
1359 }
a0ed51b3 1360 else
ffc61ed2
JH
1361 tmp = 1;
1362 s++;
a0ed51b3 1363 }
a0ed51b3
LW
1364 }
1365 break;
a0d0e21e 1366 case SPACE:
ffc61ed2 1367 if (do_utf8) {
8269fa76 1368 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1369 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1370 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1371 if (tmp && (norun || regtry(prog, s)))
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
a0d0e21e 1376 else
ffc61ed2 1377 tmp = 1;
078c425b 1378 s += uskip;
2304df62 1379 }
a0d0e21e 1380 }
ffc61ed2
JH
1381 else {
1382 while (s < strend) {
1383 if (isSPACE(*s)) {
1384 if (tmp && (norun || regtry(prog, s)))
1385 goto got_it;
1386 else
1387 tmp = doevery;
1388 }
a0ed51b3 1389 else
ffc61ed2
JH
1390 tmp = 1;
1391 s++;
a0ed51b3 1392 }
a0ed51b3
LW
1393 }
1394 break;
bbce6d69 1395 case SPACEL:
3280af22 1396 PL_reg_flags |= RF_tainted;
ffc61ed2 1397 if (do_utf8) {
078c425b 1398 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1399 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1400 if (tmp && (norun || regtry(prog, s)))
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
bbce6d69 1405 else
ffc61ed2 1406 tmp = 1;
078c425b 1407 s += uskip;
bbce6d69 1408 }
bbce6d69 1409 }
ffc61ed2
JH
1410 else {
1411 while (s < strend) {
1412 if (isSPACE_LC(*s)) {
1413 if (tmp && (norun || regtry(prog, s)))
1414 goto got_it;
1415 else
1416 tmp = doevery;
1417 }
a0ed51b3 1418 else
ffc61ed2
JH
1419 tmp = 1;
1420 s++;
a0ed51b3 1421 }
a0ed51b3
LW
1422 }
1423 break;
a0d0e21e 1424 case NSPACE:
ffc61ed2 1425 if (do_utf8) {
8269fa76 1426 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1427 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1428 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1429 if (tmp && (norun || regtry(prog, s)))
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
a0d0e21e 1434 else
ffc61ed2 1435 tmp = 1;
078c425b 1436 s += uskip;
a687059c 1437 }
a0d0e21e 1438 }
ffc61ed2
JH
1439 else {
1440 while (s < strend) {
1441 if (!isSPACE(*s)) {
1442 if (tmp && (norun || regtry(prog, s)))
1443 goto got_it;
1444 else
1445 tmp = doevery;
1446 }
a0ed51b3 1447 else
ffc61ed2
JH
1448 tmp = 1;
1449 s++;
a0ed51b3 1450 }
a0ed51b3
LW
1451 }
1452 break;
bbce6d69 1453 case NSPACEL:
3280af22 1454 PL_reg_flags |= RF_tainted;
ffc61ed2 1455 if (do_utf8) {
078c425b 1456 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1457 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1458 if (tmp && (norun || regtry(prog, s)))
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
bbce6d69 1463 else
ffc61ed2 1464 tmp = 1;
078c425b 1465 s += uskip;
bbce6d69 1466 }
bbce6d69 1467 }
ffc61ed2
JH
1468 else {
1469 while (s < strend) {
1470 if (!isSPACE_LC(*s)) {
1471 if (tmp && (norun || regtry(prog, s)))
1472 goto got_it;
1473 else
1474 tmp = doevery;
1475 }
a0ed51b3 1476 else
ffc61ed2
JH
1477 tmp = 1;
1478 s++;
a0ed51b3 1479 }
a0ed51b3
LW
1480 }
1481 break;
a0d0e21e 1482 case DIGIT:
ffc61ed2 1483 if (do_utf8) {
8269fa76 1484 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1485 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1486 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1487 if (tmp && (norun || regtry(prog, s)))
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
a0d0e21e 1492 else
ffc61ed2 1493 tmp = 1;
078c425b 1494 s += uskip;
2b69d0c2 1495 }
a0d0e21e 1496 }
ffc61ed2
JH
1497 else {
1498 while (s < strend) {
1499 if (isDIGIT(*s)) {
1500 if (tmp && (norun || regtry(prog, s)))
1501 goto got_it;
1502 else
1503 tmp = doevery;
1504 }
a0ed51b3 1505 else
ffc61ed2
JH
1506 tmp = 1;
1507 s++;
a0ed51b3 1508 }
a0ed51b3
LW
1509 }
1510 break;
b8c5462f
JH
1511 case DIGITL:
1512 PL_reg_flags |= RF_tainted;
ffc61ed2 1513 if (do_utf8) {
078c425b 1514 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1515 if (isDIGIT_LC_utf8((U8*)s)) {
1516 if (tmp && (norun || regtry(prog, s)))
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
b8c5462f 1521 else
ffc61ed2 1522 tmp = 1;
078c425b 1523 s += uskip;
b8c5462f 1524 }
b8c5462f 1525 }
ffc61ed2
JH
1526 else {
1527 while (s < strend) {
1528 if (isDIGIT_LC(*s)) {
1529 if (tmp && (norun || regtry(prog, s)))
1530 goto got_it;
1531 else
1532 tmp = doevery;
1533 }
b8c5462f 1534 else
ffc61ed2
JH
1535 tmp = 1;
1536 s++;
b8c5462f 1537 }
b8c5462f
JH
1538 }
1539 break;
a0d0e21e 1540 case NDIGIT:
ffc61ed2 1541 if (do_utf8) {
8269fa76 1542 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1543 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1544 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1545 if (tmp && (norun || regtry(prog, s)))
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
a0d0e21e 1550 else
ffc61ed2 1551 tmp = 1;
078c425b 1552 s += uskip;
a687059c 1553 }
a0d0e21e 1554 }
ffc61ed2
JH
1555 else {
1556 while (s < strend) {
1557 if (!isDIGIT(*s)) {
1558 if (tmp && (norun || regtry(prog, s)))
1559 goto got_it;
1560 else
1561 tmp = doevery;
1562 }
a0ed51b3 1563 else
ffc61ed2
JH
1564 tmp = 1;
1565 s++;
a0ed51b3 1566 }
a0ed51b3
LW
1567 }
1568 break;
b8c5462f
JH
1569 case NDIGITL:
1570 PL_reg_flags |= RF_tainted;
ffc61ed2 1571 if (do_utf8) {
078c425b 1572 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1573 if (!isDIGIT_LC_utf8((U8*)s)) {
1574 if (tmp && (norun || regtry(prog, s)))
1575 goto got_it;
1576 else
1577 tmp = doevery;
1578 }
b8c5462f 1579 else
ffc61ed2 1580 tmp = 1;
078c425b 1581 s += uskip;
b8c5462f 1582 }
a0ed51b3 1583 }
ffc61ed2
JH
1584 else {
1585 while (s < strend) {
1586 if (!isDIGIT_LC(*s)) {
1587 if (tmp && (norun || regtry(prog, s)))
1588 goto got_it;
1589 else
1590 tmp = doevery;
1591 }
cf93c79d 1592 else
ffc61ed2
JH
1593 tmp = 1;
1594 s++;
b8c5462f 1595 }
b8c5462f
JH
1596 }
1597 break;
b3c9acc1 1598 default:
3c3eec57
GS
1599 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1600 break;
d6a28714 1601 }
6eb5f6b9
JH
1602 return 0;
1603 got_it:
1604 return s;
1605}
1606
1607/*
1608 - regexec_flags - match a regexp against a string
1609 */
1610I32
1611Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1612 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1613/* strend: pointer to null at end of string */
1614/* strbeg: real beginning of string */
1615/* minend: end of match must be >=minend after stringarg. */
1616/* data: May be used for some additional optimizations. */
1617/* nosave: For optimizations. */
1618{
6eb5f6b9
JH
1619 register char *s;
1620 register regnode *c;
1621 register char *startpos = stringarg;
6eb5f6b9
JH
1622 I32 minlen; /* must match at least this many chars */
1623 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1624 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1625 constant substr. */ /* CC */
1626 I32 end_shift = 0; /* Same for the end. */ /* CC */
1627 I32 scream_pos = -1; /* Internal iterator of scream. */
1628 char *scream_olds;
1629 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1630 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1631#ifdef DEBUGGING
9e55ce06
JH
1632 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1633 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1634#endif
a30b2f1f 1635 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1636
1637 PL_regcc = 0;
1638
1639 cache_re(prog);
1640#ifdef DEBUGGING
aea4f609 1641 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1642#endif
1643
1644 /* Be paranoid... */
1645 if (prog == NULL || startpos == NULL) {
1646 Perl_croak(aTHX_ "NULL regexp parameter");
1647 return 0;
1648 }
1649
1650 minlen = prog->minlen;
61a36c01 1651 if (strend - startpos < minlen) {
a72c7584
JH
1652 DEBUG_r(PerlIO_printf(Perl_debug_log,
1653 "String too short [regexec_flags]...\n"));
1654 goto phooey;
1aa99e6b 1655 }
6eb5f6b9 1656
6eb5f6b9
JH
1657 /* Check validity of program. */
1658 if (UCHARAT(prog->program) != REG_MAGIC) {
1659 Perl_croak(aTHX_ "corrupted regexp program");
1660 }
1661
1662 PL_reg_flags = 0;
1663 PL_reg_eval_set = 0;
1664 PL_reg_maxiter = 0;
1665
1666 if (prog->reganch & ROPT_UTF8)
1667 PL_reg_flags |= RF_utf8;
1668
1669 /* Mark beginning of line for ^ and lookbehind. */
1670 PL_regbol = startpos;
1671 PL_bostr = strbeg;
1672 PL_reg_sv = sv;
1673
1674 /* Mark end of line for $ (and such) */
1675 PL_regeol = strend;
1676
1677 /* see how far we have to get to not match where we matched before */
1678 PL_regtill = startpos+minend;
1679
1680 /* We start without call_cc context. */
1681 PL_reg_call_cc = 0;
1682
1683 /* If there is a "must appear" string, look for it. */
1684 s = startpos;
1685
1686 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1687 MAGIC *mg;
1688
1689 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1690 PL_reg_ganch = startpos;
1691 else if (sv && SvTYPE(sv) >= SVt_PVMG
1692 && SvMAGIC(sv)
14befaf4
DM
1693 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1694 && mg->mg_len >= 0) {
6eb5f6b9
JH
1695 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1696 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1697 if (s > PL_reg_ganch)
6eb5f6b9
JH
1698 goto phooey;
1699 s = PL_reg_ganch;
1700 }
1701 }
1702 else /* pos() not defined */
1703 PL_reg_ganch = strbeg;
1704 }
1705
33b8afdf 1706 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1707 re_scream_pos_data d;
1708
1709 d.scream_olds = &scream_olds;
1710 d.scream_pos = &scream_pos;
1711 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1712 if (!s) {
1713 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1714 goto phooey; /* not present */
3fa9c3d7 1715 }
6eb5f6b9
JH
1716 }
1717
2a782b5b 1718 DEBUG_r({
9e55ce06
JH
1719 char *s0 = UTF ?
1720 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1721 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1722 prog->precomp;
1723 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1724 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1725 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1726 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1727 if (!PL_colorset)
1728 reginitcolors();
1729 PerlIO_printf(Perl_debug_log,
9e55ce06 1730 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1731 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1732 len0, len0, s0,
2a782b5b 1733 PL_colors[1],
9e55ce06 1734 len0 > 60 ? "..." : "",
2a782b5b 1735 PL_colors[0],
9e55ce06
JH
1736 (int)(len1 > 60 ? 60 : len1),
1737 s1, PL_colors[1],
1738 (len1 > 60 ? "..." : "")
2a782b5b
JH
1739 );
1740 });
6eb5f6b9
JH
1741
1742 /* Simplest case: anchored match need be tried only once. */
1743 /* [unless only anchor is BOL and multiline is set] */
1744 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1745 if (s == startpos && regtry(prog, startpos))
1746 goto got_it;
1747 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1748 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1749 {
1750 char *end;
1751
1752 if (minlen)
1753 dontbother = minlen - 1;
1aa99e6b 1754 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1755 /* for multiline we only have to try after newlines */
33b8afdf 1756 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1757 if (s == startpos)
1758 goto after_try;
1759 while (1) {
1760 if (regtry(prog, s))
1761 goto got_it;
1762 after_try:
1763 if (s >= end)
1764 goto phooey;
1765 if (prog->reganch & RE_USE_INTUIT) {
1766 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1767 if (!s)
1768 goto phooey;
1769 }
1770 else
1771 s++;
1772 }
1773 } else {
1774 if (s > startpos)
1775 s--;
1776 while (s < end) {
1777 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1778 if (regtry(prog, s))
1779 goto got_it;
1780 }
1781 }
1782 }
1783 }
1784 goto phooey;
1785 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1786 if (regtry(prog, PL_reg_ganch))
1787 goto got_it;
1788 goto phooey;
1789 }
1790
1791 /* Messy cases: unanchored match. */
33b8afdf 1792 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1793 /* we have /x+whatever/ */
1794 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1795 char ch;
bf93d4cc
GS
1796#ifdef DEBUGGING
1797 int did_match = 0;
1798#endif
33b8afdf
JH
1799 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1800 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1801 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1802
1aa99e6b 1803 if (do_utf8) {
6eb5f6b9
JH
1804 while (s < strend) {
1805 if (*s == ch) {
bf93d4cc 1806 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1807 if (regtry(prog, s)) goto got_it;
1808 s += UTF8SKIP(s);
1809 while (s < strend && *s == ch)
1810 s += UTF8SKIP(s);
1811 }
1812 s += UTF8SKIP(s);
1813 }
1814 }
1815 else {
1816 while (s < strend) {
1817 if (*s == ch) {
bf93d4cc 1818 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1819 if (regtry(prog, s)) goto got_it;
1820 s++;
1821 while (s < strend && *s == ch)
1822 s++;
1823 }
1824 s++;
1825 }
1826 }
b7953727 1827 DEBUG_r(if (!did_match)
bf93d4cc 1828 PerlIO_printf(Perl_debug_log,
b7953727
JH
1829 "Did not find anchored character...\n")
1830 );
6eb5f6b9
JH
1831 }
1832 /*SUPPRESS 560*/
33b8afdf
JH
1833 else if (prog->anchored_substr != Nullsv
1834 || prog->anchored_utf8 != Nullsv
1835 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1836 && prog->float_max_offset < strend - s)) {
1837 SV *must;
1838 I32 back_max;
1839 I32 back_min;
1840 char *last;
6eb5f6b9 1841 char *last1; /* Last position checked before */
bf93d4cc
GS
1842#ifdef DEBUGGING
1843 int did_match = 0;
1844#endif
33b8afdf
JH
1845 if (prog->anchored_substr || prog->anchored_utf8) {
1846 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1847 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1848 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1849 back_max = back_min = prog->anchored_offset;
1850 } else {
1851 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1852 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1853 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1854 back_max = prog->float_max_offset;
1855 back_min = prog->float_min_offset;
1856 }
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1859 goto phooey;
1860
1861 last = HOP3c(strend, /* Cannot start after this */
1862 -(I32)(CHR_SVLEN(must)
1863 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1864
1865 if (s > PL_bostr)
1866 last1 = HOPc(s, -1);
1867 else
1868 last1 = s - 1; /* bogus */
1869
1870 /* XXXX check_substr already used to find `s', can optimize if
1871 check_substr==must. */
1872 scream_pos = -1;
1873 dontbother = end_shift;
1874 strend = HOPc(strend, -dontbother);
1875 while ( (s <= last) &&
9041c2e3 1876 ((flags & REXEC_SCREAM)
1aa99e6b 1877 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1878 end_shift, &scream_pos, 0))
1aa99e6b 1879 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1880 (unsigned char*)strend, must,
6eb5f6b9 1881 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1882 /* we may be pointing at the wrong string */
1883 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1884 s = strbeg + (s - SvPVX(sv));
bf93d4cc 1885 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1886 if (HOPc(s, -back_max) > last1) {
1887 last1 = HOPc(s, -back_min);
1888 s = HOPc(s, -back_max);
1889 }
1890 else {
1891 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1892
1893 last1 = HOPc(s, -back_min);
1894 s = t;
1895 }
1aa99e6b 1896 if (do_utf8) {
6eb5f6b9
JH
1897 while (s <= last1) {
1898 if (regtry(prog, s))
1899 goto got_it;
1900 s += UTF8SKIP(s);
1901 }
1902 }
1903 else {
1904 while (s <= last1) {
1905 if (regtry(prog, s))
1906 goto got_it;
1907 s++;
1908 }
1909 }
1910 }
b7953727
JH
1911 DEBUG_r(if (!did_match)
1912 PerlIO_printf(Perl_debug_log,
1913 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1914 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1915 ? "anchored" : "floating"),
1916 PL_colors[0],
1917 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1918 SvPVX(must),
b7953727
JH
1919 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1920 );
6eb5f6b9
JH
1921 goto phooey;
1922 }
155aba94 1923 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1924 if (minlen) {
1925 I32 op = (U8)OP(prog->regstclass);
66e933ab 1926 /* don't bother with what can't match */
f14c76ed
RGS
1927 if (PL_regkind[op] != EXACT && op != CANY)
1928 strend = HOPc(strend, -(minlen - 1));
1929 }
ffc61ed2
JH
1930 DEBUG_r({
1931 SV *prop = sv_newmortal();
9e55ce06
JH
1932 char *s0;
1933 char *s1;
1934 int len0;
1935 int len1;
1936
ffc61ed2 1937 regprop(prop, c);
9e55ce06
JH
1938 s0 = UTF ?
1939 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1940 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1941 SvPVX(prop);
1942 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1943 s1 = UTF ?
c728cb41 1944 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1945 len1 = UTF ? SvCUR(dsv1) : strend - s;
1946 PerlIO_printf(Perl_debug_log,
1947 "Matching stclass `%*.*s' against `%*.*s'\n",
1948 len0, len0, s0,
1949 len1, len1, s1);
ffc61ed2 1950 });
6eb5f6b9
JH
1951 if (find_byclass(prog, c, s, strend, startpos, 0))
1952 goto got_it;
bf93d4cc 1953 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1954 }
1955 else {
1956 dontbother = 0;
33b8afdf
JH
1957 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1958 /* Trim the end. */
d6a28714 1959 char *last;
33b8afdf
JH
1960 SV* float_real;
1961
1962 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1963 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1964 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1965
1966 if (flags & REXEC_SCREAM) {
33b8afdf 1967 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1968 end_shift, &scream_pos, 1); /* last one */
1969 if (!last)
ffc61ed2 1970 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1971 /* we may be pointing at the wrong string */
1972 else if (RX_MATCH_COPIED(prog))
7ef91622 1973 s = strbeg + (s - SvPVX(sv));
b8c5462f 1974 }
d6a28714
JH
1975 else {
1976 STRLEN len;
33b8afdf 1977 char *little = SvPV(float_real, len);
d6a28714 1978
33b8afdf 1979 if (SvTAIL(float_real)) {
d6a28714
JH
1980 if (memEQ(strend - len + 1, little, len - 1))
1981 last = strend - len + 1;
1982 else if (!PL_multiline)
9041c2e3 1983 last = memEQ(strend - len, little, len)
d6a28714 1984 ? strend - len : Nullch;
b8c5462f 1985 else
d6a28714
JH
1986 goto find_last;
1987 } else {
1988 find_last:
9041c2e3 1989 if (len)
d6a28714 1990 last = rninstr(s, strend, little, little + len);
b8c5462f 1991 else
d6a28714 1992 last = strend; /* matching `$' */
b8c5462f 1993 }
b8c5462f 1994 }
bf93d4cc
GS
1995 if (last == NULL) {
1996 DEBUG_r(PerlIO_printf(Perl_debug_log,
1997 "%sCan't trim the tail, match fails (should not happen)%s\n",
1998 PL_colors[4],PL_colors[5]));
1999 goto phooey; /* Should not happen! */
2000 }
d6a28714
JH
2001 dontbother = strend - last + prog->float_min_offset;
2002 }
2003 if (minlen && (dontbother < minlen))
2004 dontbother = minlen - 1;
2005 strend -= dontbother; /* this one's always in bytes! */
2006 /* We don't know much -- general case. */
1aa99e6b 2007 if (do_utf8) {
d6a28714
JH
2008 for (;;) {
2009 if (regtry(prog, s))
2010 goto got_it;
2011 if (s >= strend)
2012 break;
b8c5462f 2013 s += UTF8SKIP(s);
d6a28714
JH
2014 };
2015 }
2016 else {
2017 do {
2018 if (regtry(prog, s))
2019 goto got_it;
2020 } while (s++ < strend);
2021 }
2022 }
2023
2024 /* Failure. */
2025 goto phooey;
2026
2027got_it:
2028 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2029
2030 if (PL_reg_eval_set) {
2031 /* Preserve the current value of $^R */
2032 if (oreplsv != GvSV(PL_replgv))
2033 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2034 restored, the value remains
2035 the same. */
acfe0abc 2036 restore_pos(aTHX_ 0);
d6a28714
JH
2037 }
2038
2039 /* make sure $`, $&, $', and $digit will work later */
2040 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2041 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2042 if (flags & REXEC_COPY_STR) {
2043 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2044#ifdef PERL_COPY_ON_WRITE
2045 if ((SvIsCOW(sv)
2046 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2047 if (DEBUG_C_TEST) {
2048 PerlIO_printf(Perl_debug_log,
2049 "Copy on write: regexp capture, type %d\n",
2050 (int) SvTYPE(sv));
2051 }
2052 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2053 prog->subbeg = SvPVX(prog->saved_copy);
2054 assert (SvPOKp(prog->saved_copy));
2055 } else
2056#endif
2057 {
2058 RX_MATCH_COPIED_on(prog);
2059 s = savepvn(strbeg, i);
2060 prog->subbeg = s;
2061 }
d6a28714 2062 prog->sublen = i;
d6a28714
JH
2063 }
2064 else {
2065 prog->subbeg = strbeg;
2066 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2067 }
2068 }
9041c2e3 2069
d6a28714
JH
2070 return 1;
2071
2072phooey:
bf93d4cc
GS
2073 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2074 PL_colors[4],PL_colors[5]));
d6a28714 2075 if (PL_reg_eval_set)
acfe0abc 2076 restore_pos(aTHX_ 0);
d6a28714
JH
2077 return 0;
2078}
2079
2080/*
2081 - regtry - try match at specific point
2082 */
2083STATIC I32 /* 0 failure, 1 success */
2084S_regtry(pTHX_ regexp *prog, char *startpos)
2085{
d6a28714
JH
2086 register I32 i;
2087 register I32 *sp;
2088 register I32 *ep;
2089 CHECKPOINT lastcp;
2090
02db2b7b
IZ
2091#ifdef DEBUGGING
2092 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2093#endif
d6a28714
JH
2094 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2095 MAGIC *mg;
2096
2097 PL_reg_eval_set = RS_init;
2098 DEBUG_r(DEBUG_s(
b900a521
JH
2099 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2100 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2101 ));
e8347627 2102 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2103 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2104 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2105 SAVETMPS;
2106 /* Apparently this is not needed, judging by wantarray. */
e8347627 2107 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2108 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2109
2110 if (PL_reg_sv) {
2111 /* Make $_ available to executed code. */
2112 if (PL_reg_sv != DEFSV) {
59f00321 2113 SAVE_DEFSV;
d6a28714 2114 DEFSV = PL_reg_sv;
b8c5462f 2115 }
d6a28714 2116
9041c2e3 2117 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2118 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2119 /* prepare for quick setting of pos */
14befaf4
DM
2120 sv_magic(PL_reg_sv, (SV*)0,
2121 PERL_MAGIC_regex_global, Nullch, 0);
2122 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2123 mg->mg_len = -1;
b8c5462f 2124 }
d6a28714
JH
2125 PL_reg_magic = mg;
2126 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2127 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2128 }
09687e5a 2129 if (!PL_reg_curpm) {
0f79a09d 2130 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2131#ifdef USE_ITHREADS
2132 {
2133 SV* repointer = newSViv(0);
577e12cc 2134 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2135 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2136 av_push(PL_regex_padav,repointer);
2137 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2138 PL_regex_pad = AvARRAY(PL_regex_padav);
2139 }
2140#endif
2141 }
aaa362c4 2142 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2143 PL_reg_oldcurpm = PL_curpm;
2144 PL_curpm = PL_reg_curpm;
2145 if (RX_MATCH_COPIED(prog)) {
2146 /* Here is a serious problem: we cannot rewrite subbeg,
2147 since it may be needed if this match fails. Thus
2148 $` inside (?{}) could fail... */
2149 PL_reg_oldsaved = prog->subbeg;
2150 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2151#ifdef PERL_COPY_ON_WRITE
2152 PL_nrs = prog->saved_copy;
2153#endif
d6a28714
JH
2154 RX_MATCH_COPIED_off(prog);
2155 }
2156 else
2157 PL_reg_oldsaved = Nullch;
2158 prog->subbeg = PL_bostr;
2159 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2160 }
2161 prog->startp[0] = startpos - PL_bostr;
2162 PL_reginput = startpos;
2163 PL_regstartp = prog->startp;
2164 PL_regendp = prog->endp;
2165 PL_reglastparen = &prog->lastparen;
a01268b5 2166 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2167 prog->lastparen = 0;
03994de8 2168 prog->lastcloseparen = 0;
d6a28714
JH
2169 PL_regsize = 0;
2170 DEBUG_r(PL_reg_starttry = startpos);
2171 if (PL_reg_start_tmpl <= prog->nparens) {
2172 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2173 if(PL_reg_start_tmp)
2174 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2175 else
2176 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2177 }
2178
2179 /* XXXX What this code is doing here?!!! There should be no need
2180 to do this again and again, PL_reglastparen should take care of
3dd2943c 2181 this! --ilya*/
dafc8851
JH
2182
2183 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2184 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2185 * PL_reglastparen), is not needed at all by the test suite
2186 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2187 * enough, for building DynaLoader, or otherwise this
2188 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2189 * will happen. Meanwhile, this code *is* needed for the
2190 * above-mentioned test suite tests to succeed. The common theme
2191 * on those tests seems to be returning null fields from matches.
2192 * --jhi */
dafc8851 2193#if 1
d6a28714
JH
2194 sp = prog->startp;
2195 ep = prog->endp;
2196 if (prog->nparens) {
eb160463 2197 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2198 *++sp = -1;
2199 *++ep = -1;
2200 }
2201 }
dafc8851 2202#endif
02db2b7b 2203 REGCP_SET(lastcp);
d6a28714
JH
2204 if (regmatch(prog->program + 1)) {
2205 prog->endp[0] = PL_reginput - PL_bostr;
2206 return 1;
2207 }
02db2b7b 2208 REGCP_UNWIND(lastcp);
d6a28714
JH
2209 return 0;
2210}
2211
02db2b7b
IZ
2212#define RE_UNWIND_BRANCH 1
2213#define RE_UNWIND_BRANCHJ 2
2214
2215union re_unwind_t;
2216
2217typedef struct { /* XX: makes sense to enlarge it... */
2218 I32 type;
2219 I32 prev;
2220 CHECKPOINT lastcp;
2221} re_unwind_generic_t;
2222
2223typedef struct {
2224 I32 type;
2225 I32 prev;
2226 CHECKPOINT lastcp;
2227 I32 lastparen;
2228 regnode *next;
2229 char *locinput;
2230 I32 nextchr;
2231#ifdef DEBUGGING
2232 int regindent;
2233#endif
2234} re_unwind_branch_t;
2235
2236typedef union re_unwind_t {
2237 I32 type;
2238 re_unwind_generic_t generic;
2239 re_unwind_branch_t branch;
2240} re_unwind_t;
2241
8ba1375e
MJD
2242#define sayYES goto yes
2243#define sayNO goto no
e0f9d4a8 2244#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2245#define sayYES_FINAL goto yes_final
2246#define sayYES_LOUD goto yes_loud
2247#define sayNO_FINAL goto no_final
2248#define sayNO_SILENT goto do_no
2249#define saySAME(x) if (x) goto yes; else goto no
2250
2251#define REPORT_CODE_OFF 24
2252
d6a28714
JH
2253/*
2254 - regmatch - main matching routine
2255 *
2256 * Conceptually the strategy is simple: check to see whether the current
2257 * node matches, call self recursively to see whether the rest matches,
2258 * and then act accordingly. In practice we make some effort to avoid
2259 * recursion, in particular by going through "ordinary" nodes (that don't
2260 * need to know whether the rest of the match failed) by a loop instead of
2261 * by recursion.
2262 */
2263/* [lwall] I've hoisted the register declarations to the outer block in order to
2264 * maybe save a little bit of pushing and popping on the stack. It also takes
2265 * advantage of machines that use a register save mask on subroutine entry.
2266 */
2267STATIC I32 /* 0 failure, 1 success */
2268S_regmatch(pTHX_ regnode *prog)
2269{
d6a28714
JH
2270 register regnode *scan; /* Current node. */
2271 regnode *next; /* Next node. */
2272 regnode *inner; /* Next node in internal branch. */
2273 register I32 nextchr; /* renamed nextchr - nextchar colides with
2274 function of same name */
2275 register I32 n; /* no or next */
b7953727
JH
2276 register I32 ln = 0; /* len or last */
2277 register char *s = Nullch; /* operand or save */
d6a28714 2278 register char *locinput = PL_reginput;
b7953727 2279 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2280 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2281 I32 unwind = 0;
b7953727 2282#if 0
02db2b7b 2283 I32 firstcp = PL_savestack_ix;
b7953727 2284#endif
53c4c00c 2285 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2286#ifdef DEBUGGING
ce333219
JH
2287 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2288 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2289 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2290#endif
02db2b7b 2291
d6a28714
JH
2292#ifdef DEBUGGING
2293 PL_regindent++;
2294#endif
2295
2296 /* Note that nextchr is a byte even in UTF */
2297 nextchr = UCHARAT(locinput);
2298 scan = prog;
2299 while (scan != NULL) {
8ba1375e 2300
2a782b5b 2301 DEBUG_r( {
d6a28714
JH
2302 SV *prop = sv_newmortal();
2303 int docolor = *PL_colors[0];
2304 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2305 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2306 /* The part of the string before starttry has one color
2307 (pref0_len chars), between starttry and current
2308 position another one (pref_len - pref0_len chars),
2309 after the current position the third one.
2310 We assume that pref0_len <= pref_len, otherwise we
2311 decrease pref0_len. */
9041c2e3 2312 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2313 ? (5 + taill) - l : locinput - PL_bostr;
2314 int pref0_len;
d6a28714 2315
df1ffd02 2316 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2317 pref_len++;
2318 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2319 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2320 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2321 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2322 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2323 l--;
d6a28714
JH
2324 if (pref0_len < 0)
2325 pref0_len = 0;
2326 if (pref0_len > pref_len)
2327 pref0_len = pref_len;
2328 regprop(prop, scan);
2a782b5b
JH
2329 {
2330 char *s0 =
f14c76ed 2331 do_utf8 && OP(scan) != CANY ?
2a782b5b 2332 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2333 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2334 locinput - pref_len;
df1ffd02 2335 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2336 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2337 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2338 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2339 locinput - pref_len + pref0_len;
df1ffd02 2340 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2341 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2342 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2343 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2344 locinput;
df1ffd02 2345 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2346 PerlIO_printf(Perl_debug_log,
2347 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2348 (IV)(locinput - PL_bostr),
2349 PL_colors[4],
2350 len0, s0,
2351 PL_colors[5],
2352 PL_colors[2],
2353 len1, s1,
2354 PL_colors[3],
2355 (docolor ? "" : "> <"),
2356 PL_colors[0],
2357 len2, s2,
2358 PL_colors[1],
2359 15 - l - pref_len + 1,
2360 "",
2361 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2362 SvPVX(prop));
2363 }
2364 });
d6a28714
JH
2365
2366 next = scan + NEXT_OFF(scan);
2367 if (next == scan)
2368 next = NULL;
2369
2370 switch (OP(scan)) {
2371 case BOL:
12d33761
HS
2372 if (locinput == PL_bostr || (PL_multiline &&
2373 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2374 {
2375 /* regtill = regbol; */
b8c5462f
JH
2376 break;
2377 }
d6a28714
JH
2378 sayNO;
2379 case MBOL:
12d33761
HS
2380 if (locinput == PL_bostr ||
2381 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2382 {
b8c5462f
JH
2383 break;
2384 }
d6a28714
JH
2385 sayNO;
2386 case SBOL:
c2a73568 2387 if (locinput == PL_bostr)
b8c5462f 2388 break;
d6a28714
JH
2389 sayNO;
2390 case GPOS:
2391 if (locinput == PL_reg_ganch)
2392 break;
2393 sayNO;
2394 case EOL:
2395 if (PL_multiline)
2396 goto meol;
2397 else
2398 goto seol;
2399 case MEOL:
2400 meol:
2401 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2402 sayNO;
b8c5462f 2403 break;
d6a28714
JH
2404 case SEOL:
2405 seol:
2406 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2407 sayNO;
d6a28714 2408 if (PL_regeol - locinput > 1)
b8c5462f 2409 sayNO;
b8c5462f 2410 break;
d6a28714
JH
2411 case EOS:
2412 if (PL_regeol != locinput)
b8c5462f 2413 sayNO;
d6a28714 2414 break;
ffc61ed2 2415 case SANY:
d6a28714 2416 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2417 sayNO;
f33976b4
DB
2418 if (do_utf8) {
2419 locinput += PL_utf8skip[nextchr];
2420 if (locinput > PL_regeol)
2421 sayNO;
2422 nextchr = UCHARAT(locinput);
2423 }
2424 else
2425 nextchr = UCHARAT(++locinput);
2426 break;
2427 case CANY:
2428 if (!nextchr && locinput >= PL_regeol)
2429 sayNO;
b8c5462f 2430 nextchr = UCHARAT(++locinput);
a0d0e21e 2431 break;
ffc61ed2 2432 case REG_ANY:
1aa99e6b
IH
2433 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2434 sayNO;
2435 if (do_utf8) {
b8c5462f 2436 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2437 if (locinput > PL_regeol)
2438 sayNO;
a0ed51b3 2439 nextchr = UCHARAT(locinput);
a0ed51b3 2440 }
1aa99e6b
IH
2441 else
2442 nextchr = UCHARAT(++locinput);
a0ed51b3 2443 break;
d6a28714 2444 case EXACT:
cd439c50
IZ
2445 s = STRING(scan);
2446 ln = STR_LEN(scan);
eb160463 2447 if (do_utf8 != UTF) {
bc517b45 2448 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2449 char *l = locinput;
2450 char *e = s + ln;
bc517b45 2451 STRLEN ulen;
a72c7584 2452
5ff6fc6d
JH
2453 if (do_utf8) {
2454 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2455 while (s < e) {
2456 if (l >= PL_regeol)
5ff6fc6d
JH
2457 sayNO;
2458 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2459 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2460 ckWARN(WARN_UTF8) ?
2461 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2462 sayNO;
bc517b45 2463 l += ulen;
5ff6fc6d 2464 s ++;
1aa99e6b 2465 }
5ff6fc6d
JH
2466 }
2467 else {
2468 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2469 while (s < e) {
2470 if (l >= PL_regeol)
2471 sayNO;
5ff6fc6d 2472 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2473 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2474 ckWARN(WARN_UTF8) ?
2475 0 : UTF8_ALLOW_ANY))
1aa99e6b 2476 sayNO;
bc517b45 2477 s += ulen;
a72c7584 2478 l ++;
1aa99e6b 2479 }
5ff6fc6d 2480 }
1aa99e6b
IH
2481 locinput = l;
2482 nextchr = UCHARAT(locinput);
2483 break;
2484 }
bc517b45 2485 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2486 /* Inline the first character, for speed. */
2487 if (UCHARAT(s) != nextchr)
2488 sayNO;
2489 if (PL_regeol - locinput < ln)
2490 sayNO;
2491 if (ln > 1 && memNE(s, locinput, ln))
2492 sayNO;
2493 locinput += ln;
2494 nextchr = UCHARAT(locinput);
2495 break;
2496 case EXACTFL:
b8c5462f
JH
2497 PL_reg_flags |= RF_tainted;
2498 /* FALL THROUGH */
d6a28714 2499 case EXACTF:
cd439c50
IZ
2500 s = STRING(scan);
2501 ln = STR_LEN(scan);
d6a28714 2502
d07ddd77
JH
2503 if (do_utf8 || UTF) {
2504 /* Either target or the pattern are utf8. */
d6a28714 2505 char *l = locinput;
d07ddd77 2506 char *e = PL_regeol;
bc517b45 2507
eb160463 2508 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2509 l, &e, 0, do_utf8)) {
5486206c
JH
2510 /* One more case for the sharp s:
2511 * pack("U0U*", 0xDF) =~ /ss/i,
2512 * the 0xC3 0x9F are the UTF-8
2513 * byte sequence for the U+00DF. */
2514 if (!(do_utf8 &&
2515 toLOWER(s[0]) == 's' &&
2516 ln >= 2 &&
2517 toLOWER(s[1]) == 's' &&
2518 (U8)l[0] == 0xC3 &&
2519 e - l >= 2 &&
2520 (U8)l[1] == 0x9F))
2521 sayNO;
2522 }
d07ddd77
JH
2523 locinput = e;
2524 nextchr = UCHARAT(locinput);
2525 break;
a0ed51b3 2526 }
d6a28714 2527
bc517b45
JH
2528 /* Neither the target and the pattern are utf8. */
2529
d6a28714
JH
2530 /* Inline the first character, for speed. */
2531 if (UCHARAT(s) != nextchr &&
2532 UCHARAT(s) != ((OP(scan) == EXACTF)
2533 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2534 sayNO;
d6a28714 2535 if (PL_regeol - locinput < ln)
b8c5462f 2536 sayNO;
d6a28714
JH
2537 if (ln > 1 && (OP(scan) == EXACTF
2538 ? ibcmp(s, locinput, ln)
2539 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2540 sayNO;
d6a28714
JH
2541 locinput += ln;
2542 nextchr = UCHARAT(locinput);
a0d0e21e 2543 break;
d6a28714 2544 case ANYOF:
ffc61ed2 2545 if (do_utf8) {
9e55ce06
JH
2546 STRLEN inclasslen = PL_regeol - locinput;
2547
ba7b4546 2548 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2549 sayNO_ANYOF;
ffc61ed2
JH
2550 if (locinput >= PL_regeol)
2551 sayNO;
0f0076b4 2552 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2553 nextchr = UCHARAT(locinput);
e0f9d4a8 2554 break;
ffc61ed2
JH
2555 }
2556 else {
2557 if (nextchr < 0)
2558 nextchr = UCHARAT(locinput);
7d3e948e 2559 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2560 sayNO_ANYOF;
ffc61ed2
JH
2561 if (!nextchr && locinput >= PL_regeol)
2562 sayNO;
2563 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2564 break;
2565 }
2566 no_anyof:
2567 /* If we might have the case of the German sharp s
2568 * in a casefolding Unicode character class. */
2569
ebc501f0
JH
2570 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2571 locinput += SHARP_S_SKIP;
e0f9d4a8 2572 nextchr = UCHARAT(locinput);
ffc61ed2 2573 }
e0f9d4a8
JH
2574 else
2575 sayNO;
b8c5462f 2576 break;
d6a28714 2577 case ALNUML:
b8c5462f
JH
2578 PL_reg_flags |= RF_tainted;
2579 /* FALL THROUGH */
d6a28714 2580 case ALNUM:
b8c5462f 2581 if (!nextchr)
4633a7c4 2582 sayNO;
ffc61ed2 2583 if (do_utf8) {
ad24be35 2584 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2585 if (!(OP(scan) == ALNUM
3568d838 2586 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2587 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2588 {
2589 sayNO;
a0ed51b3 2590 }
b8c5462f 2591 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2592 nextchr = UCHARAT(locinput);
2593 break;
2594 }
ffc61ed2 2595 if (!(OP(scan) == ALNUM
d6a28714 2596 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2597 sayNO;
b8c5462f 2598 nextchr = UCHARAT(++locinput);
a0d0e21e 2599 break;
d6a28714 2600 case NALNUML:
b8c5462f
JH
2601 PL_reg_flags |= RF_tainted;
2602 /* FALL THROUGH */
d6a28714
JH
2603 case NALNUM:
2604 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2605 sayNO;
ffc61ed2 2606 if (do_utf8) {
8269fa76 2607 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2608 if (OP(scan) == NALNUM
3568d838 2609 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2610 : isALNUM_LC_utf8((U8*)locinput))
2611 {
b8c5462f 2612 sayNO;
d6a28714 2613 }
b8c5462f
JH
2614 locinput += PL_utf8skip[nextchr];
2615 nextchr = UCHARAT(locinput);
2616 break;
2617 }
ffc61ed2 2618 if (OP(scan) == NALNUM
d6a28714 2619 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2620 sayNO;
76e3520e 2621 nextchr = UCHARAT(++locinput);
a0d0e21e 2622 break;
d6a28714
JH
2623 case BOUNDL:
2624 case NBOUNDL:
3280af22 2625 PL_reg_flags |= RF_tainted;
bbce6d69 2626 /* FALL THROUGH */
d6a28714
JH
2627 case BOUND:
2628 case NBOUND:
2629 /* was last char in word? */
ffc61ed2 2630 if (do_utf8) {
12d33761
HS
2631 if (locinput == PL_bostr)
2632 ln = '\n';
ffc61ed2 2633 else {
b4f7163a 2634 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2635
b4f7163a 2636 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2637 }
2638 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2639 ln = isALNUM_uni(ln);
8269fa76 2640 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2641 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2642 }
2643 else {
9041c2e3 2644 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2645 n = isALNUM_LC_utf8((U8*)locinput);
2646 }
a0ed51b3 2647 }
d6a28714 2648 else {
12d33761
HS
2649 ln = (locinput != PL_bostr) ?
2650 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2651 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2652 ln = isALNUM(ln);
2653 n = isALNUM(nextchr);
2654 }
2655 else {
2656 ln = isALNUM_LC(ln);
2657 n = isALNUM_LC(nextchr);
2658 }
d6a28714 2659 }
ffc61ed2
JH
2660 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2661 OP(scan) == BOUNDL))
2662 sayNO;
a0ed51b3 2663 break;
d6a28714 2664 case SPACEL:
3280af22 2665 PL_reg_flags |= RF_tainted;
bbce6d69 2666 /* FALL THROUGH */
d6a28714 2667 case SPACE:
9442cb0e 2668 if (!nextchr)
4633a7c4 2669 sayNO;
1aa99e6b 2670 if (do_utf8) {
fd400ab9 2671 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2672 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2673 if (!(OP(scan) == SPACE
3568d838 2674 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2675 : isSPACE_LC_utf8((U8*)locinput)))
2676 {
2677 sayNO;
2678 }
2679 locinput += PL_utf8skip[nextchr];
2680 nextchr = UCHARAT(locinput);
2681 break;
d6a28714 2682 }
ffc61ed2
JH
2683 if (!(OP(scan) == SPACE
2684 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2685 sayNO;
2686 nextchr = UCHARAT(++locinput);
2687 }
2688 else {
2689 if (!(OP(scan) == SPACE
2690 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2691 sayNO;
2692 nextchr = UCHARAT(++locinput);
a0ed51b3 2693 }
a0ed51b3 2694 break;
d6a28714 2695 case NSPACEL:
3280af22 2696 PL_reg_flags |= RF_tainted;
bbce6d69 2697 /* FALL THROUGH */
d6a28714 2698 case NSPACE:
9442cb0e 2699 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2700 sayNO;
1aa99e6b 2701 if (do_utf8) {
8269fa76 2702 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2703 if (OP(scan) == NSPACE
3568d838 2704 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2705 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2706 {
2707 sayNO;
2708 }
2709 locinput += PL_utf8skip[nextchr];
2710 nextchr = UCHARAT(locinput);
2711 break;
a0ed51b3 2712 }
ffc61ed2 2713 if (OP(scan) == NSPACE
d6a28714 2714 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2715 sayNO;
76e3520e 2716 nextchr = UCHARAT(++locinput);
a0d0e21e 2717 break;
d6a28714 2718 case DIGITL:
a0ed51b3
LW
2719 PL_reg_flags |= RF_tainted;
2720 /* FALL THROUGH */
d6a28714 2721 case DIGIT:
9442cb0e 2722 if (!nextchr)
a0ed51b3 2723 sayNO;
1aa99e6b 2724 if (do_utf8) {
8269fa76 2725 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2726 if (!(OP(scan) == DIGIT
3568d838 2727 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2728 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2729 {
a0ed51b3 2730 sayNO;
dfe13c55 2731 }
6f06b55f 2732 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2733 nextchr = UCHARAT(locinput);
2734 break;
2735 }
ffc61ed2 2736 if (!(OP(scan) == DIGIT
9442cb0e 2737 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2738 sayNO;
2739 nextchr = UCHARAT(++locinput);
2740 break;
d6a28714 2741 case NDIGITL:
b8c5462f
JH
2742 PL_reg_flags |= RF_tainted;
2743 /* FALL THROUGH */
d6a28714 2744 case NDIGIT:
9442cb0e 2745 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2746 sayNO;
1aa99e6b 2747 if (do_utf8) {
8269fa76 2748 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2749 if (OP(scan) == NDIGIT
3568d838 2750 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2751 : isDIGIT_LC_utf8((U8*)locinput))
2752 {
a0ed51b3 2753 sayNO;
9442cb0e 2754 }
6f06b55f 2755 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2756 nextchr = UCHARAT(locinput);
2757 break;
2758 }
ffc61ed2 2759 if (OP(scan) == NDIGIT
9442cb0e 2760 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2761 sayNO;
2762 nextchr = UCHARAT(++locinput);
2763 break;
2764 case CLUMP:
b7c83a7e 2765 if (locinput >= PL_regeol)
a0ed51b3 2766 sayNO;
b7c83a7e
JH
2767 if (do_utf8) {
2768 LOAD_UTF8_CHARCLASS(mark,"~");
2769 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2770 sayNO;
2771 locinput += PL_utf8skip[nextchr];
2772 while (locinput < PL_regeol &&
2773 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2774 locinput += UTF8SKIP(locinput);
2775 if (locinput > PL_regeol)
2776 sayNO;
eb08e2da
JH
2777 }
2778 else
2779 locinput++;
a0ed51b3
LW
2780 nextchr = UCHARAT(locinput);
2781 break;
c8756f30 2782 case REFFL:
3280af22 2783 PL_reg_flags |= RF_tainted;
c8756f30 2784 /* FALL THROUGH */
c277df42 2785 case REF:
c8756f30 2786 case REFF:
c277df42 2787 n = ARG(scan); /* which paren pair */
cf93c79d 2788 ln = PL_regstartp[n];
2c2d71f5 2789 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2790 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2791 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2792 if (ln == PL_regendp[n])
a0d0e21e 2793 break;
a0ed51b3 2794
cf93c79d 2795 s = PL_bostr + ln;
1aa99e6b 2796 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2797 char *l = locinput;
cf93c79d 2798 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2799 /*
2800 * Note that we can't do the "other character" lookup trick as
2801 * in the 8-bit case (no pun intended) because in Unicode we
2802 * have to map both upper and title case to lower case.
2803 */
2804 if (OP(scan) == REFF) {
a2a2844f 2805 STRLEN ulen1, ulen2;
e7ae6809
JH
2806 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2807 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2808 while (s < e) {
2809 if (l >= PL_regeol)
2810 sayNO;
a2a2844f
JH
2811 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2812 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2813 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2814 sayNO;
a2a2844f
JH
2815 s += ulen1;
2816 l += ulen2;
a0ed51b3
LW
2817 }
2818 }
2819 locinput = l;
2820 nextchr = UCHARAT(locinput);
2821 break;
2822 }
2823
a0d0e21e 2824 /* Inline the first character, for speed. */
76e3520e 2825 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2826 (OP(scan) == REF ||
2827 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2828 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2829 sayNO;
cf93c79d 2830 ln = PL_regendp[n] - ln;
3280af22 2831 if (locinput + ln > PL_regeol)
4633a7c4 2832 sayNO;
c8756f30
AK
2833 if (ln > 1 && (OP(scan) == REF
2834 ? memNE(s, locinput, ln)
2835 : (OP(scan) == REFF
2836 ? ibcmp(s, locinput, ln)
2837 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2838 sayNO;
a0d0e21e 2839 locinput += ln;
76e3520e 2840 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2841 break;
2842
2843 case NOTHING:
c277df42 2844 case TAIL:
a0d0e21e
LW
2845 break;
2846 case BACK:
2847 break;
c277df42
IZ
2848 case EVAL:
2849 {
2850 dSP;
533c011a 2851 OP_4tree *oop = PL_op;
3280af22 2852 COP *ocurcop = PL_curcop;
f3548bdc 2853 PAD *old_comppad;
c277df42 2854 SV *ret;
080c2dec 2855 struct regexp *oreg = PL_reg_re;
9041c2e3 2856
c277df42 2857 n = ARG(scan);
533c011a 2858 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2859 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2860 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2861 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2862
8e5e9ebe
RGS
2863 {
2864 SV **before = SP;
2865 CALLRUNOPS(aTHX); /* Scalar context. */
2866 SPAGAIN;
2867 if (SP == before)
075aa684 2868 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2869 else {
2870 ret = POPs;
2871 PUTBACK;
2872 }
2873 }
2874
0f5d15d6 2875 PL_op = oop;
f3548bdc 2876 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2877 PL_curcop = ocurcop;
c277df42 2878 if (logical) {
0f5d15d6
IZ
2879 if (logical == 2) { /* Postponed subexpression. */
2880 regexp *re;
22c35a8c 2881 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2882 re_cc_state state;
0f5d15d6 2883 CHECKPOINT cp, lastcp;
cb50f42d 2884 int toggleutf;
faf82a0b 2885 register SV *sv;
0f5d15d6 2886
faf82a0b
AE
2887 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2888 mg = mg_find(sv, PERL_MAGIC_qr);
2889 else if (SvSMAGICAL(ret)) {
2890 if (SvGMAGICAL(ret))
2891 sv_unmagic(ret, PERL_MAGIC_qr);
2892 else
2893 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2894 }
faf82a0b 2895
0f5d15d6
IZ
2896 if (mg) {
2897 re = (regexp *)mg->mg_obj;
df0003d4 2898 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2899 }
2900 else {
2901 STRLEN len;
2902 char *t = SvPV(ret, len);
2903 PMOP pm;
2904 char *oprecomp = PL_regprecomp;
2905 I32 osize = PL_regsize;
2906 I32 onpar = PL_regnpar;
2907
5fcd1c1b 2908 Zero(&pm, 1, PMOP);
cb50f42d 2909 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2910 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2911 if (!(SvFLAGS(ret)
faf82a0b
AE
2912 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2913 | SVs_GMG)))
14befaf4
DM
2914 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2915 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2916 PL_regprecomp = oprecomp;
2917 PL_regsize = osize;
2918 PL_regnpar = onpar;
2919 }
2920 DEBUG_r(
9041c2e3 2921 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2922 "Entering embedded `%s%.60s%s%s'\n",
2923 PL_colors[0],
2924 re->precomp,
2925 PL_colors[1],
2926 (strlen(re->precomp) > 60 ? "..." : ""))
2927 );
2928 state.node = next;
2929 state.prev = PL_reg_call_cc;
2930 state.cc = PL_regcc;
2931 state.re = PL_reg_re;
2932
2ab05381 2933 PL_regcc = 0;
9041c2e3 2934
0f5d15d6 2935 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2936 REGCP_SET(lastcp);
0f5d15d6
IZ
2937 cache_re(re);
2938 state.ss = PL_savestack_ix;
2939 *PL_reglastparen = 0;
a01268b5 2940 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2941 PL_reg_call_cc = &state;
2942 PL_reginput = locinput;
cb50f42d
YST
2943 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2944 ((re->reganch & ROPT_UTF8) != 0);
2945 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2946
2947 /* XXXX This is too dramatic a measure... */
2948 PL_reg_maxiter = 0;
2949
0f5d15d6 2950 if (regmatch(re->program + 1)) {
2c914db6
IZ
2951 /* Even though we succeeded, we need to restore
2952 global variables, since we may be wrapped inside
2953 SUSPEND, thus the match may be not finished yet. */
2954
2955 /* XXXX Do this only if SUSPENDed? */
2956 PL_reg_call_cc = state.prev;
2957 PL_regcc = state.cc;
2958 PL_reg_re = state.re;
2959 cache_re(PL_reg_re);
cb50f42d 2960 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2961
2962 /* XXXX This is too dramatic a measure... */
2963 PL_reg_maxiter = 0;
2964
2965 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2966 ReREFCNT_dec(re);
2967 regcpblow(cp);
2968 sayYES;
2969 }
0f5d15d6 2970 ReREFCNT_dec(re);
02db2b7b 2971 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2972 regcppop();
2973 PL_reg_call_cc = state.prev;
2974 PL_regcc = state.cc;
2975 PL_reg_re = state.re;
d3790889 2976 cache_re(PL_reg_re);
cb50f42d 2977 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2978
2979 /* XXXX This is too dramatic a measure... */
2980 PL_reg_maxiter = 0;
2981
8e514ae6 2982 logical = 0;
0f5d15d6
IZ
2983 sayNO;
2984 }
c277df42 2985 sw = SvTRUE(ret);
0f5d15d6 2986 logical = 0;
a0ed51b3 2987 }
080c2dec 2988 else {
3280af22 2989 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
2990 cache_re(oreg);
2991 }
c277df42
IZ
2992 break;
2993 }
a0d0e21e 2994 case OPEN:
c277df42 2995 n = ARG(scan); /* which paren pair */
3280af22
NIS
2996 PL_reg_start_tmp[n] = locinput;
2997 if (n > PL_regsize)
2998 PL_regsize = n;
a0d0e21e
LW
2999 break;
3000 case CLOSE:
c277df42 3001 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3002 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3003 PL_regendp[n] = locinput - PL_bostr;
eb160463 3004 if (n > (I32)*PL_reglastparen)
3280af22 3005 *PL_reglastparen = n;
a01268b5 3006 *PL_reglastcloseparen = n;
a0d0e21e 3007 break;
c277df42
IZ
3008 case GROUPP:
3009 n = ARG(scan); /* which paren pair */
eb160463 3010 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3011 break;
3012 case IFTHEN:
2c2d71f5 3013 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3014 if (sw)
3015 next = NEXTOPER(NEXTOPER(scan));
3016 else {
3017 next = scan + ARG(scan);
3018 if (OP(next) == IFTHEN) /* Fake one. */
3019 next = NEXTOPER(NEXTOPER(next));
3020 }
3021 break;
3022 case LOGICAL:
0f5d15d6 3023 logical = scan->flags;
c277df42 3024 break;
2ab05381
IZ
3025/*******************************************************************
3026 PL_regcc contains infoblock about the innermost (...)* loop, and
3027 a pointer to the next outer infoblock.
3028
3029 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3030
3031 1) After matching X, regnode for CURLYX is processed;
3032
9041c2e3 3033 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3034 regmatch() recursively with the starting point at WHILEM node;
3035
3036 3) Each hit of WHILEM node tries to match A and Z (in the order
3037 depending on the current iteration, min/max of {min,max} and
3038 greediness). The information about where are nodes for "A"
3039 and "Z" is read from the infoblock, as is info on how many times "A"
3040 was already matched, and greediness.
3041
3042 4) After A matches, the same WHILEM node is hit again.
3043
3044 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3045 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3046 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3047 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3048 of the external loop.
3049
3050 Currently present infoblocks form a tree with a stem formed by PL_curcc
3051 and whatever it mentions via ->next, and additional attached trees
3052 corresponding to temporarily unset infoblocks as in "5" above.
3053
9041c2e3 3054 In the following picture infoblocks for outer loop of
2ab05381
IZ
3055 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3056 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3057 infoblocks are drawn below the "reset" infoblock.
3058
3059 In fact in the picture below we do not show failed matches for Z and T
3060 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3061 more obvious *why* one needs to *temporary* unset infoblocks.]
3062
3063 Matched REx position InfoBlocks Comment
3064 (Y(A)*?Z)*?T x
3065 Y(A)*?Z)*?T x <- O
3066 Y (A)*?Z)*?T x <- O
3067 Y A)*?Z)*?T x <- O <- I
3068 YA )*?Z)*?T x <- O <- I
3069 YA A)*?Z)*?T x <- O <- I
3070 YAA )*?Z)*?T x <- O <- I
3071 YAA Z)*?T x <- O # Temporary unset I
3072 I
3073
3074 YAAZ Y(A)*?Z)*?T x <- O
3075 I
3076
3077 YAAZY (A)*?Z)*?T x <- O
3078 I
3079
3080 YAAZY A)*?Z)*?T x <- O <- I
3081 I
3082
3083 YAAZYA )*?Z)*?T x <- O <- I
3084 I
3085
3086 YAAZYA Z)*?T x <- O # Temporary unset I
3087 I,I
3088
3089 YAAZYAZ )*?T x <- O
3090 I,I
3091
3092 YAAZYAZ T x # Temporary unset O
3093 O
3094 I,I
3095
3096 YAAZYAZT x
3097 O
3098 I,I
3099 *******************************************************************/
a0d0e21e
LW
3100 case CURLYX: {
3101 CURCUR cc;
3280af22 3102 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3103 /* No need to save/restore up to this paren */
3104 I32 parenfloor = scan->flags;
c277df42
IZ
3105
3106 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3107 next += ARG(next);
3280af22
NIS
3108 cc.oldcc = PL_regcc;
3109 PL_regcc = &cc;
cb434fcc
IZ
3110 /* XXXX Probably it is better to teach regpush to support
3111 parenfloor > PL_regsize... */
eb160463 3112 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3113 parenfloor = *PL_reglastparen; /* Pessimization... */
3114 cc.parenfloor = parenfloor;
a0d0e21e
LW
3115 cc.cur = -1;
3116 cc.min = ARG1(scan);
3117 cc.max = ARG2(scan);
c277df42 3118 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3119 cc.next = next;
3120 cc.minmod = minmod;
3121 cc.lastloc = 0;
3280af22 3122 PL_reginput = locinput;
a0d0e21e
LW
3123 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3124 regcpblow(cp);
3280af22 3125 PL_regcc = cc.oldcc;
4633a7c4 3126 saySAME(n);
a0d0e21e
LW
3127 }
3128 /* NOT REACHED */
3129 case WHILEM: {
3130 /*
3131 * This is really hard to understand, because after we match
3132 * what we're trying to match, we must make sure the rest of
2c2d71f5 3133 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3134 * to go back UP the parse tree by recursing ever deeper. And
3135 * if it fails, we have to reset our parent's current state
3136 * that we can try again after backing off.
3137 */
3138
c277df42 3139 CHECKPOINT cp, lastcp;
3280af22 3140 CURCUR* cc = PL_regcc;
c277df42
IZ
3141 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3142
4633a7c4 3143 n = cc->cur + 1; /* how many we know we matched */
3280af22 3144 PL_reginput = locinput;
a0d0e21e 3145
c277df42 3146 DEBUG_r(
9041c2e3 3147 PerlIO_printf(Perl_debug_log,
91f3b821 3148 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3149 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3150 (long)n, (long)cc->min,
2797576d 3151 (long)cc->max, PTR2UV(cc))
c277df42 3152 );
4633a7c4 3153
a0d0e21e
LW
3154 /* If degenerate scan matches "", assume scan done. */
3155
579cf2c3 3156 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3157 PL_regcc = cc->oldcc;
2ab05381
IZ
3158 if (PL_regcc)
3159 ln = PL_regcc->cur;
c277df42 3160 DEBUG_r(
c3464db5
DD
3161 PerlIO_printf(Perl_debug_log,
3162 "%*s empty match detected, try continuation...\n",
3280af22 3163 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3164 );
a0d0e21e 3165 if (regmatch(cc->next))
4633a7c4 3166 sayYES;
2ab05381
IZ
3167 if (PL_regcc)
3168 PL_regcc->cur = ln;
3280af22 3169 PL_regcc = cc;
4633a7c4 3170 sayNO;
a0d0e21e
LW
3171 }
3172
3173 /* First just match a string of min scans. */
3174
3175 if (n < cc->min) {
3176 cc->cur = n;
3177 cc->lastloc = locinput;
4633a7c4
LW
3178 if (regmatch(cc->scan))
3179 sayYES;
3180 cc->cur = n - 1;
c277df42 3181 cc->lastloc = lastloc;
4633a7c4 3182 sayNO;
a0d0e21e
LW
3183 }
3184
2c2d71f5
JH
3185 if (scan->flags) {
3186 /* Check whether we already were at this position.
3187 Postpone detection until we know the match is not
3188 *that* much linear. */
3189 if (!PL_reg_maxiter) {
3190 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3191 PL_reg_leftiter = PL_reg_maxiter;
3192 }
3193 if (PL_reg_leftiter-- == 0) {
3194 I32 size = (PL_reg_maxiter + 7)/8;
3195 if (PL_reg_poscache) {
eb160463 3196 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3197 Renew(PL_reg_poscache, size, char);
3198 PL_reg_poscache_size = size;
3199 }
3200 Zero(PL_reg_poscache, size, char);
3201 }
3202 else {
3203 PL_reg_poscache_size = size;
3204 Newz(29, PL_reg_poscache, size, char);
3205 }
3206 DEBUG_r(
3207 PerlIO_printf(Perl_debug_log,
3208 "%sDetected a super-linear match, switching on caching%s...\n",
3209 PL_colors[4], PL_colors[5])
3210 );
3211 }
3212 if (PL_reg_leftiter < 0) {
3213 I32 o = locinput - PL_bostr, b;
3214
3215 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3216 b = o % 8;
3217 o /= 8;
3218 if (PL_reg_poscache[o] & (1<<b)) {
3219 DEBUG_r(
3220 PerlIO_printf(Perl_debug_log,
3221 "%*s already tried at this position...\n",
3222 REPORT_CODE_OFF+PL_regindent*2, "")
3223 );
c2b0868c
HS
3224 if (PL_reg_flags & RF_false)
3225 sayYES;
3226 else
3227 sayNO_SILENT;
2c2d71f5
JH
3228 }
3229 PL_reg_poscache[o] |= (1<<b);
3230 }
3231 }
3232
a0d0e21e
LW
3233 /* Prefer next over scan for minimal matching. */
3234
3235 if (cc->minmod) {
3280af22 3236 PL_regcc = cc->oldcc;
2ab05381
IZ
3237 if (PL_regcc)
3238 ln = PL_regcc->cur;
5f05dabc 3239 cp = regcppush(cc->parenfloor);
02db2b7b 3240 REGCP_SET(lastcp);
5f05dabc 3241 if (regmatch(cc->next)) {
c277df42 3242 regcpblow(cp);
4633a7c4 3243 sayYES; /* All done. */
5f05dabc 3244 }
02db2b7b 3245 REGCP_UNWIND(lastcp);
5f05dabc 3246 regcppop();
2ab05381
IZ
3247 if (PL_regcc)
3248 PL_regcc->cur = ln;
3280af22 3249 PL_regcc = cc;
a0d0e21e 3250
c277df42 3251 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3252 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3253 && !(PL_reg_flags & RF_warned)) {
3254 PL_reg_flags |= RF_warned;
9014280d 3255 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3256 "Complex regular subexpression recursion",
3257 REG_INFTY - 1);
c277df42 3258 }
4633a7c4 3259 sayNO;
c277df42 3260 }
a687059c 3261
c277df42 3262 DEBUG_r(
c3464db5
DD
3263 PerlIO_printf(Perl_debug_log,
3264 "%*s trying longer...\n",
3280af22 3265 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3266 );
a0d0e21e 3267 /* Try scanning more and see if it helps. */
3280af22 3268 PL_reginput = locinput;
a0d0e21e
LW
3269 cc->cur = n;
3270 cc->lastloc = locinput;
5f05dabc 3271 cp = regcppush(cc->parenfloor);
02db2b7b 3272 REGCP_SET(lastcp);
5f05dabc 3273 if (regmatch(cc->scan)) {
c277df42 3274 regcpblow(cp);
4633a7c4 3275 sayYES;
5f05dabc 3276 }
02db2b7b 3277 REGCP_UNWIND(lastcp);
5f05dabc 3278 regcppop();
4633a7c4 3279 cc->cur = n - 1;
c277df42 3280 cc->lastloc = lastloc;
4633a7c4 3281 sayNO;
a0d0e21e
LW
3282 }
3283
3284 /* Prefer scan over next for maximal matching. */
3285
3286 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3287 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3288 cc->cur = n;
3289 cc->lastloc = locinput;
02db2b7b 3290 REGCP_SET(lastcp);
5f05dabc 3291 if (regmatch(cc->scan)) {
c277df42 3292 regcpblow(cp);
4633a7c4 3293 sayYES;
5f05dabc 3294 }
02db2b7b 3295 REGCP_UNWIND(lastcp);
a0d0e21e 3296 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3297 PL_reginput = locinput;
c277df42 3298 DEBUG_r(
c3464db5
DD
3299 PerlIO_printf(Perl_debug_log,
3300 "%*s failed, try continuation...\n",
3280af22 3301 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3302 );
3303 }
9041c2e3 3304 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3305 && !(PL_reg_flags & RF_warned)) {
3280af22 3306 PL_reg_flags |= RF_warned;
9014280d 3307 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3308 "Complex regular subexpression recursion",
3309 REG_INFTY - 1);
a0d0e21e
LW
3310 }
3311
3312 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3313 PL_regcc = cc->oldcc;
2ab05381
IZ
3314 if (PL_regcc)
3315 ln = PL_regcc->cur;
a0d0e21e 3316 if (regmatch(cc->next))
4633a7c4 3317 sayYES;
2ab05381
IZ
3318 if (PL_regcc)
3319 PL_regcc->cur = ln;
3280af22 3320 PL_regcc = cc;
4633a7c4 3321 cc->cur = n - 1;
c277df42 3322 cc->lastloc = lastloc;
4633a7c4 3323 sayNO;
a0d0e21e
LW
3324 }
3325 /* NOT REACHED */
9041c2e3 3326 case BRANCHJ:
c277df42
IZ
3327 next = scan + ARG(scan);
3328 if (next == scan)
3329 next = NULL;
3330 inner = NEXTOPER(NEXTOPER(scan));
3331 goto do_branch;
9041c2e3 3332 case BRANCH:
c277df42
IZ
3333 inner = NEXTOPER(scan);
3334 do_branch:
3335 {
c277df42
IZ
3336 c1 = OP(scan);
3337 if (OP(next) != c1) /* No choice. */
3338 next = inner; /* Avoid recursion. */
a0d0e21e 3339 else {
02db2b7b
IZ
3340 I32 lastparen = *PL_reglastparen;
3341 I32 unwind1;
3342 re_unwind_branch_t *uw;
3343
3344 /* Put unwinding data on stack */
3345 unwind1 = SSNEWt(1,re_unwind_branch_t);
3346 uw = SSPTRt(unwind1,re_unwind_branch_t);
3347 uw->prev = unwind;
3348 unwind = unwind1;
3349 uw->type = ((c1 == BRANCH)
3350 ? RE_UNWIND_BRANCH
3351 : RE_UNWIND_BRANCHJ);
3352 uw->lastparen = lastparen;
3353 uw->next = next;
3354 uw->locinput = locinput;
3355 uw->nextchr = nextchr;
3356#ifdef DEBUGGING
3357 uw->regindent = ++PL_regindent;
3358#endif
c277df42 3359
02db2b7b
IZ
3360 REGCP_SET(uw->lastcp);
3361
3362 /* Now go into the first branch */
3363 next = inner;
a687059c 3364 }
a0d0e21e
LW
3365 }
3366 break;
3367 case MINMOD:
3368 minmod = 1;
3369 break;
c277df42
IZ
3370 case CURLYM:
3371 {
00db4c45 3372 I32 l = 0;
c277df42 3373 CHECKPOINT lastcp;
9041c2e3 3374
c277df42 3375 /* We suppose that the next guy does not need
0e788c72 3376 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3377 and has no parenths to influence future backrefs. */
3378 ln = ARG1(scan); /* min to match */
3379 n = ARG2(scan); /* max to match */
c277df42
IZ
3380 paren = scan->flags;
3381 if (paren) {
3280af22
NIS
3382 if (paren > PL_regsize)
3383 PL_regsize = paren;
eb160463 3384 if (paren > (I32)*PL_reglastparen)
3280af22 3385 *PL_reglastparen = paren;
c277df42 3386 }
dc45a647 3387 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3388 if (paren)
3389 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3390 PL_reginput = locinput;
c277df42
IZ
3391 if (minmod) {
3392 minmod = 0;
3393 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3394 sayNO;
3280af22 3395 locinput = PL_reginput;
cca55fe3 3396 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3397 regnode *text_node = next;
3398
cca55fe3 3399 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3400
cca55fe3 3401 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3402 else {
cca55fe3 3403 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3404 c1 = c2 = -1000;
3405 goto assume_ok_MM;
cca55fe3
JP
3406 }
3407 else { c1 = (U8)*STRING(text_node); }
af5decee 3408 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3409 c2 = PL_fold[c1];
af5decee 3410 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3411 c2 = PL_fold_locale[c1];
3412 else
3413 c2 = c1;
3414 }
a0ed51b3
LW
3415 }
3416 else
c277df42 3417 c1 = c2 = -1000;
cca55fe3 3418 assume_ok_MM:
02db2b7b 3419 REGCP_SET(lastcp);
0e788c72 3420 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3421 /* If it could work, try it. */
3422 if (c1 == -1000 ||
3280af22
NIS
3423 UCHARAT(PL_reginput) == c1 ||
3424 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3425 {
3426 if (paren) {
f31a99c8 3427 if (ln) {
cf93c79d
IZ
3428 PL_regstartp[paren] =
3429 HOPc(PL_reginput, -l) - PL_bostr;
3430 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3431 }
3432 else
cf93c79d 3433 PL_regendp[paren] = -1;
c277df42
IZ
3434 }
3435 if (regmatch(next))
3436 sayYES;
02db2b7b 3437 REGCP_UNWIND(lastcp);
c277df42
IZ
3438 }
3439 /* Couldn't or didn't -- move forward. */
3280af22 3440 PL_reginput = locinput;
c277df42
IZ
3441 if (regrepeat_hard(scan, 1, &l)) {
3442 ln++;
3280af22 3443 locinput = PL_reginput;
c277df42
IZ
3444 }
3445 else
3446 sayNO;
3447 }
a0ed51b3
LW
3448 }
3449 else {
c277df42 3450 n = regrepeat_hard(scan, n, &l);
3280af22 3451 locinput = PL_reginput;
c277df42 3452 DEBUG_r(
5c0ca799 3453 PerlIO_printf(Perl_debug_log,
faccc32b 3454 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3455 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3456 (IV) n, (IV)l)
c277df42
IZ
3457 );
3458 if (n >= ln) {
cca55fe3 3459 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3460 regnode *text_node = next;
3461
cca55fe3 3462 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3463
cca55fe3 3464 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3465 else {
cca55fe3 3466 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3467 c1 = c2 = -1000;
3468 goto assume_ok_REG;
cca55fe3
JP
3469 }
3470 else { c1 = (U8)*STRING(text_node); }
3471
af5decee 3472 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3473 c2 = PL_fold[c1];
af5decee 3474 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3475 c2 = PL_fold_locale[c1];
3476 else
3477 c2 = c1;
3478 }
a0ed51b3
LW
3479 }
3480 else
c277df42
IZ
3481 c1 = c2 = -1000;
3482 }
cca55fe3 3483 assume_ok_REG:
02db2b7b 3484 REGCP_SET(lastcp);
c277df42
IZ
3485 while (n >= ln) {
3486 /* If it could work, try it. */
3487 if (c1 == -1000 ||
3280af22
NIS
3488 UCHARAT(PL_reginput) == c1 ||
3489 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3490 {
3491 DEBUG_r(
c3464db5 3492 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3493 "%*s trying tail with n=%"IVdf"...\n",
3494 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3495 );
3496 if (paren) {
3497 if (n) {
cf93c79d
IZ
3498 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3499 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3500 }
a0ed51b3 3501 else
cf93c79d 3502 PL_regendp[paren] = -1;
c277df42 3503 }
a0ed51b3
LW
3504 if (regmatch(next))
3505 sayYES;
02db2b7b 3506 REGCP_UNWIND(lastcp);
a0ed51b3 3507 }
c277df42
IZ
3508 /* Couldn't or didn't -- back up. */
3509 n--;
dfe13c55 3510 locinput = HOPc(locinput, -l);
3280af22 3511 PL_reginput = locinput;
c277df42
IZ
3512 }
3513 }
3514 sayNO;
3515 break;
3516 }
3517 case CURLYN:
3518 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3519 if (paren > PL_regsize)
3520 PL_regsize = paren;
eb160463 3521 if (paren > (I32)*PL_reglastparen)
3280af22 3522 *PL_reglastparen = paren;
c277df42
IZ
3523 ln = ARG1(scan); /* min to match */
3524 n = ARG2(scan); /* max to match */
dc45a647 3525 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3526 goto repeat;
a0d0e21e 3527 case CURLY:
c277df42 3528 paren = 0;
a0d0e21e
LW
3529 ln = ARG1(scan); /* min to match */
3530 n = ARG2(scan); /* max to match */
dc45a647 3531 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3532 goto repeat;
3533 case STAR:
3534 ln = 0;
c277df42 3535 n = REG_INFTY;
a0d0e21e 3536 scan = NEXTOPER(scan);
c277df42 3537 paren = 0;
a0d0e21e
LW
3538 goto repeat;
3539 case PLUS:
c277df42
IZ
3540 ln = 1;
3541 n = REG_INFTY;
3542 scan = NEXTOPER(scan);
3543 paren = 0;
3544 repeat:
a0d0e21e
LW
3545 /*
3546 * Lookahead to avoid useless match attempts
3547 * when we know what character comes next.
3548 */
5f80c4cf
JP
3549
3550 /*
3551 * Used to only do .*x and .*?x, but now it allows
3552 * for )'s, ('s and (?{ ... })'s to be in the way
3553 * of the quantifier and the EXACT-like node. -- japhy
3554 */
3555
cca55fe3 3556 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3557 U8 *s;
3558 regnode *text_node = next;
3559
cca55fe3 3560 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3561
cca55fe3 3562 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3563 else {
cca55fe3 3564 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3565 c1 = c2 = -1000;
3566 goto assume_ok_easy;
cca55fe3
JP
3567 }
3568 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3569
3570 if (!UTF) {
3571 c2 = c1 = *s;
f65d3ee7 3572 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3573 c2 = PL_fold[c1];
f65d3ee7 3574 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3575 c2 = PL_fold_locale[c1];
1aa99e6b 3576 }
5f80c4cf 3577 else { /* UTF */
f65d3ee7 3578 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3579 STRLEN ulen1, ulen2;
e7ae6809
JH
3580 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3581 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3582
3583 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3584 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3585
872c91ae
JH
3586 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3587 ckWARN(WARN_UTF8) ?
3588 0 : UTF8_ALLOW_ANY);
3589 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3590 ckWARN(WARN_UTF8) ?
3591 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3592 }
3593 else {
872c91ae
JH
3594 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3595 ckWARN(WARN_UTF8) ?
3596 0 : UTF8_ALLOW_ANY);
5f80c4cf 3597 }
1aa99e6b
IH
3598 }
3599 }
bbce6d69 3600 }
a0d0e21e 3601 else
bbce6d69 3602 c1 = c2 = -1000;
cca55fe3 3603 assume_ok_easy:
3280af22 3604 PL_reginput = locinput;
a0d0e21e 3605 if (minmod) {
c277df42 3606 CHECKPOINT lastcp;
a0d0e21e
LW
3607 minmod = 0;
3608 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3609 sayNO;
a0ed51b3 3610 locinput = PL_reginput;
02db2b7b 3611 REGCP_SET(lastcp);
0fe9bf95 3612 if (c1 != -1000) {
1aa99e6b 3613 char *e; /* Should not check after this */
0fe9bf95 3614 char *old = locinput;
b2f2f093 3615 int count = 0;
0fe9bf95 3616
1aa99e6b 3617 if (n == REG_INFTY) {
0fe9bf95 3618 e = PL_regeol - 1;
1aa99e6b
IH
3619 if (do_utf8)
3620 while (UTF8_IS_CONTINUATION(*(U8*)e))
3621 e--;
3622 }
3623 else if (do_utf8) {
3624 int m = n - ln;
3625 for (e = locinput;
3626 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3627 e += UTF8SKIP(e);
3628 }
3629 else {
3630 e = locinput + n - ln;
3631 if (e >= PL_regeol)
3632 e = PL_regeol - 1;
3633 }
0fe9bf95
IZ
3634 while (1) {
3635 /* Find place 'next' could work */
1aa99e6b
IH
3636 if (!do_utf8) {
3637 if (c1 == c2) {
a8e8ab15
JH
3638 while (locinput <= e &&
3639 UCHARAT(locinput) != c1)
1aa99e6b
IH
3640 locinput++;
3641 } else {
9041c2e3 3642 while (locinput <= e
a8e8ab15
JH
3643 && UCHARAT(locinput) != c1
3644 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3645 locinput++;
3646 }
3647 count = locinput - old;
3648 }
3649 else {
3650 STRLEN len;
3651 if (c1 == c2) {
872c91ae
JH
3652 /* count initialised to
3653 * utf8_distance(old, locinput) */
b2f2f093 3654 while (locinput <= e &&
872c91ae
JH
3655 utf8n_to_uvchr((U8*)locinput,
3656 UTF8_MAXLEN, &len,
3657 ckWARN(WARN_UTF8) ?
eb160463 3658 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3659 locinput += len;
b2f2f093
JH
3660 count++;
3661 }
1aa99e6b 3662 } else {
872c91ae
JH
3663 /* count initialised to
3664 * utf8_distance(old, locinput) */
b2f2f093 3665 while (locinput <= e) {
872c91ae
JH
3666 UV c = utf8n_to_uvchr((U8*)locinput,
3667 UTF8_MAXLEN, &len,
3668 ckWARN(WARN_UTF8) ?
3669 0 : UTF8_ALLOW_ANY);
eb160463 3670 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3671 break;
b2f2f093
JH
3672 locinput += len;
3673 count++;
1aa99e6b
IH
3674 }
3675 }
0fe9bf95 3676 }
9041c2e3 3677 if (locinput > e)
0fe9bf95
IZ
3678 sayNO;
3679 /* PL_reginput == old now */
3680 if (locinput != old) {
3681 ln = 1; /* Did some */
1aa99e6b 3682 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3683 sayNO;
3684 }
3685 /* PL_reginput == locinput now */
29d1e993 3686 TRYPAREN(paren, ln, locinput);
0fe9bf95 3687 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3688 REGCP_UNWIND(lastcp);
0fe9bf95 3689 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3690 old = locinput;
3691 if (do_utf8)
3692 locinput += UTF8SKIP(locinput);
3693 else
3694 locinput++;
b2f2f093 3695 count = 1;
0fe9bf95
IZ
3696 }
3697 }
3698 else
c277df42 3699 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3700 UV c;
3701 if (c1 != -1000) {
3702 if (do_utf8)
872c91ae
JH
3703 c = utf8n_to_uvchr((U8*)PL_reginput,
3704 UTF8_MAXLEN, 0,
3705 ckWARN(WARN_UTF8) ?
3706 0 : UTF8_ALLOW_ANY);
1aa99e6b 3707 else
9041c2e3 3708 c = UCHARAT(PL_reginput);
2390ecbc 3709 /* If it could work, try it. */
eb160463 3710 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3711 {
ecc99935 3712 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3713 REGCP_UNWIND(lastcp);
3714 }
1aa99e6b 3715 }
a0d0e21e 3716 /* If it could work, try it. */
2390ecbc 3717 else if (c1 == -1000)
bbce6d69 3718 {
ecc99935 3719 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3720 REGCP_UNWIND(lastcp);
bbce6d69 3721 }
c277df42 3722 /* Couldn't or didn't -- move forward. */
a0ed51b3 3723 PL_reginput = locinput;
a0d0e21e
LW
3724 if (regrepeat(scan, 1)) {
3725 ln++;
a0ed51b3
LW
3726 locinput = PL_reginput;
3727 }
3728 else
4633a7c4 3729 sayNO;
a0d0e21e
LW
3730 }
3731 }
3732 else {
c277df42 3733 CHECKPOINT lastcp;
a0d0e21e 3734 n = regrepeat(scan, n);
a0ed51b3 3735 locinput = PL_reginput;
22c35a8c 3736 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3737 ((!PL_multiline && OP(next) != MEOL) ||
3738 OP(next) == SEOL || OP(next) == EOS))
3739 {
a0d0e21e 3740 ln = n; /* why back off? */
1aeab75a
GS
3741 /* ...because $ and \Z can match before *and* after
3742 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3743 We should back off by one in this case. */
3744 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3745 ln--;
3746 }
02db2b7b 3747 REGCP_SET(lastcp);
c277df42 3748 if (paren) {
8fa7f367 3749 UV c = 0;
c277df42 3750 while (n >= ln) {
1aa99e6b
IH
3751 if (c1 != -1000) {
3752 if (do_utf8)
872c91ae
JH
3753 c = utf8n_to_uvchr((U8*)PL_reginput,
3754 UTF8_MAXLEN, 0,
3755 ckWARN(WARN_UTF8) ?
3756 0 : UTF8_ALLOW_ANY);
1aa99e6b 3757 else
9041c2e3 3758 c = UCHARAT(PL_reginput);
1aa99e6b 3759 }
c277df42 3760 /* If it could work, try it. */
eb160463 3761 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3762 {
29d1e993 3763 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3764 REGCP_UNWIND(lastcp);
c277df42
IZ
3765 }
3766 /* Couldn't or didn't -- back up. */
3767 n--;
dfe13c55 3768 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3769 }
a0ed51b3
LW
3770 }
3771 else {
8fa7f367 3772 UV c = 0;
c277df42 3773 while (n >= ln) {
1aa99e6b
IH
3774 if (c1 != -1000) {
3775 if (do_utf8)
872c91ae
JH
3776 c = utf8n_to_uvchr((U8*)PL_reginput,
3777 UTF8_MAXLEN, 0,
3778 ckWARN(WARN_UTF8) ?
3779 0 : UTF8_ALLOW_ANY);
1aa99e6b 3780 else
9041c2e3 3781 c = UCHARAT(PL_reginput);
1aa99e6b 3782 }
c277df42 3783 /* If it could work, try it. */
eb160463 3784 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3785 {
29d1e993 3786 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3787 REGCP_UNWIND(lastcp);
c277df42
IZ
3788 }
3789 /* Couldn't or didn't -- back up. */
3790 n--;
dfe13c55 3791 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3792 }
a0d0e21e
LW
3793 }
3794 }
4633a7c4 3795 sayNO;
c277df42 3796 break;
a0d0e21e 3797 case END:
0f5d15d6
IZ
3798 if (PL_reg_call_cc) {
3799 re_cc_state *cur_call_cc = PL_reg_call_cc;
3800 CURCUR *cctmp = PL_regcc;
3801 regexp *re = PL_reg_re;
3802 CHECKPOINT cp, lastcp;
3803
3804 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3805 REGCP_SET(lastcp);
0f5d15d6
IZ
3806 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3807 the caller. */
3808 PL_reginput = locinput; /* Make position available to
3809 the callcc. */
3810 cache_re(PL_reg_call_cc->re);
3811 PL_regcc = PL_reg_call_cc->cc;
3812 PL_reg_call_cc = PL_reg_call_cc->prev;
3813 if (regmatch(cur_call_cc->node)) {
3814 PL_reg_call_cc = cur_call_cc;
3815 regcpblow(cp);
3816 sayYES;
3817 }
02db2b7b 3818 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3819 regcppop();
3820 PL_reg_call_cc = cur_call_cc;
3821 PL_regcc = cctmp;
3822 PL_reg_re = re;
3823 cache_re(re);
3824
3825 DEBUG_r(
3826 PerlIO_printf(Perl_debug_log,
3827 "%*s continuation failed...\n",
3828 REPORT_CODE_OFF+PL_regindent*2, "")
3829 );
7821416a 3830 sayNO_SILENT;
0f5d15d6 3831 }
7821416a
IZ
3832 if (locinput < PL_regtill) {
3833 DEBUG_r(PerlIO_printf(Perl_debug_log,
3834 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3835 PL_colors[4],
3836 (long)(locinput - PL_reg_starttry),
3837 (long)(PL_regtill - PL_reg_starttry),
3838 PL_colors[5]));
3839 sayNO_FINAL; /* Cannot match: too short. */
3840 }
3841 PL_reginput = locinput; /* put where regtry can find it */
3842 sayYES_FINAL; /* Success! */
7e5428c5 3843 case SUCCEED:
3280af22 3844 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3845 sayYES_LOUD; /* Success! */
c277df42
IZ
3846 case SUSPEND:
3847 n = 1;
9fe1d20c 3848 PL_reginput = locinput;
9041c2e3 3849 goto do_ifmatch;
a0d0e21e 3850 case UNLESSM:
c277df42 3851 n = 0;
a0ed51b3 3852 if (scan->flags) {
efb30f32
HS
3853 s = HOPBACKc(locinput, scan->flags);
3854 if (!s)
3855 goto say_yes;
3856 PL_reginput = s;
a0ed51b3
LW
3857 }
3858 else
3859 PL_reginput = locinput;
c2b0868c 3860 PL_reg_flags ^= RF_false;
c277df42
IZ
3861 goto do_ifmatch;
3862 case IFMATCH:
3863 n = 1;
a0ed51b3 3864 if (scan->flags) {
efb30f32
HS
3865 s = HOPBACKc(locinput, scan->flags);
3866 if (!s)
3867 goto say_no;
3868 PL_reginput = s;
a0ed51b3
LW
3869 }
3870 else
3871 PL_reginput = locinput;
3872
c277df42 3873 do_ifmatch:
c277df42
IZ
3874 inner = NEXTOPER(NEXTOPER(scan));
3875 if (regmatch(inner) != n) {
c2b0868c
HS
3876 if (n == 0)
3877 PL_reg_flags ^= RF_false;
c277df42
IZ
3878 say_no:
3879 if (logical) {
3880 logical = 0;
3881 sw = 0;
3882 goto do_longjump;
a0ed51b3
LW
3883 }
3884 else
c277df42
IZ
3885 sayNO;
3886 }
c2b0868c
HS
3887 if (n == 0)
3888 PL_reg_flags ^= RF_false;
c277df42
IZ
3889 say_yes:
3890 if (logical) {
3891 logical = 0;
3892 sw = 1;
3893 }
fe44a5e8 3894 if (OP(scan) == SUSPEND) {
3280af22 3895 locinput = PL_reginput;
565764a8 3896 nextchr = UCHARAT(locinput);
fe44a5e8 3897 }
c277df42
IZ
3898 /* FALL THROUGH. */
3899 case LONGJMP:
3900 do_longjump:
3901 next = scan + ARG(scan);
3902 if (next == scan)
3903 next = NULL;
a0d0e21e
LW
3904 break;
3905 default:
b900a521 3906 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3907 PTR2UV(scan), OP(scan));
cea2e8a9 3908 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3909 }
02db2b7b 3910 reenter:
a0d0e21e
LW
3911 scan = next;
3912 }
a687059c 3913
a0d0e21e
LW
3914 /*
3915 * We get here only if there's trouble -- normally "case END" is
3916 * the terminating point.
3917 */
cea2e8a9 3918 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3919 /*NOTREACHED*/
4633a7c4
LW
3920 sayNO;
3921
7821416a
IZ
3922yes_loud:
3923 DEBUG_r(
3924 PerlIO_printf(Perl_debug_log,
3925 "%*s %scould match...%s\n",
3926 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3927 );
3928 goto yes;
3929yes_final:
3930 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3931 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3932yes:
3933#ifdef DEBUGGING
3280af22 3934 PL_regindent--;
4633a7c4 3935#endif
02db2b7b
IZ
3936
3937#if 0 /* Breaks $^R */
3938 if (unwind)
3939 regcpblow(firstcp);
3940#endif
4633a7c4
LW
3941 return 1;
3942
3943no:
7821416a
IZ
3944 DEBUG_r(
3945 PerlIO_printf(Perl_debug_log,
3946 "%*s %sfailed...%s\n",
3947 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3948 );
3949 goto do_no;
3950no_final:
3951do_no:
02db2b7b
IZ
3952 if (unwind) {
3953 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3954
3955 switch (uw->type) {
3956 case RE_UNWIND_BRANCH:
3957 case RE_UNWIND_BRANCHJ:
3958 {
3959 re_unwind_branch_t *uwb = &(uw->branch);
3960 I32 lastparen = uwb->lastparen;
9041c2e3 3961
02db2b7b
IZ
3962 REGCP_UNWIND(uwb->lastcp);
3963 for (n = *PL_reglastparen; n > lastparen; n--)
3964 PL_regendp[n] = -1;
3965 *PL_reglastparen = n;
3966 scan = next = uwb->next;
9041c2e3
NIS
3967 if ( !scan ||
3968 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3969 ? BRANCH : BRANCHJ) ) { /* Failure */
3970 unwind = uwb->prev;
3971#ifdef DEBUGGING
3972 PL_regindent--;
3973#endif
3974 goto do_no;
3975 }
3976 /* Have more choice yet. Reuse the same uwb. */
3977 /*SUPPRESS 560*/
3978 if ((n = (uwb->type == RE_UNWIND_BRANCH
3979 ? NEXT_OFF(next) : ARG(next))))
3980 next += n;
3981 else
3982 next = NULL; /* XXXX Needn't unwinding in this case... */
3983 uwb->next = next;
3984 next = NEXTOPER(scan);
3985 if (uwb->type == RE_UNWIND_BRANCHJ)
3986 next = NEXTOPER(next);
3987 locinput = uwb->locinput;
3988 nextchr = uwb->nextchr;
3989#ifdef DEBUGGING
3990 PL_regindent = uwb->regindent;
3991#endif
3992
3993 goto reenter;
3994 }
3995 /* NOT REACHED */
3996 default:
3997 Perl_croak(aTHX_ "regexp unwind memory corruption");
3998 }
3999 /* NOT REACHED */
4000 }
4633a7c4 4001#ifdef DEBUGGING
3280af22 4002 PL_regindent--;
4633a7c4 4003#endif
a0d0e21e 4004 return 0;
a687059c
LW
4005}
4006
4007/*
4008 - regrepeat - repeatedly match something simple, report how many
4009 */
4010/*
4011 * [This routine now assumes that it will only match on things of length 1.
4012 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4013 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4014 */
76e3520e 4015STATIC I32
cea2e8a9 4016S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4017{
a0d0e21e 4018 register char *scan;
a0d0e21e 4019 register I32 c;
3280af22 4020 register char *loceol = PL_regeol;
a0ed51b3 4021 register I32 hardcount = 0;
53c4c00c 4022 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4023
3280af22 4024 scan = PL_reginput;
faf11cac
HS
4025 if (max == REG_INFTY)
4026 max = I32_MAX;
4027 else if (max < loceol - scan)
a0d0e21e 4028 loceol = scan + max;
a0d0e21e 4029 switch (OP(p)) {
22c35a8c 4030 case REG_ANY:
1aa99e6b 4031 if (do_utf8) {
ffc61ed2 4032 loceol = PL_regeol;
1aa99e6b 4033 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4034 scan += UTF8SKIP(scan);
4035 hardcount++;
4036 }
4037 } else {
4038 while (scan < loceol && *scan != '\n')
4039 scan++;
a0ed51b3
LW
4040 }
4041 break;
ffc61ed2 4042 case SANY:
def8e4ea
JH
4043 if (do_utf8) {
4044 loceol = PL_regeol;
a0804c9e 4045 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4046 scan += UTF8SKIP(scan);
4047 hardcount++;
4048 }
4049 }
4050 else
4051 scan = loceol;
a0ed51b3 4052 break;
f33976b4
DB
4053 case CANY:
4054 scan = loceol;
4055 break;
090f7165
JH
4056 case EXACT: /* length of string is 1 */
4057 c = (U8)*STRING(p);
4058 while (scan < loceol && UCHARAT(scan) == c)
4059 scan++;
bbce6d69 4060 break;
4061 case EXACTF: /* length of string is 1 */
cd439c50 4062 c = (U8)*STRING(p);
bbce6d69 4063 while (scan < loceol &&
22c35a8c 4064 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4065 scan++;
4066 break;
4067 case EXACTFL: /* length of string is 1 */
3280af22 4068 PL_reg_flags |= RF_tainted;
cd439c50 4069 c = (U8)*STRING(p);
bbce6d69 4070 while (scan < loceol &&
22c35a8c 4071 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4072 scan++;
4073 break;
4074 case ANYOF:
ffc61ed2
JH
4075 if (do_utf8) {
4076 loceol = PL_regeol;
cfc92286
JH
4077 while (hardcount < max && scan < loceol &&
4078 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4079 scan += UTF8SKIP(scan);
4080 hardcount++;
4081 }
4082 } else {
7d3e948e 4083 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4084 scan++;
4085 }
a0d0e21e
LW
4086 break;
4087 case ALNUM:
1aa99e6b 4088 if (do_utf8) {
ffc61ed2 4089 loceol = PL_regeol;
8269fa76 4090 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4091 while (hardcount < max && scan < loceol &&
3568d838 4092 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4093 scan += UTF8SKIP(scan);
4094 hardcount++;
4095 }
4096 } else {
4097 while (scan < loceol && isALNUM(*scan))
4098 scan++;
a0ed51b3
LW
4099 }
4100 break;
bbce6d69 4101 case ALNUML:
3280af22 4102 PL_reg_flags |= RF_tainted;
1aa99e6b 4103 if (do_utf8) {
ffc61ed2 4104 loceol = PL_regeol;
1aa99e6b
IH
4105 while (hardcount < max && scan < loceol &&
4106 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4107 scan += UTF8SKIP(scan);
4108 hardcount++;
4109 }
4110 } else {
4111 while (scan < loceol && isALNUM_LC(*scan))
4112 scan++;
a0ed51b3
LW
4113 }
4114 break;
a0d0e21e 4115 case NALNUM:
1aa99e6b 4116 if (do_utf8) {
ffc61ed2 4117 loceol = PL_regeol;
8269fa76 4118 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4119 while (hardcount < max && scan < loceol &&
3568d838 4120 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4121 scan += UTF8SKIP(scan);
4122 hardcount++;
4123 }
4124 } else {
4125 while (scan < loceol && !isALNUM(*scan))
4126 scan++;
a0ed51b3
LW
4127 }
4128 break;
bbce6d69 4129 case NALNUML:
3280af22 4130 PL_reg_flags |= RF_tainted;
1aa99e6b 4131 if (do_utf8) {
ffc61ed2 4132 loceol = PL_regeol;
1aa99e6b
IH
4133 while (hardcount < max && scan < loceol &&
4134 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4135 scan += UTF8SKIP(scan);
4136 hardcount++;
4137 }
4138 } else {
4139 while (scan < loceol && !isALNUM_LC(*scan))
4140 scan++;
a0ed51b3
LW
4141 }
4142 break;
a0d0e21e 4143 case SPACE:
1aa99e6b 4144 if (do_utf8) {
ffc61ed2 4145 loceol = PL_regeol;
8269fa76 4146 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4147 while (hardcount < max && scan < loceol &&
3568d838
JH
4148 (*scan == ' ' ||
4149 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4150 scan += UTF8SKIP(scan);
4151 hardcount++;
4152 }
4153 } else {
4154 while (scan < loceol && isSPACE(*scan))
4155 scan++;
a0ed51b3
LW
4156 }
4157 break;
bbce6d69 4158 case SPACEL:
3280af22 4159 PL_reg_flags |= RF_tainted;
1aa99e6b 4160 if (do_utf8) {
ffc61ed2 4161 loceol = PL_regeol;
1aa99e6b 4162 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4163 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4164 scan += UTF8SKIP(scan);
4165 hardcount++;
4166 }
4167 } else {
4168 while (scan < loceol && isSPACE_LC(*scan))
4169 scan++;
a0ed51b3
LW
4170 }
4171 break;
a0d0e21e 4172 case NSPACE:
1aa99e6b 4173 if (do_utf8) {
ffc61ed2 4174 loceol = PL_regeol;
8269fa76 4175 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4176 while (hardcount < max && scan < loceol &&
3568d838
JH
4177 !(*scan == ' ' ||
4178 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4179 scan += UTF8SKIP(scan);
4180 hardcount++;
4181 }
4182 } else {
4183 while (scan < loceol && !isSPACE(*scan))
4184 scan++;
4185 break;
a0ed51b3 4186 }
bbce6d69 4187 case NSPACEL:
3280af22 4188 PL_reg_flags |= RF_tainted;
1aa99e6b 4189 if (do_utf8) {
ffc61ed2 4190 loceol = PL_regeol;
1aa99e6b 4191 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4192 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4193 scan += UTF8SKIP(scan);
4194 hardcount++;
4195 }
4196 } else {
4197 while (scan < loceol && !isSPACE_LC(*scan))
4198 scan++;
a0ed51b3
LW
4199 }
4200 break;
a0d0e21e 4201 case DIGIT:
1aa99e6b 4202 if (do_utf8) {
ffc61ed2 4203 loceol = PL_regeol;
8269fa76 4204 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4205 while (hardcount < max && scan < loceol &&
3568d838 4206 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4207 scan += UTF8SKIP(scan);
4208 hardcount++;
4209 }
4210 } else {
4211 while (scan < loceol && isDIGIT(*scan))
4212 scan++;
a0ed51b3
LW
4213 }
4214 break;
a0d0e21e 4215 case NDIGIT:
1aa99e6b 4216 if (do_utf8) {
ffc61ed2 4217 loceol = PL_regeol;
8269fa76 4218 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4219 while (hardcount < max && scan < loceol &&
3568d838 4220 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4221 scan += UTF8SKIP(scan);
4222 hardcount++;
4223 }
4224 } else {
4225 while (scan < loceol && !isDIGIT(*scan))
4226 scan++;
a0ed51b3
LW
4227 }
4228 break;
a0d0e21e
LW
4229 default: /* Called on something of 0 width. */
4230 break; /* So match right here or not at all. */
4231 }
a687059c 4232
a0ed51b3
LW
4233 if (hardcount)
4234 c = hardcount;
4235 else
4236 c = scan - PL_reginput;
3280af22 4237 PL_reginput = scan;
a687059c 4238
9041c2e3 4239 DEBUG_r(
c277df42
IZ
4240 {
4241 SV *prop = sv_newmortal();
4242
4243 regprop(prop, p);
9041c2e3
NIS
4244 PerlIO_printf(Perl_debug_log,
4245 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4246 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4247 });
9041c2e3 4248
a0d0e21e 4249 return(c);
a687059c
LW
4250}
4251
4252/*
c277df42 4253 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4254 *
0e788c72 4255 * The repeater is supposed to have constant non-zero length.
c277df42
IZ
4256 */
4257
76e3520e 4258STATIC I32
cea2e8a9 4259S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4260{
b7953727 4261 register char *scan = Nullch;
c277df42 4262 register char *start;
3280af22 4263 register char *loceol = PL_regeol;
a0ed51b3 4264 I32 l = 0;
708e3b05 4265 I32 count = 0, res = 1;
a0ed51b3
LW
4266
4267 if (!max)
4268 return 0;
c277df42 4269
3280af22 4270 start = PL_reginput;
53c4c00c 4271 if (PL_reg_match_utf8) {
708e3b05 4272 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4273 if (!count++) {
4274 l = 0;
4275 while (start < PL_reginput) {
4276 l++;
4277 start += UTF8SKIP(start);
4278 }
4279 *lp = l;
4280 if (l == 0)
4281 return max;
4282 }
4283 if (count == max)
4284 return count;
4285 }
4286 }
4287 else {
708e3b05 4288 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4289 if (!count++) {
4290 *lp = l = PL_reginput - start;
4291 if (max != REG_INFTY && l*max < loceol - scan)
4292 loceol = scan + l*max;
4293 if (l == 0)
4294 return max;
c277df42
IZ
4295 }
4296 }
4297 }
708e3b05 4298 if (!res)
3280af22 4299 PL_reginput = scan;
9041c2e3 4300
a0ed51b3 4301 return count;
c277df42
IZ
4302}
4303
4304/*
ffc61ed2
JH
4305- regclass_swash - prepare the utf8 swash
4306*/
4307
4308SV *
9e55ce06 4309Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4310{
9e55ce06
JH
4311 SV *sw = NULL;
4312 SV *si = NULL;
4313 SV *alt = NULL;
ffc61ed2
JH
4314
4315 if (PL_regdata && PL_regdata->count) {
4316 U32 n = ARG(node);
4317
4318 if (PL_regdata->what[n] == 's') {
4319 SV *rv = (SV*)PL_regdata->data[n];
4320 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4321 SV **ary = AvARRAY(av);
9e55ce06 4322 SV **a, **b;
9041c2e3 4323
9e55ce06
JH
4324 /* See the end of regcomp.c:S_reglass() for
4325 * documentation of these array elements. */
4326
b11f357e
JH
4327 si = *ary;
4328 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4329 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4330
ffc61ed2
JH
4331 if (a)
4332 sw = *a;
4333 else if (si && doinit) {
4334 sw = swash_init("utf8", "", si, 1, 0);
4335 (void)av_store(av, 1, sw);
4336 }
9e55ce06
JH
4337 if (b)
4338 alt = *b;
ffc61ed2
JH
4339 }
4340 }
4341
9e55ce06
JH
4342 if (listsvp)
4343 *listsvp = si;
4344 if (altsvp)
4345 *altsvp = alt;
ffc61ed2
JH
4346
4347 return sw;
4348}
4349
4350/*
ba7b4546 4351 - reginclass - determine if a character falls into a character class
832705d4
JH
4352
4353 The n is the ANYOF regnode, the p is the target string, lenp
4354 is pointer to the maximum length of how far to go in the p
4355 (if the lenp is zero, UTF8SKIP(p) is used),
4356 do_utf8 tells whether the target string is in UTF-8.
4357
bbce6d69 4358 */
4359
76e3520e 4360STATIC bool
ba7b4546 4361S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4362{
ffc61ed2 4363 char flags = ANYOF_FLAGS(n);
bbce6d69 4364 bool match = FALSE;
cc07378b 4365 UV c = *p;
ae9ddab8 4366 STRLEN len = 0;
9e55ce06 4367 STRLEN plen;
1aa99e6b 4368
ae9ddab8
JH
4369 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4370 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4371 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4372
0f0076b4 4373 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4374 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4375 if (lenp)
4376 *lenp = 0;
ffc61ed2 4377 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4378 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4379 match = TRUE;
bbce6d69 4380 }
3568d838 4381 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4382 match = TRUE;
ffc61ed2 4383 if (!match) {
9e55ce06
JH
4384 AV *av;
4385 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4386
4387 if (sw) {
3568d838 4388 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4389 match = TRUE;
4390 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4391 if (!match && lenp && av) {
4392 I32 i;
4393
4394 for (i = 0; i <= av_len(av); i++) {
4395 SV* sv = *av_fetch(av, i, FALSE);
4396 STRLEN len;
4397 char *s = SvPV(sv, len);
4398
061b10df 4399 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4400 *lenp = len;
4401 match = TRUE;
4402 break;
4403 }
4404 }
4405 }
4406 if (!match) {
4a623e43
JH
4407 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4408 STRLEN tmplen;
4409
9e55ce06
JH
4410 to_utf8_fold(p, tmpbuf, &tmplen);
4411 if (swash_fetch(sw, tmpbuf, do_utf8))
4412 match = TRUE;
4413 }
ffc61ed2
JH
4414 }
4415 }
bbce6d69 4416 }
9e55ce06 4417 if (match && lenp && *lenp == 0)
0f0076b4 4418 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4419 }
1aa99e6b 4420 if (!match && c < 256) {
ffc61ed2
JH
4421 if (ANYOF_BITMAP_TEST(n, c))
4422 match = TRUE;
4423 else if (flags & ANYOF_FOLD) {
eb160463 4424 U8 f;
a0ed51b3 4425
ffc61ed2
JH
4426 if (flags & ANYOF_LOCALE) {
4427 PL_reg_flags |= RF_tainted;
4428 f = PL_fold_locale[c];
4429 }
4430 else
4431 f = PL_fold[c];
4432 if (f != c && ANYOF_BITMAP_TEST(n, f))
4433 match = TRUE;
4434 }
4435
4436 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4437 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4438 if (
4439 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4440 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4441 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4442 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4443 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4444 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4445 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4446 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4447 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4448 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4449 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4450 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4451 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4452 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4453 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4454 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4455 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4456 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4457 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4458 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4459 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4460 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4469 ) /* How's that for a conditional? */
4470 {
4471 match = TRUE;
4472 }
a0ed51b3 4473 }
a0ed51b3
LW
4474 }
4475
a0ed51b3
LW
4476 return (flags & ANYOF_INVERT) ? !match : match;
4477}
161b471a 4478
dfe13c55 4479STATIC U8 *
cea2e8a9 4480S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4481{
1aa99e6b
IH
4482 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4483}
4484
4485STATIC U8 *
4486S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4487{
a0ed51b3 4488 if (off >= 0) {
1aa99e6b 4489 while (off-- && s < lim) {
ffc61ed2 4490 /* XXX could check well-formedness here */
a0ed51b3 4491 s += UTF8SKIP(s);
ffc61ed2 4492 }
a0ed51b3
LW
4493 }
4494 else {
4495 while (off++) {
1aa99e6b 4496 if (s > lim) {
a0ed51b3 4497 s--;
ffc61ed2 4498 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4499 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4500 s--;
ffc61ed2
JH
4501 }
4502 /* XXX could check well-formedness here */
a0ed51b3
LW
4503 }
4504 }
4505 }
4506 return s;
4507}
161b471a 4508
dfe13c55 4509STATIC U8 *
1aa99e6b 4510S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4511{
1aa99e6b
IH
4512 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4513}
4514
4515STATIC U8 *
4516S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4517{
4518 if (off >= 0) {
1aa99e6b 4519 while (off-- && s < lim) {
ffc61ed2 4520 /* XXX could check well-formedness here */
a0ed51b3 4521 s += UTF8SKIP(s);
ffc61ed2 4522 }
a0ed51b3
LW
4523 if (off >= 0)
4524 return 0;
4525 }
4526 else {
4527 while (off++) {
1aa99e6b 4528 if (s > lim) {
a0ed51b3 4529 s--;
ffc61ed2 4530 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4531 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4532 s--;
ffc61ed2
JH
4533 }
4534 /* XXX could check well-formedness here */
a0ed51b3
LW
4535 }
4536 else
4537 break;
4538 }
4539 if (off <= 0)
4540 return 0;
4541 }
4542 return s;
4543}
51371543 4544
51371543 4545static void
acfe0abc 4546restore_pos(pTHX_ void *arg)
51371543 4547{
51371543
GS
4548 if (PL_reg_eval_set) {
4549 if (PL_reg_oldsaved) {
4550 PL_reg_re->subbeg = PL_reg_oldsaved;
4551 PL_reg_re->sublen = PL_reg_oldsavedlen;
ed252734
NC
4552#ifdef PERL_COPY_ON_WRITE
4553 PL_reg_re->saved_copy = PL_nrs;
4554#endif
51371543
GS
4555 RX_MATCH_COPIED_on(PL_reg_re);
4556 }
4557 PL_reg_magic->mg_len = PL_reg_oldpos;
4558 PL_reg_eval_set = 0;
4559 PL_curpm = PL_reg_oldcurpm;
4560 }
4561}
33b8afdf
JH
4562
4563STATIC void
4564S_to_utf8_substr(pTHX_ register regexp *prog)
4565{
4566 SV* sv;
4567 if (prog->float_substr && !prog->float_utf8) {
4568 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4569 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4570 sv_utf8_upgrade(sv);
4571 if (SvTAIL(prog->float_substr))
4572 SvTAIL_on(sv);
4573 if (prog->float_substr == prog->check_substr)
4574 prog->check_utf8 = sv;
4575 }
4576 if (prog->anchored_substr && !prog->anchored_utf8) {
4577 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4578 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4579 sv_utf8_upgrade(sv);
4580 if (SvTAIL(prog->anchored_substr))
4581 SvTAIL_on(sv);
4582 if (prog->anchored_substr == prog->check_substr)
4583 prog->check_utf8 = sv;
4584 }
4585}
4586
4587STATIC void
4588S_to_byte_substr(pTHX_ register regexp *prog)
4589{
4590 SV* sv;
4591 if (prog->float_utf8 && !prog->float_substr) {
4592 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4593 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4594 if (sv_utf8_downgrade(sv, TRUE)) {
4595 if (SvTAIL(prog->float_utf8))
4596 SvTAIL_on(sv);
4597 } else {
4598 SvREFCNT_dec(sv);
4599 prog->float_substr = sv = &PL_sv_undef;
4600 }
4601 if (prog->float_utf8 == prog->check_utf8)
4602 prog->check_substr = sv;
4603 }
4604 if (prog->anchored_utf8 && !prog->anchored_substr) {
4605 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4606 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4607 if (sv_utf8_downgrade(sv, TRUE)) {
4608 if (SvTAIL(prog->anchored_utf8))
4609 SvTAIL_on(sv);
4610 } else {
4611 SvREFCNT_dec(sv);
4612 prog->anchored_substr = sv = &PL_sv_undef;
4613 }
4614 if (prog->anchored_utf8 == prog->check_utf8)
4615 prog->check_substr = sv;
4616 }
4617}