This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[PATCH] more MM_Unix.t tests
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
bc89e66f 70 **** Copyright (c) 1991-2001, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
90#define UTF (PL_reg_flags & RF_utf8)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
c277df42
IZ
99/*
100 * Forwards.
101 */
102
a0ed51b3 103#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 105
dfe13c55
GS
106#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
107#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
108#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
109#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
110#define HOPc(pos,off) ((char*)HOP(pos,off))
111#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 112
efb30f32 113#define HOPBACK(pos, off) ( \
e54858b0 114 (PL_reg_match_utf8) \
efb30f32
HS
115 ? reghopmaybe((U8*)pos, -off) \
116 : (pos - off >= PL_bostr) \
117 ? (U8*)(pos - off) \
118 : (U8*)NULL \
119)
120#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
121
1aa99e6b
IH
122#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
123#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
124#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
125#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
126#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
127#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
128
8269fa76 129#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
51371543 130
5f80c4cf 131/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
132#define JUMPABLE(rn) ( \
133 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
134 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
135 OP(rn) == PLUS || OP(rn) == MINMOD || \
136 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
137)
138
cca55fe3
JP
139#define HAS_TEXT(rn) ( \
140 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
141)
e2d8ce26 142
cca55fe3 143#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 144 while (JUMPABLE(rn)) \
cca55fe3
JP
145 if (OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
146 PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 147 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
148 else if (OP(rn) == PLUS) \
149 rn = NEXTOPER(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
3280af22 158 int retval = PL_savestack_ix;
b1ce53c5
JH
159#define REGCP_PAREN_ELEMS 4
160 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
161 int p;
162
e49a9654
IH
163 if (paren_elems_to_push < 0)
164 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
165
a01268b5 166#define REGCP_OTHER_ELEMS 6
b1ce53c5 167 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 168 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 169/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
170 SSPUSHINT(PL_regendp[p]);
171 SSPUSHINT(PL_regstartp[p]);
3280af22 172 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
173 SSPUSHINT(p);
174 }
b1ce53c5 175/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
176 SSPUSHINT(PL_regsize);
177 SSPUSHINT(*PL_reglastparen);
a01268b5 178 SSPUSHINT(*PL_reglastcloseparen);
3280af22 179 SSPUSHPTR(PL_reginput);
41123dfd
JH
180#define REGCP_FRAME_ELEMS 2
181/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
182 * are needed for the regexp context stack bookkeeping. */
183 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 184 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 185
a0d0e21e
LW
186 return retval;
187}
188
c277df42 189/* These are needed since we do not localize EVAL nodes: */
02db2b7b 190# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 191 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 192 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 193
02db2b7b 194# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 195 PerlIO_printf(Perl_debug_log, \
faccc32b 196 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 197 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 198
76e3520e 199STATIC char *
cea2e8a9 200S_regcppop(pTHX)
a0d0e21e 201{
b1ce53c5 202 I32 i;
a0d0e21e
LW
203 U32 paren = 0;
204 char *input;
cf93c79d 205 I32 tmps;
b1ce53c5
JH
206
207 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 208 i = SSPOPINT;
b1ce53c5
JH
209 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
210 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 211 input = (char *) SSPOPPTR;
a01268b5 212 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
213 *PL_reglastparen = SSPOPINT;
214 PL_regsize = SSPOPINT;
b1ce53c5
JH
215
216 /* Now restore the parentheses context. */
41123dfd
JH
217 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
218 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 219 paren = (U32)SSPOPINT;
3280af22 220 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
221 PL_regstartp[paren] = SSPOPINT;
222 tmps = SSPOPINT;
3280af22
NIS
223 if (paren <= *PL_reglastparen)
224 PL_regendp[paren] = tmps;
c277df42 225 DEBUG_r(
c3464db5 226 PerlIO_printf(Perl_debug_log,
b900a521 227 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 228 (UV)paren, (IV)PL_regstartp[paren],
b900a521 229 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 230 (IV)PL_regendp[paren],
3280af22 231 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 232 );
a0d0e21e 233 }
c277df42 234 DEBUG_r(
3280af22 235 if (*PL_reglastparen + 1 <= PL_regnpar) {
c3464db5 236 PerlIO_printf(Perl_debug_log,
faccc32b
JH
237 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
238 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
239 }
240 );
daf18116 241#if 1
dafc8851
JH
242 /* It would seem that the similar code in regtry()
243 * already takes care of this, and in fact it is in
244 * a better location to since this code can #if 0-ed out
245 * but the code in regtry() is needed or otherwise tests
246 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
247 * (as of patchlevel 7877) will fail. Then again,
248 * this code seems to be necessary or otherwise
249 * building DynaLoader will fail:
250 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
251 * --jhi */
3280af22
NIS
252 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
253 if (paren > PL_regsize)
cf93c79d
IZ
254 PL_regstartp[paren] = -1;
255 PL_regendp[paren] = -1;
a0d0e21e 256 }
dafc8851 257#endif
a0d0e21e
LW
258 return input;
259}
260
0f5d15d6 261STATIC char *
cea2e8a9 262S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
263{
264 I32 tmp = PL_savestack_ix;
265
266 PL_savestack_ix = ss;
267 regcppop();
268 PL_savestack_ix = tmp;
942e002e 269 return Nullch;
0f5d15d6
IZ
270}
271
272typedef struct re_cc_state
273{
274 I32 ss;
275 regnode *node;
276 struct re_cc_state *prev;
277 CURCUR *cc;
278 regexp *re;
279} re_cc_state;
280
02db2b7b 281#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 282
29d1e993
HS
283#define TRYPAREN(paren, n, input) { \
284 if (paren) { \
285 if (n) { \
286 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
287 PL_regendp[paren] = input - PL_bostr; \
288 } \
289 else \
290 PL_regendp[paren] = -1; \
291 } \
292 if (regmatch(next)) \
293 sayYES; \
294 if (paren && n) \
295 PL_regendp[paren] = -1; \
296}
297
298
a687059c 299/*
e50aee73 300 * pregexec and friends
a687059c
LW
301 */
302
303/*
c277df42 304 - pregexec - match a regexp against a string
a687059c 305 */
c277df42 306I32
864dbfa3 307Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 308 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
309/* strend: pointer to null at end of string */
310/* strbeg: real beginning of string */
311/* minend: end of match must be >=minend after stringarg. */
312/* nosave: For optimizations. */
313{
314 return
9041c2e3 315 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
316 nosave ? 0 : REXEC_COPY_STR);
317}
0f5d15d6
IZ
318
319STATIC void
cea2e8a9 320S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
321{
322 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
323#ifdef DEBUGGING
324 PL_regprogram = prog->program;
325#endif
326 PL_regnpar = prog->nparens;
9041c2e3
NIS
327 PL_regdata = prog->data;
328 PL_reg_re = prog;
0f5d15d6 329}
22e551b9 330
9041c2e3 331/*
cad2e5aa
JH
332 * Need to implement the following flags for reg_anch:
333 *
334 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
335 * USE_INTUIT_ML
336 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
337 * INTUIT_AUTORITATIVE_ML
338 * INTUIT_ONCE_NOML - Intuit can match in one location only.
339 * INTUIT_ONCE_ML
340 *
341 * Another flag for this function: SECOND_TIME (so that float substrs
342 * with giant delta may be not rechecked).
343 */
344
345/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
346
2c2d71f5 347/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
348 Otherwise, only SvCUR(sv) is used to get strbeg. */
349
350/* XXXX We assume that strpos is strbeg unless sv. */
351
6eb5f6b9
JH
352/* XXXX Some places assume that there is a fixed substring.
353 An update may be needed if optimizer marks as "INTUITable"
354 RExen without fixed substrings. Similarly, it is assumed that
355 lengths of all the strings are no more than minlen, thus they
356 cannot come from lookahead.
357 (Or minlen should take into account lookahead.) */
358
2c2d71f5
JH
359/* A failure to find a constant substring means that there is no need to make
360 an expensive call to REx engine, thus we celebrate a failure. Similarly,
361 finding a substring too deep into the string means that less calls to
30944b6d
IZ
362 regtry() should be needed.
363
364 REx compiler's optimizer found 4 possible hints:
365 a) Anchored substring;
366 b) Fixed substring;
367 c) Whether we are anchored (beginning-of-line or \G);
368 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 369 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
370 string which does not contradict any of them.
371 */
2c2d71f5 372
6eb5f6b9
JH
373/* Most of decisions we do here should have been done at compile time.
374 The nodes of the REx which we used for the search should have been
375 deleted from the finite automaton. */
376
cad2e5aa
JH
377char *
378Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
379 char *strend, U32 flags, re_scream_pos_data *data)
380{
b7953727 381 register I32 start_shift = 0;
cad2e5aa 382 /* Should be nonnegative! */
b7953727 383 register I32 end_shift = 0;
2c2d71f5
JH
384 register char *s;
385 register SV *check;
a1933d95 386 char *strbeg;
cad2e5aa
JH
387 char *t;
388 I32 ml_anch;
6eb5f6b9 389 register char *other_last = Nullch; /* other substr checked before this */
b7953727 390 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
391#ifdef DEBUGGING
392 char *i_strpos = strpos;
ce333219 393 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 394#endif
cad2e5aa 395
b8d68ded
JH
396 if (prog->reganch & ROPT_UTF8) {
397 DEBUG_r(PerlIO_printf(Perl_debug_log,
398 "UTF-8 regex...\n"));
399 PL_reg_flags |= RF_utf8;
400 }
401
2a782b5b 402 DEBUG_r({
b8d68ded 403 char *s = PL_reg_match_utf8 ?
c728cb41
JH
404 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
405 strpos;
b8d68ded
JH
406 int len = PL_reg_match_utf8 ?
407 strlen(s) : strend - strpos;
2a782b5b
JH
408 if (!PL_colorset)
409 reginitcolors();
b8d68ded
JH
410 if (PL_reg_match_utf8)
411 DEBUG_r(PerlIO_printf(Perl_debug_log,
412 "UTF-8 target...\n"));
2a782b5b
JH
413 PerlIO_printf(Perl_debug_log,
414 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
415 PL_colors[4],PL_colors[5],PL_colors[0],
416 prog->precomp,
417 PL_colors[1],
418 (strlen(prog->precomp) > 60 ? "..." : ""),
419 PL_colors[0],
420 (int)(len > 60 ? 60 : len),
421 s, PL_colors[1],
422 (len > 60 ? "..." : "")
423 );
424 });
cad2e5aa 425
1aa99e6b 426 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
a72c7584
JH
427 DEBUG_r(PerlIO_printf(Perl_debug_log,
428 "String too short... [re_intuit_start]\n"));
cad2e5aa 429 goto fail;
2c2d71f5 430 }
a1933d95 431 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 432 PL_regeol = strend;
653099ff 433 check = prog->check_substr;
2c2d71f5 434 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
435 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
436 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 437 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 438
7e25d62c
JH
439 if (!ml_anch) {
440 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
441 | ROPT_IMPLICIT)) /* not a real BOL */
442 /* SvCUR is not set on references: SvRV and SvPVX overlap */
443 && sv && !SvROK(sv)
444 && (strpos != strbeg)) {
445 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
446 goto fail;
447 }
448 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 449 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 450 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
451 I32 slen;
452
1aa99e6b 453 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
454 if (SvTAIL(check)) {
455 slen = SvCUR(check); /* >= 1 */
cad2e5aa 456
9041c2e3 457 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
458 || (strend - s == slen && strend[-1] != '\n')) {
459 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
460 goto fail_finish;
cad2e5aa
JH
461 }
462 /* Now should match s[0..slen-2] */
463 slen--;
653099ff 464 if (slen && (*SvPVX(check) != *s
cad2e5aa 465 || (slen > 1
653099ff 466 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
467 report_neq:
468 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
469 goto fail_finish;
470 }
cad2e5aa 471 }
653099ff
GS
472 else if (*SvPVX(check) != *s
473 || ((slen = SvCUR(check)) > 1
474 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
475 goto report_neq;
476 goto success_at_start;
7e25d62c 477 }
cad2e5aa 478 }
2c2d71f5 479 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 480 s = strpos;
2c2d71f5 481 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 482 end_shift = prog->minlen - start_shift -
653099ff 483 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 484 if (!ml_anch) {
653099ff
GS
485 I32 end = prog->check_offset_max + CHR_SVLEN(check)
486 - (SvTAIL(check) != 0);
1aa99e6b 487 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
488
489 if (end_shift < eshift)
490 end_shift = eshift;
491 }
cad2e5aa 492 }
2c2d71f5 493 else { /* Can match at random position */
cad2e5aa
JH
494 ml_anch = 0;
495 s = strpos;
2c2d71f5
JH
496 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
497 /* Should be nonnegative! */
498 end_shift = prog->minlen - start_shift -
653099ff 499 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
500 }
501
2c2d71f5 502#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 503 if (end_shift < 0)
6bbae5e6 504 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
505#endif
506
2c2d71f5
JH
507 restart:
508 /* Find a possible match in the region s..strend by looking for
509 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 510 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
511 I32 p = -1; /* Internal iterator of scream. */
512 I32 *pp = data ? data->scream_pos : &p;
513
2c2d71f5
JH
514 if (PL_screamfirst[BmRARE(check)] >= 0
515 || ( BmRARE(check) == '\n'
516 && (BmPREVIOUS(check) == SvCUR(check) - 1)
517 && SvTAIL(check) ))
9041c2e3 518 s = screaminstr(sv, check,
2c2d71f5 519 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 520 else
2c2d71f5 521 goto fail_finish;
cad2e5aa
JH
522 if (data)
523 *data->scream_olds = s;
524 }
f33976b4 525 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
526 s = fbm_instr((U8*)(s + start_shift),
527 (U8*)(strend - end_shift),
528 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 529 else
1aa99e6b
IH
530 s = fbm_instr(HOP3(s, start_shift, strend),
531 HOP3(strend, -end_shift, strbeg),
2c2d71f5 532 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
533
534 /* Update the count-of-usability, remove useless subpatterns,
535 unshift s. */
2c2d71f5
JH
536
537 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
538 (s ? "Found" : "Did not find"),
539 ((check == prog->anchored_substr) ? "anchored" : "floating"),
540 PL_colors[0],
7b0972df
JH
541 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
542 SvPVX(check),
2c2d71f5
JH
543 PL_colors[1], (SvTAIL(check) ? "$" : ""),
544 (s ? " at offset " : "...\n") ) );
545
546 if (!s)
547 goto fail_finish;
548
6eb5f6b9
JH
549 check_at = s;
550
2c2d71f5 551 /* Finish the diagnostic message */
30944b6d 552 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
553
554 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
555 Start with the other substr.
556 XXXX no SCREAM optimization yet - and a very coarse implementation
557 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
558 *always* match. Probably should be marked during compile...
559 Probably it is right to do no SCREAM here...
560 */
561
562 if (prog->float_substr && prog->anchored_substr) {
30944b6d 563 /* Take into account the "other" substring. */
2c2d71f5
JH
564 /* XXXX May be hopelessly wrong for UTF... */
565 if (!other_last)
6eb5f6b9 566 other_last = strpos;
2c2d71f5 567 if (check == prog->float_substr) {
30944b6d
IZ
568 do_other_anchored:
569 {
1aa99e6b 570 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5
JH
571 char *s1 = s;
572
2c2d71f5
JH
573 t = s - prog->check_offset_max;
574 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
575 && (!(prog->reganch & ROPT_UTF8)
1aa99e6b 576 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 577 && t > strpos)))
30944b6d 578 /* EMPTY */;
2c2d71f5
JH
579 else
580 t = strpos;
1aa99e6b 581 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
582 if (t < other_last) /* These positions already checked */
583 t = other_last;
1aa99e6b 584 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
585 if (last < last1)
586 last1 = last;
587 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
588 /* On end-of-str: see comment below. */
589 s = fbm_instr((unsigned char*)t,
1aa99e6b
IH
590 HOP3(HOP3(last1, prog->anchored_offset, strend)
591 + SvCUR(prog->anchored_substr),
592 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
593 prog->anchored_substr,
594 PL_multiline ? FBMrf_MULTILINE : 0);
595 DEBUG_r(PerlIO_printf(Perl_debug_log,
596 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
597 (s ? "Found" : "Contradicts"),
598 PL_colors[0],
7b0972df
JH
599 (int)(SvCUR(prog->anchored_substr)
600 - (SvTAIL(prog->anchored_substr)!=0)),
2c2d71f5
JH
601 SvPVX(prog->anchored_substr),
602 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
603 if (!s) {
604 if (last1 >= last2) {
605 DEBUG_r(PerlIO_printf(Perl_debug_log,
606 ", giving up...\n"));
607 goto fail_finish;
608 }
609 DEBUG_r(PerlIO_printf(Perl_debug_log,
610 ", trying floating at offset %ld...\n",
1aa99e6b
IH
611 (long)(HOP3c(s1, 1, strend) - i_strpos)));
612 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
613 s = HOP3c(last, 1, strend);
2c2d71f5
JH
614 goto restart;
615 }
616 else {
617 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 618 (long)(s - i_strpos)));
1aa99e6b
IH
619 t = HOP3c(s, -prog->anchored_offset, strbeg);
620 other_last = HOP3c(s, 1, strend);
30944b6d 621 s = s1;
2c2d71f5
JH
622 if (t == strpos)
623 goto try_at_start;
2c2d71f5
JH
624 goto try_at_offset;
625 }
30944b6d 626 }
2c2d71f5
JH
627 }
628 else { /* Take into account the floating substring. */
629 char *last, *last1;
630 char *s1 = s;
631
1aa99e6b
IH
632 t = HOP3c(s, -start_shift, strbeg);
633 last1 = last =
634 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
635 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
636 last = HOP3c(t, prog->float_max_offset, strend);
637 s = HOP3c(t, prog->float_min_offset, strend);
6eb5f6b9
JH
638 if (s < other_last)
639 s = other_last;
2c2d71f5
JH
640 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
641 /* fbm_instr() takes into account exact value of end-of-str
642 if the check is SvTAIL(ed). Since false positives are OK,
643 and end-of-str is not later than strend we are OK. */
644 s = fbm_instr((unsigned char*)s,
645 (unsigned char*)last + SvCUR(prog->float_substr)
646 - (SvTAIL(prog->float_substr)!=0),
647 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
648 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
649 (s ? "Found" : "Contradicts"),
650 PL_colors[0],
7b0972df
JH
651 (int)(SvCUR(prog->float_substr)
652 - (SvTAIL(prog->float_substr)!=0)),
2c2d71f5
JH
653 SvPVX(prog->float_substr),
654 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
655 if (!s) {
656 if (last1 == last) {
657 DEBUG_r(PerlIO_printf(Perl_debug_log,
658 ", giving up...\n"));
659 goto fail_finish;
660 }
661 DEBUG_r(PerlIO_printf(Perl_debug_log,
662 ", trying anchored starting at offset %ld...\n",
30944b6d 663 (long)(s1 + 1 - i_strpos)));
803ff556 664 other_last = last;
1aa99e6b 665 s = HOP3c(t, 1, strend);
2c2d71f5
JH
666 goto restart;
667 }
668 else {
669 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 670 (long)(s - i_strpos)));
803ff556 671 other_last = s; /* Fix this later. --Hugo */
30944b6d 672 s = s1;
2c2d71f5
JH
673 if (t == strpos)
674 goto try_at_start;
2c2d71f5
JH
675 goto try_at_offset;
676 }
677 }
cad2e5aa 678 }
2c2d71f5
JH
679
680 t = s - prog->check_offset_max;
2c2d71f5
JH
681 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
682 && (!(prog->reganch & ROPT_UTF8)
1aa99e6b
IH
683 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
684 && t > strpos))) {
2c2d71f5
JH
685 /* Fixed substring is found far enough so that the match
686 cannot start at strpos. */
687 try_at_offset:
cad2e5aa 688 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
689 /* Eventually fbm_*() should handle this, but often
690 anchored_offset is not 0, so this check will not be wasted. */
691 /* XXXX In the code below we prefer to look for "^" even in
692 presence of anchored substrings. And we search even
693 beyond the found float position. These pessimizations
694 are historical artefacts only. */
695 find_anchor:
2c2d71f5 696 while (t < strend - prog->minlen) {
cad2e5aa 697 if (*t == '\n') {
4ee3650e 698 if (t < check_at - prog->check_offset_min) {
30944b6d 699 if (prog->anchored_substr) {
4ee3650e
GS
700 /* Since we moved from the found position,
701 we definitely contradict the found anchored
30944b6d
IZ
702 substr. Due to the above check we do not
703 contradict "check" substr.
704 Thus we can arrive here only if check substr
705 is float. Redo checking for "other"=="fixed".
706 */
9041c2e3 707 strpos = t + 1;
30944b6d
IZ
708 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
709 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
710 goto do_other_anchored;
711 }
4ee3650e
GS
712 /* We don't contradict the found floating substring. */
713 /* XXXX Why not check for STCLASS? */
cad2e5aa 714 s = t + 1;
2c2d71f5 715 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 716 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
717 goto set_useful;
718 }
4ee3650e
GS
719 /* Position contradicts check-string */
720 /* XXXX probably better to look for check-string
721 than for "\n", so one should lower the limit for t? */
722 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 723 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 724 other_last = strpos = s = t + 1;
cad2e5aa
JH
725 goto restart;
726 }
727 t++;
728 }
2c2d71f5
JH
729 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
730 PL_colors[0],PL_colors[1]));
731 goto fail_finish;
cad2e5aa 732 }
f5952150
GS
733 else {
734 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
735 PL_colors[0],PL_colors[1]));
736 }
cad2e5aa
JH
737 s = t;
738 set_useful:
2c2d71f5 739 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
cad2e5aa
JH
740 }
741 else {
f5952150 742 /* The found string does not prohibit matching at strpos,
2c2d71f5 743 - no optimization of calling REx engine can be performed,
f5952150
GS
744 unless it was an MBOL and we are not after MBOL,
745 or a future STCLASS check will fail this. */
2c2d71f5
JH
746 try_at_start:
747 /* Even in this situation we may use MBOL flag if strpos is offset
748 wrt the start of the string. */
05b4157f 749 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 750 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
751 /* May be due to an implicit anchor of m{.*foo} */
752 && !(prog->reganch & ROPT_IMPLICIT))
753 {
cad2e5aa
JH
754 t = strpos;
755 goto find_anchor;
756 }
30944b6d 757 DEBUG_r( if (ml_anch)
f5952150
GS
758 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
759 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 760 );
2c2d71f5 761 success_at_start:
30944b6d 762 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
66e933ab 763 && prog->check_substr /* Could be deleted already */
cad2e5aa 764 && --BmUSEFUL(prog->check_substr) < 0
66e933ab
GS
765 && prog->check_substr == prog->float_substr)
766 {
cad2e5aa 767 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 768 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
cad2e5aa
JH
769 SvREFCNT_dec(prog->check_substr);
770 prog->check_substr = Nullsv; /* disable */
771 prog->float_substr = Nullsv; /* clear */
5e39e1e5 772 check = Nullsv; /* abort */
cad2e5aa 773 s = strpos;
3cf5c195
IZ
774 /* XXXX This is a remnant of the old implementation. It
775 looks wasteful, since now INTUIT can use many
6eb5f6b9 776 other heuristics. */
cad2e5aa
JH
777 prog->reganch &= ~RE_USE_INTUIT;
778 }
779 else
780 s = strpos;
781 }
782
6eb5f6b9
JH
783 /* Last resort... */
784 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
785 if (prog->regstclass) {
786 /* minlen == 0 is possible if regstclass is \b or \B,
787 and the fixed substr is ''$.
788 Since minlen is already taken into account, s+1 is before strend;
789 accidentally, minlen >= 1 guaranties no false positives at s + 1
790 even for \b or \B. But (minlen? 1 : 0) below assumes that
791 regstclass does not come from lookahead... */
792 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
793 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 794 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 795 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 796 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 797 : 1);
6eb5f6b9 798 char *endpos = (prog->anchored_substr || ml_anch)
1aa99e6b
IH
799 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
800 : (prog->float_substr
801 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
802 cl_l, strend)
803 : strend);
a1933d95 804 char *startpos = strbeg;
6eb5f6b9
JH
805
806 t = s;
76384e4a 807 if (prog->reganch & ROPT_UTF8) {
ffc61ed2 808 PL_regdata = prog->data;
76384e4a
GS
809 PL_bostr = startpos;
810 }
f33976b4 811 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
812 if (!s) {
813#ifdef DEBUGGING
b7953727 814 char *what = 0;
6eb5f6b9
JH
815#endif
816 if (endpos == strend) {
817 DEBUG_r( PerlIO_printf(Perl_debug_log,
818 "Could not match STCLASS...\n") );
819 goto fail;
820 }
66e933ab
GS
821 DEBUG_r( PerlIO_printf(Perl_debug_log,
822 "This position contradicts STCLASS...\n") );
653099ff
GS
823 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
824 goto fail;
6eb5f6b9
JH
825 /* Contradict one of substrings */
826 if (prog->anchored_substr) {
6eb5f6b9
JH
827 if (prog->anchored_substr == check) {
828 DEBUG_r( what = "anchored" );
829 hop_and_restart:
1aa99e6b 830 s = HOP3c(t, 1, strend);
66e933ab
GS
831 if (s + start_shift + end_shift > strend) {
832 /* XXXX Should be taken into account earlier? */
833 DEBUG_r( PerlIO_printf(Perl_debug_log,
834 "Could not match STCLASS...\n") );
835 goto fail;
836 }
5e39e1e5
HS
837 if (!check)
838 goto giveup;
6eb5f6b9 839 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 840 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
841 what, (long)(s + start_shift - i_strpos)) );
842 goto restart;
843 }
66e933ab 844 /* Have both, check_string is floating */
6eb5f6b9
JH
845 if (t + start_shift >= check_at) /* Contradicts floating=check */
846 goto retry_floating_check;
847 /* Recheck anchored substring, but not floating... */
9041c2e3 848 s = check_at;
5e39e1e5
HS
849 if (!check)
850 goto giveup;
6eb5f6b9 851 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 852 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
853 (long)(other_last - i_strpos)) );
854 goto do_other_anchored;
855 }
60e71179
GS
856 /* Another way we could have checked stclass at the
857 current position only: */
858 if (ml_anch) {
859 s = t = t + 1;
5e39e1e5
HS
860 if (!check)
861 goto giveup;
60e71179 862 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
863 "Looking for /%s^%s/m starting at offset %ld...\n",
864 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 865 goto try_at_offset;
66e933ab 866 }
60e71179
GS
867 if (!prog->float_substr) /* Could have been deleted */
868 goto fail;
6eb5f6b9
JH
869 /* Check is floating subtring. */
870 retry_floating_check:
871 t = check_at - start_shift;
872 DEBUG_r( what = "floating" );
873 goto hop_and_restart;
874 }
b7953727
JH
875 if (t != s) {
876 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 877 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
878 (long)(t - i_strpos), (long)(s - i_strpos))
879 );
880 }
881 else {
882 DEBUG_r(PerlIO_printf(Perl_debug_log,
883 "Does not contradict STCLASS...\n");
884 );
885 }
6eb5f6b9 886 }
5e39e1e5
HS
887 giveup:
888 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
889 PL_colors[4], (check ? "Guessed" : "Giving up"),
890 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 891 return s;
2c2d71f5
JH
892
893 fail_finish: /* Substring not found */
66e933ab
GS
894 if (prog->check_substr) /* could be removed already */
895 BmUSEFUL(prog->check_substr) += 5; /* hooray */
cad2e5aa 896 fail:
2c2d71f5 897 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
898 PL_colors[4],PL_colors[5]));
899 return Nullch;
900}
9661b544 901
6eb5f6b9 902/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
903STATIC char *
904S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 905{
6eb5f6b9
JH
906 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
907 char *m;
d8093b23
G
908 STRLEN ln;
909 unsigned int c1;
910 unsigned int c2;
6eb5f6b9
JH
911 char *e;
912 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 913 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 914
6eb5f6b9
JH
915 /* We know what class it must start with. */
916 switch (OP(c)) {
6eb5f6b9 917 case ANYOF:
a0ed51b3 918 while (s < strend) {
e0f9d4a8
JH
919 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
920
921 if (reginclass(c, (U8*)s, do_utf8) ||
922 (ANYOF_UNICODE_FOLD_SHARP_S(c, s, strend) &&
923 (skip = 2))) {
6eb5f6b9
JH
924 if (tmp && (norun || regtry(prog, s)))
925 goto got_it;
926 else
927 tmp = doevery;
a0ed51b3 928 }
e0f9d4a8
JH
929 else
930 tmp = 1;
931 s += skip;
a0d0e21e 932 }
6eb5f6b9 933 break;
f33976b4
DB
934 case CANY:
935 while (s < strend) {
936 if (tmp && (norun || regtry(prog, s)))
937 goto got_it;
938 else
939 tmp = doevery;
940 s++;
941 }
942 break;
6eb5f6b9
JH
943 case EXACTF:
944 m = STRING(c);
945 ln = STR_LEN(c);
1aa99e6b 946 if (UTF) {
a2a2844f 947 STRLEN ulen1, ulen2;
e7ae6809
JH
948 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
949 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
950
951 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
952 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
953
954 c1 = utf8_to_uvuni(tmpbuf1, 0);
955 c2 = utf8_to_uvuni(tmpbuf2, 0);
1aa99e6b
IH
956 }
957 else {
958 c1 = *(U8*)m;
959 c2 = PL_fold[c1];
960 }
6eb5f6b9
JH
961 goto do_exactf;
962 case EXACTFL:
963 m = STRING(c);
964 ln = STR_LEN(c);
d8093b23 965 c1 = *(U8*)m;
6eb5f6b9
JH
966 c2 = PL_fold_locale[c1];
967 do_exactf:
93c50829 968 e = do_utf8 ? s + ln : strend - ln;
b3c9acc1 969
6eb5f6b9
JH
970 if (norun && e < s)
971 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 972
60a8b682
JH
973 /* The idea in the EXACTF* cases is to first find the
974 * first character of the EXACTF* node and then, if
975 * necessary, case-insensitively compare the full
976 * text of the node. The c1 and c2 are the first
977 * characters (though in Unicode it gets a bit
978 * more complicated because there are more cases
979 * than just upper and lower: one is really supposed
980 * to use the so-called folding case for case-insensitive
981 * matching (called "loose matching" in Unicode). */
982
1aa99e6b 983 if (do_utf8) {
575cac57
JH
984 UV c, f;
985 U8 tmpbuf [UTF8_MAXLEN+1];
986 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
987 STRLEN len, foldlen;
d7f013c8 988
09091399 989 if (c1 == c2) {
1aa99e6b 990 while (s <= e) {
575cac57 991 c = utf8_to_uvchr((U8*)s, &len);
80aecb99
JH
992 if ( c == c1
993 && (ln == len ||
66423254
JH
994 ibcmp_utf8(s, (char **)0, 0, do_utf8,
995 m, (char **)0, ln, UTF))
55da9344 996 && (norun || regtry(prog, s)) )
1aa99e6b 997 goto got_it;
80aecb99
JH
998 else {
999 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1000 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1001 if ( f != c
1002 && (f == c1 || f == c2)
1003 && (ln == foldlen ||
66423254
JH
1004 !ibcmp_utf8((char *) foldbuf,
1005 (char **)0, foldlen, do_utf8,
d07ddd77 1006 m,
66423254 1007 (char **)0, ln, UTF))
80aecb99
JH
1008 && (norun || regtry(prog, s)) )
1009 goto got_it;
1010 }
1aa99e6b
IH
1011 s += len;
1012 }
09091399
JH
1013 }
1014 else {
1aa99e6b 1015 while (s <= e) {
575cac57 1016 c = utf8_to_uvchr((U8*)s, &len);
80aecb99 1017
60a8b682 1018 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1019 * Note that not all the possible combinations
1020 * are handled here: some of them are handled
1021 * by the standard folding rules, and some of
1022 * them (the character class or ANYOF cases)
1023 * are handled during compiletime in
1024 * regexec.c:S_regclass(). */
880bd946
JH
1025 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1026 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1027 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1028
1029 if ( (c == c1 || c == c2)
1030 && (ln == len ||
66423254
JH
1031 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1032 m, (char **)0, ln, UTF))
55da9344 1033 && (norun || regtry(prog, s)) )
1aa99e6b 1034 goto got_it;
80aecb99
JH
1035 else {
1036 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1037 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1038 if ( f != c
1039 && (f == c1 || f == c2)
1040 && (ln == foldlen ||
1041 !ibcmp_utf8((char *)foldbuf,
66423254 1042 (char **)0, foldlen, do_utf8,
d07ddd77 1043 m,
66423254 1044 (char **)0, ln, UTF))
80aecb99
JH
1045 && (norun || regtry(prog, s)) )
1046 goto got_it;
1047 }
1aa99e6b
IH
1048 s += len;
1049 }
09091399 1050 }
1aa99e6b
IH
1051 }
1052 else {
1053 if (c1 == c2)
1054 while (s <= e) {
1055 if ( *(U8*)s == c1
1056 && (ln == 1 || !(OP(c) == EXACTF
1057 ? ibcmp(s, m, ln)
1058 : ibcmp_locale(s, m, ln)))
1059 && (norun || regtry(prog, s)) )
1060 goto got_it;
1061 s++;
1062 }
1063 else
1064 while (s <= e) {
1065 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1066 && (ln == 1 || !(OP(c) == EXACTF
1067 ? ibcmp(s, m, ln)
1068 : ibcmp_locale(s, m, ln)))
1069 && (norun || regtry(prog, s)) )
1070 goto got_it;
1071 s++;
1072 }
b3c9acc1
IZ
1073 }
1074 break;
bbce6d69 1075 case BOUNDL:
3280af22 1076 PL_reg_flags |= RF_tainted;
bbce6d69 1077 /* FALL THROUGH */
a0d0e21e 1078 case BOUND:
ffc61ed2 1079 if (do_utf8) {
12d33761 1080 if (s == PL_bostr)
ffc61ed2
JH
1081 tmp = '\n';
1082 else {
1aa99e6b 1083 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
9041c2e3 1084
0064a8a9
JH
1085 if (s > (char*)r)
1086 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
ffc61ed2
JH
1087 }
1088 tmp = ((OP(c) == BOUND ?
9041c2e3 1089 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1090 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1091 while (s < strend) {
1092 if (tmp == !(OP(c) == BOUND ?
3568d838 1093 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1094 isALNUM_LC_utf8((U8*)s)))
1095 {
1096 tmp = !tmp;
1097 if ((norun || regtry(prog, s)))
1098 goto got_it;
1099 }
1100 s += UTF8SKIP(s);
a687059c 1101 }
a0d0e21e 1102 }
667bb95a 1103 else {
12d33761 1104 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1105 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1106 while (s < strend) {
1107 if (tmp ==
1108 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1109 tmp = !tmp;
1110 if ((norun || regtry(prog, s)))
1111 goto got_it;
1112 }
1113 s++;
a0ed51b3 1114 }
a0ed51b3 1115 }
6eb5f6b9 1116 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1117 goto got_it;
1118 break;
bbce6d69 1119 case NBOUNDL:
3280af22 1120 PL_reg_flags |= RF_tainted;
bbce6d69 1121 /* FALL THROUGH */
a0d0e21e 1122 case NBOUND:
ffc61ed2 1123 if (do_utf8) {
12d33761 1124 if (s == PL_bostr)
ffc61ed2
JH
1125 tmp = '\n';
1126 else {
1aa99e6b 1127 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
9041c2e3 1128
0064a8a9
JH
1129 if (s > (char*)r)
1130 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
ffc61ed2
JH
1131 }
1132 tmp = ((OP(c) == NBOUND ?
9041c2e3 1133 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1134 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1135 while (s < strend) {
1136 if (tmp == !(OP(c) == NBOUND ?
3568d838 1137 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1138 isALNUM_LC_utf8((U8*)s)))
1139 tmp = !tmp;
1140 else if ((norun || regtry(prog, s)))
1141 goto got_it;
1142 s += UTF8SKIP(s);
1143 }
a0d0e21e 1144 }
667bb95a 1145 else {
12d33761 1146 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1147 tmp = ((OP(c) == NBOUND ?
1148 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1149 while (s < strend) {
1150 if (tmp ==
1151 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1152 tmp = !tmp;
1153 else if ((norun || regtry(prog, s)))
1154 goto got_it;
1155 s++;
1156 }
a0ed51b3 1157 }
6eb5f6b9 1158 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1159 goto got_it;
1160 break;
a0d0e21e 1161 case ALNUM:
ffc61ed2 1162 if (do_utf8) {
8269fa76 1163 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1164 while (s < strend) {
3568d838 1165 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1166 if (tmp && (norun || regtry(prog, s)))
1167 goto got_it;
1168 else
1169 tmp = doevery;
1170 }
bbce6d69 1171 else
ffc61ed2
JH
1172 tmp = 1;
1173 s += UTF8SKIP(s);
bbce6d69 1174 }
bbce6d69 1175 }
ffc61ed2
JH
1176 else {
1177 while (s < strend) {
1178 if (isALNUM(*s)) {
1179 if (tmp && (norun || regtry(prog, s)))
1180 goto got_it;
1181 else
1182 tmp = doevery;
1183 }
a0ed51b3 1184 else
ffc61ed2
JH
1185 tmp = 1;
1186 s++;
a0ed51b3 1187 }
a0ed51b3
LW
1188 }
1189 break;
bbce6d69 1190 case ALNUML:
3280af22 1191 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1192 if (do_utf8) {
1193 while (s < strend) {
1194 if (isALNUM_LC_utf8((U8*)s)) {
1195 if (tmp && (norun || regtry(prog, s)))
1196 goto got_it;
1197 else
1198 tmp = doevery;
1199 }
a687059c 1200 else
ffc61ed2
JH
1201 tmp = 1;
1202 s += UTF8SKIP(s);
a0d0e21e 1203 }
a0d0e21e 1204 }
ffc61ed2
JH
1205 else {
1206 while (s < strend) {
1207 if (isALNUM_LC(*s)) {
1208 if (tmp && (norun || regtry(prog, s)))
1209 goto got_it;
1210 else
1211 tmp = doevery;
1212 }
a0ed51b3 1213 else
ffc61ed2
JH
1214 tmp = 1;
1215 s++;
a0ed51b3 1216 }
a0ed51b3
LW
1217 }
1218 break;
a0d0e21e 1219 case NALNUM:
ffc61ed2 1220 if (do_utf8) {
8269fa76 1221 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1222 while (s < strend) {
3568d838 1223 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1224 if (tmp && (norun || regtry(prog, s)))
1225 goto got_it;
1226 else
1227 tmp = doevery;
1228 }
bbce6d69 1229 else
ffc61ed2
JH
1230 tmp = 1;
1231 s += UTF8SKIP(s);
bbce6d69 1232 }
bbce6d69 1233 }
ffc61ed2
JH
1234 else {
1235 while (s < strend) {
1236 if (!isALNUM(*s)) {
1237 if (tmp && (norun || regtry(prog, s)))
1238 goto got_it;
1239 else
1240 tmp = doevery;
1241 }
a0ed51b3 1242 else
ffc61ed2
JH
1243 tmp = 1;
1244 s++;
a0ed51b3 1245 }
a0ed51b3
LW
1246 }
1247 break;
bbce6d69 1248 case NALNUML:
3280af22 1249 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1250 if (do_utf8) {
1251 while (s < strend) {
1252 if (!isALNUM_LC_utf8((U8*)s)) {
1253 if (tmp && (norun || regtry(prog, s)))
1254 goto got_it;
1255 else
1256 tmp = doevery;
1257 }
a687059c 1258 else
ffc61ed2
JH
1259 tmp = 1;
1260 s += UTF8SKIP(s);
a687059c 1261 }
a0d0e21e 1262 }
ffc61ed2
JH
1263 else {
1264 while (s < strend) {
1265 if (!isALNUM_LC(*s)) {
1266 if (tmp && (norun || regtry(prog, s)))
1267 goto got_it;
1268 else
1269 tmp = doevery;
1270 }
a0ed51b3 1271 else
ffc61ed2
JH
1272 tmp = 1;
1273 s++;
a0ed51b3 1274 }
a0ed51b3
LW
1275 }
1276 break;
a0d0e21e 1277 case SPACE:
ffc61ed2 1278 if (do_utf8) {
8269fa76 1279 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1280 while (s < strend) {
3568d838 1281 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1282 if (tmp && (norun || regtry(prog, s)))
1283 goto got_it;
1284 else
1285 tmp = doevery;
1286 }
a0d0e21e 1287 else
ffc61ed2
JH
1288 tmp = 1;
1289 s += UTF8SKIP(s);
2304df62 1290 }
a0d0e21e 1291 }
ffc61ed2
JH
1292 else {
1293 while (s < strend) {
1294 if (isSPACE(*s)) {
1295 if (tmp && (norun || regtry(prog, s)))
1296 goto got_it;
1297 else
1298 tmp = doevery;
1299 }
a0ed51b3 1300 else
ffc61ed2
JH
1301 tmp = 1;
1302 s++;
a0ed51b3 1303 }
a0ed51b3
LW
1304 }
1305 break;
bbce6d69 1306 case SPACEL:
3280af22 1307 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1308 if (do_utf8) {
1309 while (s < strend) {
1310 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1311 if (tmp && (norun || regtry(prog, s)))
1312 goto got_it;
1313 else
1314 tmp = doevery;
1315 }
bbce6d69 1316 else
ffc61ed2
JH
1317 tmp = 1;
1318 s += UTF8SKIP(s);
bbce6d69 1319 }
bbce6d69 1320 }
ffc61ed2
JH
1321 else {
1322 while (s < strend) {
1323 if (isSPACE_LC(*s)) {
1324 if (tmp && (norun || regtry(prog, s)))
1325 goto got_it;
1326 else
1327 tmp = doevery;
1328 }
a0ed51b3 1329 else
ffc61ed2
JH
1330 tmp = 1;
1331 s++;
a0ed51b3 1332 }
a0ed51b3
LW
1333 }
1334 break;
a0d0e21e 1335 case NSPACE:
ffc61ed2 1336 if (do_utf8) {
8269fa76 1337 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1338 while (s < strend) {
3568d838 1339 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1340 if (tmp && (norun || regtry(prog, s)))
1341 goto got_it;
1342 else
1343 tmp = doevery;
1344 }
a0d0e21e 1345 else
ffc61ed2
JH
1346 tmp = 1;
1347 s += UTF8SKIP(s);
a687059c 1348 }
a0d0e21e 1349 }
ffc61ed2
JH
1350 else {
1351 while (s < strend) {
1352 if (!isSPACE(*s)) {
1353 if (tmp && (norun || regtry(prog, s)))
1354 goto got_it;
1355 else
1356 tmp = doevery;
1357 }
a0ed51b3 1358 else
ffc61ed2
JH
1359 tmp = 1;
1360 s++;
a0ed51b3 1361 }
a0ed51b3
LW
1362 }
1363 break;
bbce6d69 1364 case NSPACEL:
3280af22 1365 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1366 if (do_utf8) {
1367 while (s < strend) {
1368 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1369 if (tmp && (norun || regtry(prog, s)))
1370 goto got_it;
1371 else
1372 tmp = doevery;
1373 }
bbce6d69 1374 else
ffc61ed2
JH
1375 tmp = 1;
1376 s += UTF8SKIP(s);
bbce6d69 1377 }
bbce6d69 1378 }
ffc61ed2
JH
1379 else {
1380 while (s < strend) {
1381 if (!isSPACE_LC(*s)) {
1382 if (tmp && (norun || regtry(prog, s)))
1383 goto got_it;
1384 else
1385 tmp = doevery;
1386 }
a0ed51b3 1387 else
ffc61ed2
JH
1388 tmp = 1;
1389 s++;
a0ed51b3 1390 }
a0ed51b3
LW
1391 }
1392 break;
a0d0e21e 1393 case DIGIT:
ffc61ed2 1394 if (do_utf8) {
8269fa76 1395 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1396 while (s < strend) {
3568d838 1397 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1398 if (tmp && (norun || regtry(prog, s)))
1399 goto got_it;
1400 else
1401 tmp = doevery;
1402 }
a0d0e21e 1403 else
ffc61ed2
JH
1404 tmp = 1;
1405 s += UTF8SKIP(s);
2b69d0c2 1406 }
a0d0e21e 1407 }
ffc61ed2
JH
1408 else {
1409 while (s < strend) {
1410 if (isDIGIT(*s)) {
1411 if (tmp && (norun || regtry(prog, s)))
1412 goto got_it;
1413 else
1414 tmp = doevery;
1415 }
a0ed51b3 1416 else
ffc61ed2
JH
1417 tmp = 1;
1418 s++;
a0ed51b3 1419 }
a0ed51b3
LW
1420 }
1421 break;
b8c5462f
JH
1422 case DIGITL:
1423 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1424 if (do_utf8) {
1425 while (s < strend) {
1426 if (isDIGIT_LC_utf8((U8*)s)) {
1427 if (tmp && (norun || regtry(prog, s)))
1428 goto got_it;
1429 else
1430 tmp = doevery;
1431 }
b8c5462f 1432 else
ffc61ed2
JH
1433 tmp = 1;
1434 s += UTF8SKIP(s);
b8c5462f 1435 }
b8c5462f 1436 }
ffc61ed2
JH
1437 else {
1438 while (s < strend) {
1439 if (isDIGIT_LC(*s)) {
1440 if (tmp && (norun || regtry(prog, s)))
1441 goto got_it;
1442 else
1443 tmp = doevery;
1444 }
b8c5462f 1445 else
ffc61ed2
JH
1446 tmp = 1;
1447 s++;
b8c5462f 1448 }
b8c5462f
JH
1449 }
1450 break;
a0d0e21e 1451 case NDIGIT:
ffc61ed2 1452 if (do_utf8) {
8269fa76 1453 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1454 while (s < strend) {
3568d838 1455 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1456 if (tmp && (norun || regtry(prog, s)))
1457 goto got_it;
1458 else
1459 tmp = doevery;
1460 }
a0d0e21e 1461 else
ffc61ed2
JH
1462 tmp = 1;
1463 s += UTF8SKIP(s);
a687059c 1464 }
a0d0e21e 1465 }
ffc61ed2
JH
1466 else {
1467 while (s < strend) {
1468 if (!isDIGIT(*s)) {
1469 if (tmp && (norun || regtry(prog, s)))
1470 goto got_it;
1471 else
1472 tmp = doevery;
1473 }
a0ed51b3 1474 else
ffc61ed2
JH
1475 tmp = 1;
1476 s++;
a0ed51b3 1477 }
a0ed51b3
LW
1478 }
1479 break;
b8c5462f
JH
1480 case NDIGITL:
1481 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1482 if (do_utf8) {
1483 while (s < strend) {
1484 if (!isDIGIT_LC_utf8((U8*)s)) {
1485 if (tmp && (norun || regtry(prog, s)))
1486 goto got_it;
1487 else
1488 tmp = doevery;
1489 }
b8c5462f 1490 else
ffc61ed2
JH
1491 tmp = 1;
1492 s += UTF8SKIP(s);
b8c5462f 1493 }
a0ed51b3 1494 }
ffc61ed2
JH
1495 else {
1496 while (s < strend) {
1497 if (!isDIGIT_LC(*s)) {
1498 if (tmp && (norun || regtry(prog, s)))
1499 goto got_it;
1500 else
1501 tmp = doevery;
1502 }
cf93c79d 1503 else
ffc61ed2
JH
1504 tmp = 1;
1505 s++;
b8c5462f 1506 }
b8c5462f
JH
1507 }
1508 break;
b3c9acc1 1509 default:
3c3eec57
GS
1510 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1511 break;
d6a28714 1512 }
6eb5f6b9
JH
1513 return 0;
1514 got_it:
1515 return s;
1516}
1517
1518/*
1519 - regexec_flags - match a regexp against a string
1520 */
1521I32
1522Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1523 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1524/* strend: pointer to null at end of string */
1525/* strbeg: real beginning of string */
1526/* minend: end of match must be >=minend after stringarg. */
1527/* data: May be used for some additional optimizations. */
1528/* nosave: For optimizations. */
1529{
6eb5f6b9
JH
1530 register char *s;
1531 register regnode *c;
1532 register char *startpos = stringarg;
6eb5f6b9
JH
1533 I32 minlen; /* must match at least this many chars */
1534 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1535 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1536 constant substr. */ /* CC */
1537 I32 end_shift = 0; /* Same for the end. */ /* CC */
1538 I32 scream_pos = -1; /* Internal iterator of scream. */
1539 char *scream_olds;
1540 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1541 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1542#ifdef DEBUGGING
9e55ce06
JH
1543 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1544 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1545#endif
6eb5f6b9
JH
1546
1547 PL_regcc = 0;
1548
1549 cache_re(prog);
1550#ifdef DEBUGGING
aea4f609 1551 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1552#endif
1553
1554 /* Be paranoid... */
1555 if (prog == NULL || startpos == NULL) {
1556 Perl_croak(aTHX_ "NULL regexp parameter");
1557 return 0;
1558 }
1559
1560 minlen = prog->minlen;
61a36c01 1561 if (strend - startpos < minlen) {
a72c7584
JH
1562 DEBUG_r(PerlIO_printf(Perl_debug_log,
1563 "String too short [regexec_flags]...\n"));
1564 goto phooey;
1aa99e6b 1565 }
6eb5f6b9 1566
6eb5f6b9
JH
1567 /* Check validity of program. */
1568 if (UCHARAT(prog->program) != REG_MAGIC) {
1569 Perl_croak(aTHX_ "corrupted regexp program");
1570 }
1571
1572 PL_reg_flags = 0;
1573 PL_reg_eval_set = 0;
1574 PL_reg_maxiter = 0;
1575
1576 if (prog->reganch & ROPT_UTF8)
1577 PL_reg_flags |= RF_utf8;
1578
1579 /* Mark beginning of line for ^ and lookbehind. */
1580 PL_regbol = startpos;
1581 PL_bostr = strbeg;
1582 PL_reg_sv = sv;
1583
1584 /* Mark end of line for $ (and such) */
1585 PL_regeol = strend;
1586
1587 /* see how far we have to get to not match where we matched before */
1588 PL_regtill = startpos+minend;
1589
1590 /* We start without call_cc context. */
1591 PL_reg_call_cc = 0;
1592
1593 /* If there is a "must appear" string, look for it. */
1594 s = startpos;
1595
1596 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1597 MAGIC *mg;
1598
1599 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1600 PL_reg_ganch = startpos;
1601 else if (sv && SvTYPE(sv) >= SVt_PVMG
1602 && SvMAGIC(sv)
14befaf4
DM
1603 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1604 && mg->mg_len >= 0) {
6eb5f6b9
JH
1605 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1606 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1607 if (s > PL_reg_ganch)
6eb5f6b9
JH
1608 goto phooey;
1609 s = PL_reg_ganch;
1610 }
1611 }
1612 else /* pos() not defined */
1613 PL_reg_ganch = strbeg;
1614 }
1615
699c3c34
JH
1616 if (do_utf8 == (UTF!=0) &&
1617 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
6eb5f6b9
JH
1618 re_scream_pos_data d;
1619
1620 d.scream_olds = &scream_olds;
1621 d.scream_pos = &scream_pos;
1622 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1623 if (!s) {
1624 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1625 goto phooey; /* not present */
3fa9c3d7 1626 }
6eb5f6b9
JH
1627 }
1628
2a782b5b 1629 DEBUG_r({
9e55ce06
JH
1630 char *s0 = UTF ?
1631 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1632 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1633 prog->precomp;
1634 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1635 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1636 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1637 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1638 if (!PL_colorset)
1639 reginitcolors();
1640 PerlIO_printf(Perl_debug_log,
9e55ce06 1641 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1642 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1643 len0, len0, s0,
2a782b5b 1644 PL_colors[1],
9e55ce06 1645 len0 > 60 ? "..." : "",
2a782b5b 1646 PL_colors[0],
9e55ce06
JH
1647 (int)(len1 > 60 ? 60 : len1),
1648 s1, PL_colors[1],
1649 (len1 > 60 ? "..." : "")
2a782b5b
JH
1650 );
1651 });
6eb5f6b9
JH
1652
1653 /* Simplest case: anchored match need be tried only once. */
1654 /* [unless only anchor is BOL and multiline is set] */
1655 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1656 if (s == startpos && regtry(prog, startpos))
1657 goto got_it;
1658 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1659 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1660 {
1661 char *end;
1662
1663 if (minlen)
1664 dontbother = minlen - 1;
1aa99e6b 1665 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9
JH
1666 /* for multiline we only have to try after newlines */
1667 if (prog->check_substr) {
1668 if (s == startpos)
1669 goto after_try;
1670 while (1) {
1671 if (regtry(prog, s))
1672 goto got_it;
1673 after_try:
1674 if (s >= end)
1675 goto phooey;
1676 if (prog->reganch & RE_USE_INTUIT) {
1677 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1678 if (!s)
1679 goto phooey;
1680 }
1681 else
1682 s++;
1683 }
1684 } else {
1685 if (s > startpos)
1686 s--;
1687 while (s < end) {
1688 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1689 if (regtry(prog, s))
1690 goto got_it;
1691 }
1692 }
1693 }
1694 }
1695 goto phooey;
1696 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1697 if (regtry(prog, PL_reg_ganch))
1698 goto got_it;
1699 goto phooey;
1700 }
1701
1702 /* Messy cases: unanchored match. */
9041c2e3 1703 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1704 /* we have /x+whatever/ */
1705 /* it must be a one character string (XXXX Except UTF?) */
1706 char ch = SvPVX(prog->anchored_substr)[0];
bf93d4cc
GS
1707#ifdef DEBUGGING
1708 int did_match = 0;
1709#endif
1710
1aa99e6b 1711 if (do_utf8) {
6eb5f6b9
JH
1712 while (s < strend) {
1713 if (*s == ch) {
bf93d4cc 1714 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1715 if (regtry(prog, s)) goto got_it;
1716 s += UTF8SKIP(s);
1717 while (s < strend && *s == ch)
1718 s += UTF8SKIP(s);
1719 }
1720 s += UTF8SKIP(s);
1721 }
1722 }
1723 else {
1724 while (s < strend) {
1725 if (*s == ch) {
bf93d4cc 1726 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1727 if (regtry(prog, s)) goto got_it;
1728 s++;
1729 while (s < strend && *s == ch)
1730 s++;
1731 }
1732 s++;
1733 }
1734 }
b7953727 1735 DEBUG_r(if (!did_match)
bf93d4cc 1736 PerlIO_printf(Perl_debug_log,
b7953727
JH
1737 "Did not find anchored character...\n")
1738 );
6eb5f6b9
JH
1739 }
1740 /*SUPPRESS 560*/
1aa99e6b
IH
1741 else if (do_utf8 == (UTF!=0) &&
1742 (prog->anchored_substr != Nullsv
9041c2e3 1743 || (prog->float_substr != Nullsv
1aa99e6b 1744 && prog->float_max_offset < strend - s))) {
9041c2e3 1745 SV *must = prog->anchored_substr
6eb5f6b9 1746 ? prog->anchored_substr : prog->float_substr;
9041c2e3 1747 I32 back_max =
6eb5f6b9 1748 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
9041c2e3 1749 I32 back_min =
6eb5f6b9 1750 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1aa99e6b 1751 char *last = HOP3c(strend, /* Cannot start after this */
6eb5f6b9 1752 -(I32)(CHR_SVLEN(must)
1aa99e6b 1753 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9 1754 char *last1; /* Last position checked before */
bf93d4cc
GS
1755#ifdef DEBUGGING
1756 int did_match = 0;
1757#endif
6eb5f6b9
JH
1758
1759 if (s > PL_bostr)
1760 last1 = HOPc(s, -1);
1761 else
1762 last1 = s - 1; /* bogus */
1763
1764 /* XXXX check_substr already used to find `s', can optimize if
1765 check_substr==must. */
1766 scream_pos = -1;
1767 dontbother = end_shift;
1768 strend = HOPc(strend, -dontbother);
1769 while ( (s <= last) &&
9041c2e3 1770 ((flags & REXEC_SCREAM)
1aa99e6b 1771 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1772 end_shift, &scream_pos, 0))
1aa99e6b 1773 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1774 (unsigned char*)strend, must,
6eb5f6b9 1775 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
bf93d4cc 1776 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1777 if (HOPc(s, -back_max) > last1) {
1778 last1 = HOPc(s, -back_min);
1779 s = HOPc(s, -back_max);
1780 }
1781 else {
1782 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1783
1784 last1 = HOPc(s, -back_min);
1785 s = t;
1786 }
1aa99e6b 1787 if (do_utf8) {
6eb5f6b9
JH
1788 while (s <= last1) {
1789 if (regtry(prog, s))
1790 goto got_it;
1791 s += UTF8SKIP(s);
1792 }
1793 }
1794 else {
1795 while (s <= last1) {
1796 if (regtry(prog, s))
1797 goto got_it;
1798 s++;
1799 }
1800 }
1801 }
b7953727
JH
1802 DEBUG_r(if (!did_match)
1803 PerlIO_printf(Perl_debug_log,
1804 "Did not find %s substr `%s%.*s%s'%s...\n",
bf93d4cc
GS
1805 ((must == prog->anchored_substr)
1806 ? "anchored" : "floating"),
1807 PL_colors[0],
1808 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1809 SvPVX(must),
b7953727
JH
1810 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1811 );
6eb5f6b9
JH
1812 goto phooey;
1813 }
155aba94 1814 else if ((c = prog->regstclass)) {
66e933ab
GS
1815 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1816 /* don't bother with what can't match */
6eb5f6b9 1817 strend = HOPc(strend, -(minlen - 1));
ffc61ed2
JH
1818 DEBUG_r({
1819 SV *prop = sv_newmortal();
9e55ce06
JH
1820 char *s0;
1821 char *s1;
1822 int len0;
1823 int len1;
1824
ffc61ed2 1825 regprop(prop, c);
9e55ce06
JH
1826 s0 = UTF ?
1827 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1828 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1829 SvPVX(prop);
1830 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1831 s1 = UTF ?
c728cb41 1832 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1833 len1 = UTF ? SvCUR(dsv1) : strend - s;
1834 PerlIO_printf(Perl_debug_log,
1835 "Matching stclass `%*.*s' against `%*.*s'\n",
1836 len0, len0, s0,
1837 len1, len1, s1);
ffc61ed2 1838 });
6eb5f6b9
JH
1839 if (find_byclass(prog, c, s, strend, startpos, 0))
1840 goto got_it;
bf93d4cc 1841 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1842 }
1843 else {
1844 dontbother = 0;
1845 if (prog->float_substr != Nullsv) { /* Trim the end. */
1846 char *last;
d6a28714
JH
1847
1848 if (flags & REXEC_SCREAM) {
1849 last = screaminstr(sv, prog->float_substr, s - strbeg,
1850 end_shift, &scream_pos, 1); /* last one */
1851 if (!last)
ffc61ed2 1852 last = scream_olds; /* Only one occurrence. */
b8c5462f 1853 }
d6a28714
JH
1854 else {
1855 STRLEN len;
1856 char *little = SvPV(prog->float_substr, len);
1857
1858 if (SvTAIL(prog->float_substr)) {
1859 if (memEQ(strend - len + 1, little, len - 1))
1860 last = strend - len + 1;
1861 else if (!PL_multiline)
9041c2e3 1862 last = memEQ(strend - len, little, len)
d6a28714 1863 ? strend - len : Nullch;
b8c5462f 1864 else
d6a28714
JH
1865 goto find_last;
1866 } else {
1867 find_last:
9041c2e3 1868 if (len)
d6a28714 1869 last = rninstr(s, strend, little, little + len);
b8c5462f 1870 else
d6a28714 1871 last = strend; /* matching `$' */
b8c5462f 1872 }
b8c5462f 1873 }
bf93d4cc
GS
1874 if (last == NULL) {
1875 DEBUG_r(PerlIO_printf(Perl_debug_log,
1876 "%sCan't trim the tail, match fails (should not happen)%s\n",
1877 PL_colors[4],PL_colors[5]));
1878 goto phooey; /* Should not happen! */
1879 }
d6a28714
JH
1880 dontbother = strend - last + prog->float_min_offset;
1881 }
1882 if (minlen && (dontbother < minlen))
1883 dontbother = minlen - 1;
1884 strend -= dontbother; /* this one's always in bytes! */
1885 /* We don't know much -- general case. */
1aa99e6b 1886 if (do_utf8) {
d6a28714
JH
1887 for (;;) {
1888 if (regtry(prog, s))
1889 goto got_it;
1890 if (s >= strend)
1891 break;
b8c5462f 1892 s += UTF8SKIP(s);
d6a28714
JH
1893 };
1894 }
1895 else {
1896 do {
1897 if (regtry(prog, s))
1898 goto got_it;
1899 } while (s++ < strend);
1900 }
1901 }
1902
1903 /* Failure. */
1904 goto phooey;
1905
1906got_it:
1907 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1908
1909 if (PL_reg_eval_set) {
1910 /* Preserve the current value of $^R */
1911 if (oreplsv != GvSV(PL_replgv))
1912 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1913 restored, the value remains
1914 the same. */
acfe0abc 1915 restore_pos(aTHX_ 0);
d6a28714
JH
1916 }
1917
1918 /* make sure $`, $&, $', and $digit will work later */
1919 if ( !(flags & REXEC_NOT_FIRST) ) {
1920 if (RX_MATCH_COPIED(prog)) {
1921 Safefree(prog->subbeg);
1922 RX_MATCH_COPIED_off(prog);
1923 }
1924 if (flags & REXEC_COPY_STR) {
1925 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1926
1927 s = savepvn(strbeg, i);
1928 prog->subbeg = s;
1929 prog->sublen = i;
1930 RX_MATCH_COPIED_on(prog);
1931 }
1932 else {
1933 prog->subbeg = strbeg;
1934 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1935 }
1936 }
9041c2e3 1937
d6a28714
JH
1938 return 1;
1939
1940phooey:
bf93d4cc
GS
1941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1942 PL_colors[4],PL_colors[5]));
d6a28714 1943 if (PL_reg_eval_set)
acfe0abc 1944 restore_pos(aTHX_ 0);
d6a28714
JH
1945 return 0;
1946}
1947
1948/*
1949 - regtry - try match at specific point
1950 */
1951STATIC I32 /* 0 failure, 1 success */
1952S_regtry(pTHX_ regexp *prog, char *startpos)
1953{
d6a28714
JH
1954 register I32 i;
1955 register I32 *sp;
1956 register I32 *ep;
1957 CHECKPOINT lastcp;
1958
02db2b7b
IZ
1959#ifdef DEBUGGING
1960 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1961#endif
d6a28714
JH
1962 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1963 MAGIC *mg;
1964
1965 PL_reg_eval_set = RS_init;
1966 DEBUG_r(DEBUG_s(
b900a521
JH
1967 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1968 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 1969 ));
e8347627 1970 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
1971 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1972 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1973 SAVETMPS;
1974 /* Apparently this is not needed, judging by wantarray. */
e8347627 1975 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
1976 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1977
1978 if (PL_reg_sv) {
1979 /* Make $_ available to executed code. */
1980 if (PL_reg_sv != DEFSV) {
4d1ff10f 1981 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
1982 SAVESPTR(DEFSV);
1983 DEFSV = PL_reg_sv;
b8c5462f 1984 }
d6a28714 1985
9041c2e3 1986 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 1987 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 1988 /* prepare for quick setting of pos */
14befaf4
DM
1989 sv_magic(PL_reg_sv, (SV*)0,
1990 PERL_MAGIC_regex_global, Nullch, 0);
1991 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 1992 mg->mg_len = -1;
b8c5462f 1993 }
d6a28714
JH
1994 PL_reg_magic = mg;
1995 PL_reg_oldpos = mg->mg_len;
c76ac1ee 1996 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 1997 }
09687e5a 1998 if (!PL_reg_curpm) {
0f79a09d 1999 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2000#ifdef USE_ITHREADS
2001 {
2002 SV* repointer = newSViv(0);
577e12cc 2003 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2004 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2005 av_push(PL_regex_padav,repointer);
2006 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2007 PL_regex_pad = AvARRAY(PL_regex_padav);
2008 }
2009#endif
2010 }
aaa362c4 2011 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2012 PL_reg_oldcurpm = PL_curpm;
2013 PL_curpm = PL_reg_curpm;
2014 if (RX_MATCH_COPIED(prog)) {
2015 /* Here is a serious problem: we cannot rewrite subbeg,
2016 since it may be needed if this match fails. Thus
2017 $` inside (?{}) could fail... */
2018 PL_reg_oldsaved = prog->subbeg;
2019 PL_reg_oldsavedlen = prog->sublen;
2020 RX_MATCH_COPIED_off(prog);
2021 }
2022 else
2023 PL_reg_oldsaved = Nullch;
2024 prog->subbeg = PL_bostr;
2025 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2026 }
2027 prog->startp[0] = startpos - PL_bostr;
2028 PL_reginput = startpos;
2029 PL_regstartp = prog->startp;
2030 PL_regendp = prog->endp;
2031 PL_reglastparen = &prog->lastparen;
a01268b5 2032 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714
JH
2033 prog->lastparen = 0;
2034 PL_regsize = 0;
2035 DEBUG_r(PL_reg_starttry = startpos);
2036 if (PL_reg_start_tmpl <= prog->nparens) {
2037 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2038 if(PL_reg_start_tmp)
2039 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2040 else
2041 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2042 }
2043
128e8167
JH
2044#ifdef DEBUGGING
2045 sv_setpvn(PERL_DEBUG_PAD(0), "", 0);
2046 sv_setpvn(PERL_DEBUG_PAD(1), "", 0);
2047 sv_setpvn(PERL_DEBUG_PAD(2), "", 0);
2048#endif
2049
d6a28714
JH
2050 /* XXXX What this code is doing here?!!! There should be no need
2051 to do this again and again, PL_reglastparen should take care of
3dd2943c 2052 this! --ilya*/
dafc8851
JH
2053
2054 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2055 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2056 * PL_reglastparen), is not needed at all by the test suite
2057 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2058 * enough, for building DynaLoader, or otherwise this
2059 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2060 * will happen. Meanwhile, this code *is* needed for the
2061 * above-mentioned test suite tests to succeed. The common theme
2062 * on those tests seems to be returning null fields from matches.
2063 * --jhi */
dafc8851 2064#if 1
d6a28714
JH
2065 sp = prog->startp;
2066 ep = prog->endp;
2067 if (prog->nparens) {
09e8ae3b 2068 for (i = prog->nparens; i > *PL_reglastparen; i--) {
d6a28714
JH
2069 *++sp = -1;
2070 *++ep = -1;
2071 }
2072 }
dafc8851 2073#endif
02db2b7b 2074 REGCP_SET(lastcp);
d6a28714
JH
2075 if (regmatch(prog->program + 1)) {
2076 prog->endp[0] = PL_reginput - PL_bostr;
2077 return 1;
2078 }
02db2b7b 2079 REGCP_UNWIND(lastcp);
d6a28714
JH
2080 return 0;
2081}
2082
02db2b7b
IZ
2083#define RE_UNWIND_BRANCH 1
2084#define RE_UNWIND_BRANCHJ 2
2085
2086union re_unwind_t;
2087
2088typedef struct { /* XX: makes sense to enlarge it... */
2089 I32 type;
2090 I32 prev;
2091 CHECKPOINT lastcp;
2092} re_unwind_generic_t;
2093
2094typedef struct {
2095 I32 type;
2096 I32 prev;
2097 CHECKPOINT lastcp;
2098 I32 lastparen;
2099 regnode *next;
2100 char *locinput;
2101 I32 nextchr;
2102#ifdef DEBUGGING
2103 int regindent;
2104#endif
2105} re_unwind_branch_t;
2106
2107typedef union re_unwind_t {
2108 I32 type;
2109 re_unwind_generic_t generic;
2110 re_unwind_branch_t branch;
2111} re_unwind_t;
2112
8ba1375e
MJD
2113#define sayYES goto yes
2114#define sayNO goto no
e0f9d4a8 2115#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2116#define sayYES_FINAL goto yes_final
2117#define sayYES_LOUD goto yes_loud
2118#define sayNO_FINAL goto no_final
2119#define sayNO_SILENT goto do_no
2120#define saySAME(x) if (x) goto yes; else goto no
2121
2122#define REPORT_CODE_OFF 24
2123
d6a28714
JH
2124/*
2125 - regmatch - main matching routine
2126 *
2127 * Conceptually the strategy is simple: check to see whether the current
2128 * node matches, call self recursively to see whether the rest matches,
2129 * and then act accordingly. In practice we make some effort to avoid
2130 * recursion, in particular by going through "ordinary" nodes (that don't
2131 * need to know whether the rest of the match failed) by a loop instead of
2132 * by recursion.
2133 */
2134/* [lwall] I've hoisted the register declarations to the outer block in order to
2135 * maybe save a little bit of pushing and popping on the stack. It also takes
2136 * advantage of machines that use a register save mask on subroutine entry.
2137 */
2138STATIC I32 /* 0 failure, 1 success */
2139S_regmatch(pTHX_ regnode *prog)
2140{
d6a28714
JH
2141 register regnode *scan; /* Current node. */
2142 regnode *next; /* Next node. */
2143 regnode *inner; /* Next node in internal branch. */
2144 register I32 nextchr; /* renamed nextchr - nextchar colides with
2145 function of same name */
2146 register I32 n; /* no or next */
b7953727
JH
2147 register I32 ln = 0; /* len or last */
2148 register char *s = Nullch; /* operand or save */
d6a28714 2149 register char *locinput = PL_reginput;
b7953727 2150 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2151 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2152 I32 unwind = 0;
b7953727 2153#if 0
02db2b7b 2154 I32 firstcp = PL_savestack_ix;
b7953727 2155#endif
53c4c00c 2156 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2157#ifdef DEBUGGING
ce333219
JH
2158 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2159 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2160 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2161#endif
02db2b7b 2162
d6a28714
JH
2163#ifdef DEBUGGING
2164 PL_regindent++;
2165#endif
2166
2167 /* Note that nextchr is a byte even in UTF */
2168 nextchr = UCHARAT(locinput);
2169 scan = prog;
2170 while (scan != NULL) {
8ba1375e 2171
2a782b5b 2172 DEBUG_r( {
d6a28714
JH
2173 SV *prop = sv_newmortal();
2174 int docolor = *PL_colors[0];
2175 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2176 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2177 /* The part of the string before starttry has one color
2178 (pref0_len chars), between starttry and current
2179 position another one (pref_len - pref0_len chars),
2180 after the current position the third one.
2181 We assume that pref0_len <= pref_len, otherwise we
2182 decrease pref0_len. */
9041c2e3 2183 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2184 ? (5 + taill) - l : locinput - PL_bostr;
2185 int pref0_len;
d6a28714 2186
df1ffd02 2187 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2188 pref_len++;
2189 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2190 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2191 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2192 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2193 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2194 l--;
d6a28714
JH
2195 if (pref0_len < 0)
2196 pref0_len = 0;
2197 if (pref0_len > pref_len)
2198 pref0_len = pref_len;
2199 regprop(prop, scan);
2a782b5b
JH
2200 {
2201 char *s0 =
df1ffd02 2202 do_utf8 ?
2a782b5b 2203 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2204 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2205 locinput - pref_len;
df1ffd02
JH
2206 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2207 char *s1 = do_utf8 ?
2a782b5b 2208 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2209 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2210 locinput - pref_len + pref0_len;
df1ffd02
JH
2211 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2212 char *s2 = do_utf8 ?
2a782b5b 2213 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2214 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2215 locinput;
df1ffd02 2216 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2217 PerlIO_printf(Perl_debug_log,
2218 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2219 (IV)(locinput - PL_bostr),
2220 PL_colors[4],
2221 len0, s0,
2222 PL_colors[5],
2223 PL_colors[2],
2224 len1, s1,
2225 PL_colors[3],
2226 (docolor ? "" : "> <"),
2227 PL_colors[0],
2228 len2, s2,
2229 PL_colors[1],
2230 15 - l - pref_len + 1,
2231 "",
2232 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2233 SvPVX(prop));
2234 }
2235 });
d6a28714
JH
2236
2237 next = scan + NEXT_OFF(scan);
2238 if (next == scan)
2239 next = NULL;
2240
2241 switch (OP(scan)) {
2242 case BOL:
12d33761
HS
2243 if (locinput == PL_bostr || (PL_multiline &&
2244 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2245 {
2246 /* regtill = regbol; */
b8c5462f
JH
2247 break;
2248 }
d6a28714
JH
2249 sayNO;
2250 case MBOL:
12d33761
HS
2251 if (locinput == PL_bostr ||
2252 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2253 {
b8c5462f
JH
2254 break;
2255 }
d6a28714
JH
2256 sayNO;
2257 case SBOL:
c2a73568 2258 if (locinput == PL_bostr)
b8c5462f 2259 break;
d6a28714
JH
2260 sayNO;
2261 case GPOS:
2262 if (locinput == PL_reg_ganch)
2263 break;
2264 sayNO;
2265 case EOL:
2266 if (PL_multiline)
2267 goto meol;
2268 else
2269 goto seol;
2270 case MEOL:
2271 meol:
2272 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2273 sayNO;
b8c5462f 2274 break;
d6a28714
JH
2275 case SEOL:
2276 seol:
2277 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2278 sayNO;
d6a28714 2279 if (PL_regeol - locinput > 1)
b8c5462f 2280 sayNO;
b8c5462f 2281 break;
d6a28714
JH
2282 case EOS:
2283 if (PL_regeol != locinput)
b8c5462f 2284 sayNO;
d6a28714 2285 break;
ffc61ed2 2286 case SANY:
d6a28714 2287 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2288 sayNO;
f33976b4
DB
2289 if (do_utf8) {
2290 locinput += PL_utf8skip[nextchr];
2291 if (locinput > PL_regeol)
2292 sayNO;
2293 nextchr = UCHARAT(locinput);
2294 }
2295 else
2296 nextchr = UCHARAT(++locinput);
2297 break;
2298 case CANY:
2299 if (!nextchr && locinput >= PL_regeol)
2300 sayNO;
b8c5462f 2301 nextchr = UCHARAT(++locinput);
a0d0e21e 2302 break;
ffc61ed2 2303 case REG_ANY:
1aa99e6b
IH
2304 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2305 sayNO;
2306 if (do_utf8) {
b8c5462f 2307 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2308 if (locinput > PL_regeol)
2309 sayNO;
a0ed51b3 2310 nextchr = UCHARAT(locinput);
a0ed51b3 2311 }
1aa99e6b
IH
2312 else
2313 nextchr = UCHARAT(++locinput);
a0ed51b3 2314 break;
d6a28714 2315 case EXACT:
cd439c50
IZ
2316 s = STRING(scan);
2317 ln = STR_LEN(scan);
1aa99e6b 2318 if (do_utf8 != (UTF!=0)) {
bc517b45 2319 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2320 char *l = locinput;
2321 char *e = s + ln;
bc517b45 2322 STRLEN ulen;
a72c7584 2323
5ff6fc6d
JH
2324 if (do_utf8) {
2325 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2326 while (s < e) {
2327 if (l >= PL_regeol)
5ff6fc6d
JH
2328 sayNO;
2329 if (NATIVE_TO_UNI(*(U8*)s) !=
bc517b45 2330 utf8_to_uvchr((U8*)l, &ulen))
5ff6fc6d 2331 sayNO;
bc517b45 2332 l += ulen;
5ff6fc6d 2333 s ++;
1aa99e6b 2334 }
5ff6fc6d
JH
2335 }
2336 else {
2337 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2338 while (s < e) {
2339 if (l >= PL_regeol)
2340 sayNO;
5ff6fc6d 2341 if (NATIVE_TO_UNI(*((U8*)l)) !=
bc517b45 2342 utf8_to_uvchr((U8*)s, &ulen))
1aa99e6b 2343 sayNO;
bc517b45 2344 s += ulen;
a72c7584 2345 l ++;
1aa99e6b 2346 }
5ff6fc6d 2347 }
1aa99e6b
IH
2348 locinput = l;
2349 nextchr = UCHARAT(locinput);
2350 break;
2351 }
bc517b45 2352 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2353 /* Inline the first character, for speed. */
2354 if (UCHARAT(s) != nextchr)
2355 sayNO;
2356 if (PL_regeol - locinput < ln)
2357 sayNO;
2358 if (ln > 1 && memNE(s, locinput, ln))
2359 sayNO;
2360 locinput += ln;
2361 nextchr = UCHARAT(locinput);
2362 break;
2363 case EXACTFL:
b8c5462f
JH
2364 PL_reg_flags |= RF_tainted;
2365 /* FALL THROUGH */
d6a28714 2366 case EXACTF:
cd439c50
IZ
2367 s = STRING(scan);
2368 ln = STR_LEN(scan);
d6a28714 2369
d07ddd77
JH
2370 if (do_utf8 || UTF) {
2371 /* Either target or the pattern are utf8. */
d6a28714 2372 char *l = locinput;
d07ddd77 2373 char *e = PL_regeol;
bc517b45 2374
d07ddd77 2375 if (ibcmp_utf8(s, 0, ln, do_utf8,
5486206c
JH
2376 l, &e, 0, UTF)) {
2377 /* One more case for the sharp s:
2378 * pack("U0U*", 0xDF) =~ /ss/i,
2379 * the 0xC3 0x9F are the UTF-8
2380 * byte sequence for the U+00DF. */
2381 if (!(do_utf8 &&
2382 toLOWER(s[0]) == 's' &&
2383 ln >= 2 &&
2384 toLOWER(s[1]) == 's' &&
2385 (U8)l[0] == 0xC3 &&
2386 e - l >= 2 &&
2387 (U8)l[1] == 0x9F))
2388 sayNO;
2389 }
d07ddd77
JH
2390 locinput = e;
2391 nextchr = UCHARAT(locinput);
2392 break;
a0ed51b3 2393 }
d6a28714 2394
bc517b45
JH
2395 /* Neither the target and the pattern are utf8. */
2396
d6a28714
JH
2397 /* Inline the first character, for speed. */
2398 if (UCHARAT(s) != nextchr &&
2399 UCHARAT(s) != ((OP(scan) == EXACTF)
2400 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2401 sayNO;
d6a28714 2402 if (PL_regeol - locinput < ln)
b8c5462f 2403 sayNO;
d6a28714
JH
2404 if (ln > 1 && (OP(scan) == EXACTF
2405 ? ibcmp(s, locinput, ln)
2406 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2407 sayNO;
d6a28714
JH
2408 locinput += ln;
2409 nextchr = UCHARAT(locinput);
a0d0e21e 2410 break;
d6a28714 2411 case ANYOF:
ffc61ed2 2412 if (do_utf8) {
9e55ce06
JH
2413 STRLEN inclasslen = PL_regeol - locinput;
2414
2415 if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2416 sayNO_ANYOF;
ffc61ed2
JH
2417 if (locinput >= PL_regeol)
2418 sayNO;
9e55ce06 2419 locinput += inclasslen;
b8c5462f 2420 nextchr = UCHARAT(locinput);
e0f9d4a8 2421 break;
ffc61ed2
JH
2422 }
2423 else {
2424 if (nextchr < 0)
2425 nextchr = UCHARAT(locinput);
2426 if (!reginclass(scan, (U8*)locinput, do_utf8))
e0f9d4a8 2427 sayNO_ANYOF;
ffc61ed2
JH
2428 if (!nextchr && locinput >= PL_regeol)
2429 sayNO;
2430 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2431 break;
2432 }
2433 no_anyof:
2434 /* If we might have the case of the German sharp s
2435 * in a casefolding Unicode character class. */
2436
2437 if (ANYOF_UNICODE_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2438 locinput += 2;
2439 nextchr = UCHARAT(locinput);
ffc61ed2 2440 }
e0f9d4a8
JH
2441 else
2442 sayNO;
b8c5462f 2443 break;
d6a28714 2444 case ALNUML:
b8c5462f
JH
2445 PL_reg_flags |= RF_tainted;
2446 /* FALL THROUGH */
d6a28714 2447 case ALNUM:
b8c5462f 2448 if (!nextchr)
4633a7c4 2449 sayNO;
ffc61ed2 2450 if (do_utf8) {
ad24be35 2451 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2452 if (!(OP(scan) == ALNUM
3568d838 2453 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2454 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2455 {
2456 sayNO;
a0ed51b3 2457 }
b8c5462f 2458 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2459 nextchr = UCHARAT(locinput);
2460 break;
2461 }
ffc61ed2 2462 if (!(OP(scan) == ALNUM
d6a28714 2463 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2464 sayNO;
b8c5462f 2465 nextchr = UCHARAT(++locinput);
a0d0e21e 2466 break;
d6a28714 2467 case NALNUML:
b8c5462f
JH
2468 PL_reg_flags |= RF_tainted;
2469 /* FALL THROUGH */
d6a28714
JH
2470 case NALNUM:
2471 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2472 sayNO;
ffc61ed2 2473 if (do_utf8) {
8269fa76 2474 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2475 if (OP(scan) == NALNUM
3568d838 2476 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2477 : isALNUM_LC_utf8((U8*)locinput))
2478 {
b8c5462f 2479 sayNO;
d6a28714 2480 }
b8c5462f
JH
2481 locinput += PL_utf8skip[nextchr];
2482 nextchr = UCHARAT(locinput);
2483 break;
2484 }
ffc61ed2 2485 if (OP(scan) == NALNUM
d6a28714 2486 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2487 sayNO;
76e3520e 2488 nextchr = UCHARAT(++locinput);
a0d0e21e 2489 break;
d6a28714
JH
2490 case BOUNDL:
2491 case NBOUNDL:
3280af22 2492 PL_reg_flags |= RF_tainted;
bbce6d69 2493 /* FALL THROUGH */
d6a28714
JH
2494 case BOUND:
2495 case NBOUND:
2496 /* was last char in word? */
ffc61ed2 2497 if (do_utf8) {
12d33761
HS
2498 if (locinput == PL_bostr)
2499 ln = '\n';
ffc61ed2
JH
2500 else {
2501 U8 *r = reghop((U8*)locinput, -1);
9041c2e3 2502
2b9d42f0 2503 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
ffc61ed2
JH
2504 }
2505 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2506 ln = isALNUM_uni(ln);
8269fa76 2507 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2508 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2509 }
2510 else {
9041c2e3 2511 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2512 n = isALNUM_LC_utf8((U8*)locinput);
2513 }
a0ed51b3 2514 }
d6a28714 2515 else {
12d33761
HS
2516 ln = (locinput != PL_bostr) ?
2517 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2518 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2519 ln = isALNUM(ln);
2520 n = isALNUM(nextchr);
2521 }
2522 else {
2523 ln = isALNUM_LC(ln);
2524 n = isALNUM_LC(nextchr);
2525 }
d6a28714 2526 }
ffc61ed2
JH
2527 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2528 OP(scan) == BOUNDL))
2529 sayNO;
a0ed51b3 2530 break;
d6a28714 2531 case SPACEL:
3280af22 2532 PL_reg_flags |= RF_tainted;
bbce6d69 2533 /* FALL THROUGH */
d6a28714 2534 case SPACE:
9442cb0e 2535 if (!nextchr)
4633a7c4 2536 sayNO;
1aa99e6b 2537 if (do_utf8) {
fd400ab9 2538 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2539 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2540 if (!(OP(scan) == SPACE
3568d838 2541 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2542 : isSPACE_LC_utf8((U8*)locinput)))
2543 {
2544 sayNO;
2545 }
2546 locinput += PL_utf8skip[nextchr];
2547 nextchr = UCHARAT(locinput);
2548 break;
d6a28714 2549 }
ffc61ed2
JH
2550 if (!(OP(scan) == SPACE
2551 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2552 sayNO;
2553 nextchr = UCHARAT(++locinput);
2554 }
2555 else {
2556 if (!(OP(scan) == SPACE
2557 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2558 sayNO;
2559 nextchr = UCHARAT(++locinput);
a0ed51b3 2560 }
a0ed51b3 2561 break;
d6a28714 2562 case NSPACEL:
3280af22 2563 PL_reg_flags |= RF_tainted;
bbce6d69 2564 /* FALL THROUGH */
d6a28714 2565 case NSPACE:
9442cb0e 2566 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2567 sayNO;
1aa99e6b 2568 if (do_utf8) {
8269fa76 2569 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2570 if (OP(scan) == NSPACE
3568d838 2571 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2572 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2573 {
2574 sayNO;
2575 }
2576 locinput += PL_utf8skip[nextchr];
2577 nextchr = UCHARAT(locinput);
2578 break;
a0ed51b3 2579 }
ffc61ed2 2580 if (OP(scan) == NSPACE
d6a28714 2581 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2582 sayNO;
76e3520e 2583 nextchr = UCHARAT(++locinput);
a0d0e21e 2584 break;
d6a28714 2585 case DIGITL:
a0ed51b3
LW
2586 PL_reg_flags |= RF_tainted;
2587 /* FALL THROUGH */
d6a28714 2588 case DIGIT:
9442cb0e 2589 if (!nextchr)
a0ed51b3 2590 sayNO;
1aa99e6b 2591 if (do_utf8) {
8269fa76 2592 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2593 if (!(OP(scan) == DIGIT
3568d838 2594 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2595 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2596 {
a0ed51b3 2597 sayNO;
dfe13c55 2598 }
6f06b55f 2599 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2600 nextchr = UCHARAT(locinput);
2601 break;
2602 }
ffc61ed2 2603 if (!(OP(scan) == DIGIT
9442cb0e 2604 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2605 sayNO;
2606 nextchr = UCHARAT(++locinput);
2607 break;
d6a28714 2608 case NDIGITL:
b8c5462f
JH
2609 PL_reg_flags |= RF_tainted;
2610 /* FALL THROUGH */
d6a28714 2611 case NDIGIT:
9442cb0e 2612 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2613 sayNO;
1aa99e6b 2614 if (do_utf8) {
8269fa76 2615 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2616 if (OP(scan) == NDIGIT
3568d838 2617 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2618 : isDIGIT_LC_utf8((U8*)locinput))
2619 {
a0ed51b3 2620 sayNO;
9442cb0e 2621 }
6f06b55f 2622 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2623 nextchr = UCHARAT(locinput);
2624 break;
2625 }
ffc61ed2 2626 if (OP(scan) == NDIGIT
9442cb0e 2627 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2628 sayNO;
2629 nextchr = UCHARAT(++locinput);
2630 break;
2631 case CLUMP:
b7c83a7e 2632 if (locinput >= PL_regeol)
a0ed51b3 2633 sayNO;
b7c83a7e
JH
2634 if (do_utf8) {
2635 LOAD_UTF8_CHARCLASS(mark,"~");
2636 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2637 sayNO;
2638 locinput += PL_utf8skip[nextchr];
2639 while (locinput < PL_regeol &&
2640 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2641 locinput += UTF8SKIP(locinput);
2642 if (locinput > PL_regeol)
2643 sayNO;
eb08e2da
JH
2644 }
2645 else
2646 locinput++;
a0ed51b3
LW
2647 nextchr = UCHARAT(locinput);
2648 break;
c8756f30 2649 case REFFL:
3280af22 2650 PL_reg_flags |= RF_tainted;
c8756f30 2651 /* FALL THROUGH */
c277df42 2652 case REF:
c8756f30 2653 case REFF:
c277df42 2654 n = ARG(scan); /* which paren pair */
cf93c79d 2655 ln = PL_regstartp[n];
2c2d71f5 2656 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
cf93c79d 2657 if (*PL_reglastparen < n || ln == -1)
af3f8c16 2658 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2659 if (ln == PL_regendp[n])
a0d0e21e 2660 break;
a0ed51b3 2661
cf93c79d 2662 s = PL_bostr + ln;
1aa99e6b 2663 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2664 char *l = locinput;
cf93c79d 2665 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2666 /*
2667 * Note that we can't do the "other character" lookup trick as
2668 * in the 8-bit case (no pun intended) because in Unicode we
2669 * have to map both upper and title case to lower case.
2670 */
2671 if (OP(scan) == REFF) {
a2a2844f 2672 STRLEN ulen1, ulen2;
e7ae6809
JH
2673 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2674 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2675 while (s < e) {
2676 if (l >= PL_regeol)
2677 sayNO;
a2a2844f
JH
2678 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2679 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2680 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2681 sayNO;
a2a2844f
JH
2682 s += ulen1;
2683 l += ulen2;
a0ed51b3
LW
2684 }
2685 }
2686 locinput = l;
2687 nextchr = UCHARAT(locinput);
2688 break;
2689 }
2690
a0d0e21e 2691 /* Inline the first character, for speed. */
76e3520e 2692 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2693 (OP(scan) == REF ||
2694 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2695 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2696 sayNO;
cf93c79d 2697 ln = PL_regendp[n] - ln;
3280af22 2698 if (locinput + ln > PL_regeol)
4633a7c4 2699 sayNO;
c8756f30
AK
2700 if (ln > 1 && (OP(scan) == REF
2701 ? memNE(s, locinput, ln)
2702 : (OP(scan) == REFF
2703 ? ibcmp(s, locinput, ln)
2704 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2705 sayNO;
a0d0e21e 2706 locinput += ln;
76e3520e 2707 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2708 break;
2709
2710 case NOTHING:
c277df42 2711 case TAIL:
a0d0e21e
LW
2712 break;
2713 case BACK:
2714 break;
c277df42
IZ
2715 case EVAL:
2716 {
2717 dSP;
533c011a 2718 OP_4tree *oop = PL_op;
3280af22
NIS
2719 COP *ocurcop = PL_curcop;
2720 SV **ocurpad = PL_curpad;
c277df42 2721 SV *ret;
9041c2e3 2722
c277df42 2723 n = ARG(scan);
533c011a 2724 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2725 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
dfad63ad 2726 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
cf93c79d 2727 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2728
8e5e9ebe
RGS
2729 {
2730 SV **before = SP;
2731 CALLRUNOPS(aTHX); /* Scalar context. */
2732 SPAGAIN;
2733 if (SP == before)
2734 ret = Nullsv; /* protect against empty (?{}) blocks. */
2735 else {
2736 ret = POPs;
2737 PUTBACK;
2738 }
2739 }
2740
0f5d15d6
IZ
2741 PL_op = oop;
2742 PL_curpad = ocurpad;
2743 PL_curcop = ocurcop;
c277df42 2744 if (logical) {
0f5d15d6
IZ
2745 if (logical == 2) { /* Postponed subexpression. */
2746 regexp *re;
22c35a8c 2747 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2748 re_cc_state state;
0f5d15d6
IZ
2749 CHECKPOINT cp, lastcp;
2750
2751 if(SvROK(ret) || SvRMAGICAL(ret)) {
2752 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2753
2754 if(SvMAGICAL(sv))
14befaf4 2755 mg = mg_find(sv, PERL_MAGIC_qr);
0f5d15d6
IZ
2756 }
2757 if (mg) {
2758 re = (regexp *)mg->mg_obj;
df0003d4 2759 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2760 }
2761 else {
2762 STRLEN len;
2763 char *t = SvPV(ret, len);
2764 PMOP pm;
2765 char *oprecomp = PL_regprecomp;
2766 I32 osize = PL_regsize;
2767 I32 onpar = PL_regnpar;
2768
5fcd1c1b 2769 Zero(&pm, 1, PMOP);
cea2e8a9 2770 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2771 if (!(SvFLAGS(ret)
0f5d15d6 2772 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
14befaf4
DM
2773 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2774 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2775 PL_regprecomp = oprecomp;
2776 PL_regsize = osize;
2777 PL_regnpar = onpar;
2778 }
2779 DEBUG_r(
9041c2e3 2780 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2781 "Entering embedded `%s%.60s%s%s'\n",
2782 PL_colors[0],
2783 re->precomp,
2784 PL_colors[1],
2785 (strlen(re->precomp) > 60 ? "..." : ""))
2786 );
2787 state.node = next;
2788 state.prev = PL_reg_call_cc;
2789 state.cc = PL_regcc;
2790 state.re = PL_reg_re;
2791
2ab05381 2792 PL_regcc = 0;
9041c2e3 2793
0f5d15d6 2794 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2795 REGCP_SET(lastcp);
0f5d15d6
IZ
2796 cache_re(re);
2797 state.ss = PL_savestack_ix;
2798 *PL_reglastparen = 0;
a01268b5 2799 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2800 PL_reg_call_cc = &state;
2801 PL_reginput = locinput;
2c2d71f5
JH
2802
2803 /* XXXX This is too dramatic a measure... */
2804 PL_reg_maxiter = 0;
2805
0f5d15d6 2806 if (regmatch(re->program + 1)) {
2c914db6
IZ
2807 /* Even though we succeeded, we need to restore
2808 global variables, since we may be wrapped inside
2809 SUSPEND, thus the match may be not finished yet. */
2810
2811 /* XXXX Do this only if SUSPENDed? */
2812 PL_reg_call_cc = state.prev;
2813 PL_regcc = state.cc;
2814 PL_reg_re = state.re;
2815 cache_re(PL_reg_re);
2816
2817 /* XXXX This is too dramatic a measure... */
2818 PL_reg_maxiter = 0;
2819
2820 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2821 ReREFCNT_dec(re);
2822 regcpblow(cp);
2823 sayYES;
2824 }
0f5d15d6 2825 ReREFCNT_dec(re);
02db2b7b 2826 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2827 regcppop();
2828 PL_reg_call_cc = state.prev;
2829 PL_regcc = state.cc;
2830 PL_reg_re = state.re;
d3790889 2831 cache_re(PL_reg_re);
2c2d71f5
JH
2832
2833 /* XXXX This is too dramatic a measure... */
2834 PL_reg_maxiter = 0;
2835
8e514ae6 2836 logical = 0;
0f5d15d6
IZ
2837 sayNO;
2838 }
c277df42 2839 sw = SvTRUE(ret);
0f5d15d6 2840 logical = 0;
a0ed51b3
LW
2841 }
2842 else
3280af22 2843 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2844 break;
2845 }
a0d0e21e 2846 case OPEN:
c277df42 2847 n = ARG(scan); /* which paren pair */
3280af22
NIS
2848 PL_reg_start_tmp[n] = locinput;
2849 if (n > PL_regsize)
2850 PL_regsize = n;
a0d0e21e
LW
2851 break;
2852 case CLOSE:
c277df42 2853 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2854 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2855 PL_regendp[n] = locinput - PL_bostr;
3280af22
NIS
2856 if (n > *PL_reglastparen)
2857 *PL_reglastparen = n;
a01268b5 2858 *PL_reglastcloseparen = n;
a0d0e21e 2859 break;
c277df42
IZ
2860 case GROUPP:
2861 n = ARG(scan); /* which paren pair */
cf93c79d 2862 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2863 break;
2864 case IFTHEN:
2c2d71f5 2865 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2866 if (sw)
2867 next = NEXTOPER(NEXTOPER(scan));
2868 else {
2869 next = scan + ARG(scan);
2870 if (OP(next) == IFTHEN) /* Fake one. */
2871 next = NEXTOPER(NEXTOPER(next));
2872 }
2873 break;
2874 case LOGICAL:
0f5d15d6 2875 logical = scan->flags;
c277df42 2876 break;
2ab05381
IZ
2877/*******************************************************************
2878 PL_regcc contains infoblock about the innermost (...)* loop, and
2879 a pointer to the next outer infoblock.
2880
2881 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2882
2883 1) After matching X, regnode for CURLYX is processed;
2884
9041c2e3 2885 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
2886 regmatch() recursively with the starting point at WHILEM node;
2887
2888 3) Each hit of WHILEM node tries to match A and Z (in the order
2889 depending on the current iteration, min/max of {min,max} and
2890 greediness). The information about where are nodes for "A"
2891 and "Z" is read from the infoblock, as is info on how many times "A"
2892 was already matched, and greediness.
2893
2894 4) After A matches, the same WHILEM node is hit again.
2895
2896 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2897 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2898 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2899 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2900 of the external loop.
2901
2902 Currently present infoblocks form a tree with a stem formed by PL_curcc
2903 and whatever it mentions via ->next, and additional attached trees
2904 corresponding to temporarily unset infoblocks as in "5" above.
2905
9041c2e3 2906 In the following picture infoblocks for outer loop of
2ab05381
IZ
2907 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2908 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2909 infoblocks are drawn below the "reset" infoblock.
2910
2911 In fact in the picture below we do not show failed matches for Z and T
2912 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2913 more obvious *why* one needs to *temporary* unset infoblocks.]
2914
2915 Matched REx position InfoBlocks Comment
2916 (Y(A)*?Z)*?T x
2917 Y(A)*?Z)*?T x <- O
2918 Y (A)*?Z)*?T x <- O
2919 Y A)*?Z)*?T x <- O <- I
2920 YA )*?Z)*?T x <- O <- I
2921 YA A)*?Z)*?T x <- O <- I
2922 YAA )*?Z)*?T x <- O <- I
2923 YAA Z)*?T x <- O # Temporary unset I
2924 I
2925
2926 YAAZ Y(A)*?Z)*?T x <- O
2927 I
2928
2929 YAAZY (A)*?Z)*?T x <- O
2930 I
2931
2932 YAAZY A)*?Z)*?T x <- O <- I
2933 I
2934
2935 YAAZYA )*?Z)*?T x <- O <- I
2936 I
2937
2938 YAAZYA Z)*?T x <- O # Temporary unset I
2939 I,I
2940
2941 YAAZYAZ )*?T x <- O
2942 I,I
2943
2944 YAAZYAZ T x # Temporary unset O
2945 O
2946 I,I
2947
2948 YAAZYAZT x
2949 O
2950 I,I
2951 *******************************************************************/
a0d0e21e
LW
2952 case CURLYX: {
2953 CURCUR cc;
3280af22 2954 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
2955 /* No need to save/restore up to this paren */
2956 I32 parenfloor = scan->flags;
c277df42
IZ
2957
2958 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2959 next += ARG(next);
3280af22
NIS
2960 cc.oldcc = PL_regcc;
2961 PL_regcc = &cc;
cb434fcc
IZ
2962 /* XXXX Probably it is better to teach regpush to support
2963 parenfloor > PL_regsize... */
2964 if (parenfloor > *PL_reglastparen)
2965 parenfloor = *PL_reglastparen; /* Pessimization... */
2966 cc.parenfloor = parenfloor;
a0d0e21e
LW
2967 cc.cur = -1;
2968 cc.min = ARG1(scan);
2969 cc.max = ARG2(scan);
c277df42 2970 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
2971 cc.next = next;
2972 cc.minmod = minmod;
2973 cc.lastloc = 0;
3280af22 2974 PL_reginput = locinput;
a0d0e21e
LW
2975 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2976 regcpblow(cp);
3280af22 2977 PL_regcc = cc.oldcc;
4633a7c4 2978 saySAME(n);
a0d0e21e
LW
2979 }
2980 /* NOT REACHED */
2981 case WHILEM: {
2982 /*
2983 * This is really hard to understand, because after we match
2984 * what we're trying to match, we must make sure the rest of
2c2d71f5 2985 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
2986 * to go back UP the parse tree by recursing ever deeper. And
2987 * if it fails, we have to reset our parent's current state
2988 * that we can try again after backing off.
2989 */
2990
c277df42 2991 CHECKPOINT cp, lastcp;
3280af22 2992 CURCUR* cc = PL_regcc;
c277df42
IZ
2993 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2994
4633a7c4 2995 n = cc->cur + 1; /* how many we know we matched */
3280af22 2996 PL_reginput = locinput;
a0d0e21e 2997
c277df42 2998 DEBUG_r(
9041c2e3
NIS
2999 PerlIO_printf(Perl_debug_log,
3000 "%*s %ld out of %ld..%ld cc=%lx\n",
3280af22 3001 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3002 (long)n, (long)cc->min,
c277df42
IZ
3003 (long)cc->max, (long)cc)
3004 );
4633a7c4 3005
a0d0e21e
LW
3006 /* If degenerate scan matches "", assume scan done. */
3007
579cf2c3 3008 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3009 PL_regcc = cc->oldcc;
2ab05381
IZ
3010 if (PL_regcc)
3011 ln = PL_regcc->cur;
c277df42 3012 DEBUG_r(
c3464db5
DD
3013 PerlIO_printf(Perl_debug_log,
3014 "%*s empty match detected, try continuation...\n",
3280af22 3015 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3016 );
a0d0e21e 3017 if (regmatch(cc->next))
4633a7c4 3018 sayYES;
2ab05381
IZ
3019 if (PL_regcc)
3020 PL_regcc->cur = ln;
3280af22 3021 PL_regcc = cc;
4633a7c4 3022 sayNO;
a0d0e21e
LW
3023 }
3024
3025 /* First just match a string of min scans. */
3026
3027 if (n < cc->min) {
3028 cc->cur = n;
3029 cc->lastloc = locinput;
4633a7c4
LW
3030 if (regmatch(cc->scan))
3031 sayYES;
3032 cc->cur = n - 1;
c277df42 3033 cc->lastloc = lastloc;
4633a7c4 3034 sayNO;
a0d0e21e
LW
3035 }
3036
2c2d71f5
JH
3037 if (scan->flags) {
3038 /* Check whether we already were at this position.
3039 Postpone detection until we know the match is not
3040 *that* much linear. */
3041 if (!PL_reg_maxiter) {
3042 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3043 PL_reg_leftiter = PL_reg_maxiter;
3044 }
3045 if (PL_reg_leftiter-- == 0) {
3046 I32 size = (PL_reg_maxiter + 7)/8;
3047 if (PL_reg_poscache) {
3048 if (PL_reg_poscache_size < size) {
3049 Renew(PL_reg_poscache, size, char);
3050 PL_reg_poscache_size = size;
3051 }
3052 Zero(PL_reg_poscache, size, char);
3053 }
3054 else {
3055 PL_reg_poscache_size = size;
3056 Newz(29, PL_reg_poscache, size, char);
3057 }
3058 DEBUG_r(
3059 PerlIO_printf(Perl_debug_log,
3060 "%sDetected a super-linear match, switching on caching%s...\n",
3061 PL_colors[4], PL_colors[5])
3062 );
3063 }
3064 if (PL_reg_leftiter < 0) {
3065 I32 o = locinput - PL_bostr, b;
3066
3067 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3068 b = o % 8;
3069 o /= 8;
3070 if (PL_reg_poscache[o] & (1<<b)) {
3071 DEBUG_r(
3072 PerlIO_printf(Perl_debug_log,
3073 "%*s already tried at this position...\n",
3074 REPORT_CODE_OFF+PL_regindent*2, "")
3075 );
7821416a 3076 sayNO_SILENT;
2c2d71f5
JH
3077 }
3078 PL_reg_poscache[o] |= (1<<b);
3079 }
3080 }
3081
a0d0e21e
LW
3082 /* Prefer next over scan for minimal matching. */
3083
3084 if (cc->minmod) {
3280af22 3085 PL_regcc = cc->oldcc;
2ab05381
IZ
3086 if (PL_regcc)
3087 ln = PL_regcc->cur;
5f05dabc 3088 cp = regcppush(cc->parenfloor);
02db2b7b 3089 REGCP_SET(lastcp);
5f05dabc 3090 if (regmatch(cc->next)) {
c277df42 3091 regcpblow(cp);
4633a7c4 3092 sayYES; /* All done. */
5f05dabc 3093 }
02db2b7b 3094 REGCP_UNWIND(lastcp);
5f05dabc 3095 regcppop();
2ab05381
IZ
3096 if (PL_regcc)
3097 PL_regcc->cur = ln;
3280af22 3098 PL_regcc = cc;
a0d0e21e 3099
c277df42 3100 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3101 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3102 && !(PL_reg_flags & RF_warned)) {
3103 PL_reg_flags |= RF_warned;
e476b1b5 3104 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2f3ca594
GS
3105 "Complex regular subexpression recursion",
3106 REG_INFTY - 1);
c277df42 3107 }
4633a7c4 3108 sayNO;
c277df42 3109 }
a687059c 3110
c277df42 3111 DEBUG_r(
c3464db5
DD
3112 PerlIO_printf(Perl_debug_log,
3113 "%*s trying longer...\n",
3280af22 3114 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3115 );
a0d0e21e 3116 /* Try scanning more and see if it helps. */
3280af22 3117 PL_reginput = locinput;
a0d0e21e
LW
3118 cc->cur = n;
3119 cc->lastloc = locinput;
5f05dabc 3120 cp = regcppush(cc->parenfloor);
02db2b7b 3121 REGCP_SET(lastcp);
5f05dabc 3122 if (regmatch(cc->scan)) {
c277df42 3123 regcpblow(cp);
4633a7c4 3124 sayYES;
5f05dabc 3125 }
02db2b7b 3126 REGCP_UNWIND(lastcp);
5f05dabc 3127 regcppop();
4633a7c4 3128 cc->cur = n - 1;
c277df42 3129 cc->lastloc = lastloc;
4633a7c4 3130 sayNO;
a0d0e21e
LW
3131 }
3132
3133 /* Prefer scan over next for maximal matching. */
3134
3135 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3136 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3137 cc->cur = n;
3138 cc->lastloc = locinput;
02db2b7b 3139 REGCP_SET(lastcp);
5f05dabc 3140 if (regmatch(cc->scan)) {
c277df42 3141 regcpblow(cp);
4633a7c4 3142 sayYES;
5f05dabc 3143 }
02db2b7b 3144 REGCP_UNWIND(lastcp);
a0d0e21e 3145 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3146 PL_reginput = locinput;
c277df42 3147 DEBUG_r(
c3464db5
DD
3148 PerlIO_printf(Perl_debug_log,
3149 "%*s failed, try continuation...\n",
3280af22 3150 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3151 );
3152 }
9041c2e3 3153 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3154 && !(PL_reg_flags & RF_warned)) {
3280af22 3155 PL_reg_flags |= RF_warned;
e476b1b5 3156 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
cb5d145d
GS
3157 "Complex regular subexpression recursion",
3158 REG_INFTY - 1);
a0d0e21e
LW
3159 }
3160
3161 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3162 PL_regcc = cc->oldcc;
2ab05381
IZ
3163 if (PL_regcc)
3164 ln = PL_regcc->cur;
a0d0e21e 3165 if (regmatch(cc->next))
4633a7c4 3166 sayYES;
2ab05381
IZ
3167 if (PL_regcc)
3168 PL_regcc->cur = ln;
3280af22 3169 PL_regcc = cc;
4633a7c4 3170 cc->cur = n - 1;
c277df42 3171 cc->lastloc = lastloc;
4633a7c4 3172 sayNO;
a0d0e21e
LW
3173 }
3174 /* NOT REACHED */
9041c2e3 3175 case BRANCHJ:
c277df42
IZ
3176 next = scan + ARG(scan);
3177 if (next == scan)
3178 next = NULL;
3179 inner = NEXTOPER(NEXTOPER(scan));
3180 goto do_branch;
9041c2e3 3181 case BRANCH:
c277df42
IZ
3182 inner = NEXTOPER(scan);
3183 do_branch:
3184 {
c277df42
IZ
3185 c1 = OP(scan);
3186 if (OP(next) != c1) /* No choice. */
3187 next = inner; /* Avoid recursion. */
a0d0e21e 3188 else {
02db2b7b
IZ
3189 I32 lastparen = *PL_reglastparen;
3190 I32 unwind1;
3191 re_unwind_branch_t *uw;
3192
3193 /* Put unwinding data on stack */
3194 unwind1 = SSNEWt(1,re_unwind_branch_t);
3195 uw = SSPTRt(unwind1,re_unwind_branch_t);
3196 uw->prev = unwind;
3197 unwind = unwind1;
3198 uw->type = ((c1 == BRANCH)
3199 ? RE_UNWIND_BRANCH
3200 : RE_UNWIND_BRANCHJ);
3201 uw->lastparen = lastparen;
3202 uw->next = next;
3203 uw->locinput = locinput;
3204 uw->nextchr = nextchr;
3205#ifdef DEBUGGING
3206 uw->regindent = ++PL_regindent;
3207#endif
c277df42 3208
02db2b7b
IZ
3209 REGCP_SET(uw->lastcp);
3210
3211 /* Now go into the first branch */
3212 next = inner;
a687059c 3213 }
a0d0e21e
LW
3214 }
3215 break;
3216 case MINMOD:
3217 minmod = 1;
3218 break;
c277df42
IZ
3219 case CURLYM:
3220 {
00db4c45 3221 I32 l = 0;
c277df42 3222 CHECKPOINT lastcp;
9041c2e3 3223
c277df42
IZ
3224 /* We suppose that the next guy does not need
3225 backtracking: in particular, it is of constant length,
3226 and has no parenths to influence future backrefs. */
3227 ln = ARG1(scan); /* min to match */
3228 n = ARG2(scan); /* max to match */
c277df42
IZ
3229 paren = scan->flags;
3230 if (paren) {
3280af22
NIS
3231 if (paren > PL_regsize)
3232 PL_regsize = paren;
3233 if (paren > *PL_reglastparen)
3234 *PL_reglastparen = paren;
c277df42 3235 }
dc45a647 3236 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3237 if (paren)
3238 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3239 PL_reginput = locinput;
c277df42
IZ
3240 if (minmod) {
3241 minmod = 0;
3242 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3243 sayNO;
f31a99c8
HS
3244 /* if we matched something zero-length we don't need to
3245 backtrack - capturing parens are already defined, so
3246 the caveat in the maximal case doesn't apply
3247
3248 XXXX if ln == 0, we can redo this check first time
3249 through the following loop
3250 */
3251 if (ln && l == 0)
3252 n = ln; /* don't backtrack */
3280af22 3253 locinput = PL_reginput;
cca55fe3 3254 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3255 regnode *text_node = next;
3256
cca55fe3 3257 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3258
cca55fe3 3259 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3260 else {
cca55fe3
JP
3261 if (PL_regkind[(U8)OP(text_node)] == REF) {
3262 I32 n, ln;
3263 n = ARG(text_node); /* which paren pair */
3264 ln = PL_regstartp[n];
3265 /* assume yes if we haven't seen CLOSEn */
3266 if (
3267 *PL_reglastparen < n ||
3268 ln == -1 ||
3269 ln == PL_regendp[n]
3270 ) {
3271 c1 = c2 = -1000;
3272 goto assume_ok_MM;
3273 }
3274 c1 = *(PL_bostr + ln);
3275 }
3276 else { c1 = (U8)*STRING(text_node); }
af5decee 3277 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3278 c2 = PL_fold[c1];
af5decee 3279 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3280 c2 = PL_fold_locale[c1];
3281 else
3282 c2 = c1;
3283 }
a0ed51b3
LW
3284 }
3285 else
c277df42 3286 c1 = c2 = -1000;
cca55fe3 3287 assume_ok_MM:
02db2b7b 3288 REGCP_SET(lastcp);
5f4b28b2 3289 /* This may be improved if l == 0. */
c277df42
IZ
3290 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3291 /* If it could work, try it. */
3292 if (c1 == -1000 ||
3280af22
NIS
3293 UCHARAT(PL_reginput) == c1 ||
3294 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3295 {
3296 if (paren) {
f31a99c8 3297 if (ln) {
cf93c79d
IZ
3298 PL_regstartp[paren] =
3299 HOPc(PL_reginput, -l) - PL_bostr;
3300 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3301 }
3302 else
cf93c79d 3303 PL_regendp[paren] = -1;
c277df42
IZ
3304 }
3305 if (regmatch(next))
3306 sayYES;
02db2b7b 3307 REGCP_UNWIND(lastcp);
c277df42
IZ
3308 }
3309 /* Couldn't or didn't -- move forward. */
3280af22 3310 PL_reginput = locinput;
c277df42
IZ
3311 if (regrepeat_hard(scan, 1, &l)) {
3312 ln++;
3280af22 3313 locinput = PL_reginput;
c277df42
IZ
3314 }
3315 else
3316 sayNO;
3317 }
a0ed51b3
LW
3318 }
3319 else {
c277df42 3320 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3321 /* if we matched something zero-length we don't need to
3322 backtrack, unless the minimum count is zero and we
3323 are capturing the result - in that case the capture
3324 being defined or not may affect later execution
3325 */
3326 if (n != 0 && l == 0 && !(paren && ln == 0))
3327 ln = n; /* don't backtrack */
3280af22 3328 locinput = PL_reginput;
c277df42 3329 DEBUG_r(
5c0ca799 3330 PerlIO_printf(Perl_debug_log,
faccc32b 3331 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3332 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3333 (IV) n, (IV)l)
c277df42
IZ
3334 );
3335 if (n >= ln) {
cca55fe3 3336 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3337 regnode *text_node = next;
3338
cca55fe3 3339 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3340
cca55fe3 3341 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3342 else {
cca55fe3
JP
3343 if (PL_regkind[(U8)OP(text_node)] == REF) {
3344 I32 n, ln;
3345 n = ARG(text_node); /* which paren pair */
3346 ln = PL_regstartp[n];
3347 /* assume yes if we haven't seen CLOSEn */
3348 if (
3349 *PL_reglastparen < n ||
3350 ln == -1 ||
3351 ln == PL_regendp[n]
3352 ) {
3353 c1 = c2 = -1000;
3354 goto assume_ok_REG;
3355 }
3356 c1 = *(PL_bostr + ln);
3357 }
3358 else { c1 = (U8)*STRING(text_node); }
3359
af5decee 3360 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3361 c2 = PL_fold[c1];
af5decee 3362 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3363 c2 = PL_fold_locale[c1];
3364 else
3365 c2 = c1;
3366 }
a0ed51b3
LW
3367 }
3368 else
c277df42
IZ
3369 c1 = c2 = -1000;
3370 }
cca55fe3 3371 assume_ok_REG:
02db2b7b 3372 REGCP_SET(lastcp);
c277df42
IZ
3373 while (n >= ln) {
3374 /* If it could work, try it. */
3375 if (c1 == -1000 ||
3280af22
NIS
3376 UCHARAT(PL_reginput) == c1 ||
3377 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3378 {
3379 DEBUG_r(
c3464db5 3380 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3381 "%*s trying tail with n=%"IVdf"...\n",
3382 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3383 );
3384 if (paren) {
3385 if (n) {
cf93c79d
IZ
3386 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3387 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3388 }
a0ed51b3 3389 else
cf93c79d 3390 PL_regendp[paren] = -1;
c277df42 3391 }
a0ed51b3
LW
3392 if (regmatch(next))
3393 sayYES;
02db2b7b 3394 REGCP_UNWIND(lastcp);
a0ed51b3 3395 }
c277df42
IZ
3396 /* Couldn't or didn't -- back up. */
3397 n--;
dfe13c55 3398 locinput = HOPc(locinput, -l);
3280af22 3399 PL_reginput = locinput;
c277df42
IZ
3400 }
3401 }
3402 sayNO;
3403 break;
3404 }
3405 case CURLYN:
3406 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3407 if (paren > PL_regsize)
3408 PL_regsize = paren;
3409 if (paren > *PL_reglastparen)
3410 *PL_reglastparen = paren;
c277df42
IZ
3411 ln = ARG1(scan); /* min to match */
3412 n = ARG2(scan); /* max to match */
dc45a647 3413 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3414 goto repeat;
a0d0e21e 3415 case CURLY:
c277df42 3416 paren = 0;
a0d0e21e
LW
3417 ln = ARG1(scan); /* min to match */
3418 n = ARG2(scan); /* max to match */
dc45a647 3419 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3420 goto repeat;
3421 case STAR:
3422 ln = 0;
c277df42 3423 n = REG_INFTY;
a0d0e21e 3424 scan = NEXTOPER(scan);
c277df42 3425 paren = 0;
a0d0e21e
LW
3426 goto repeat;
3427 case PLUS:
c277df42
IZ
3428 ln = 1;
3429 n = REG_INFTY;
3430 scan = NEXTOPER(scan);
3431 paren = 0;
3432 repeat:
a0d0e21e
LW
3433 /*
3434 * Lookahead to avoid useless match attempts
3435 * when we know what character comes next.
3436 */
5f80c4cf
JP
3437
3438 /*
3439 * Used to only do .*x and .*?x, but now it allows
3440 * for )'s, ('s and (?{ ... })'s to be in the way
3441 * of the quantifier and the EXACT-like node. -- japhy
3442 */
3443
cca55fe3 3444 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3445 U8 *s;
3446 regnode *text_node = next;
3447
cca55fe3 3448 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3449
cca55fe3 3450 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3451 else {
cca55fe3
JP
3452 if (PL_regkind[(U8)OP(text_node)] == REF) {
3453 I32 n, ln;
3454 n = ARG(text_node); /* which paren pair */
3455 ln = PL_regstartp[n];
3456 /* assume yes if we haven't seen CLOSEn */
3457 if (
3458 *PL_reglastparen < n ||
3459 ln == -1 ||
3460 ln == PL_regendp[n]
3461 ) {
3462 c1 = c2 = -1000;
3463 goto assume_ok_easy;
3464 }
9246c65e 3465 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3466 }
3467 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3468
3469 if (!UTF) {
3470 c2 = c1 = *s;
f65d3ee7 3471 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3472 c2 = PL_fold[c1];
f65d3ee7 3473 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3474 c2 = PL_fold_locale[c1];
1aa99e6b 3475 }
5f80c4cf 3476 else { /* UTF */
f65d3ee7 3477 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3478 STRLEN ulen1, ulen2;
e7ae6809
JH
3479 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3480 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3481
3482 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3483 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3484
3485 c1 = utf8_to_uvuni(tmpbuf1, 0);
3486 c2 = utf8_to_uvuni(tmpbuf2, 0);
5f80c4cf
JP
3487 }
3488 else {
3489 c2 = c1 = utf8_to_uvchr(s, NULL);
3490 }
1aa99e6b
IH
3491 }
3492 }
bbce6d69 3493 }
a0d0e21e 3494 else
bbce6d69 3495 c1 = c2 = -1000;
cca55fe3 3496 assume_ok_easy:
3280af22 3497 PL_reginput = locinput;
a0d0e21e 3498 if (minmod) {
c277df42 3499 CHECKPOINT lastcp;
a0d0e21e
LW
3500 minmod = 0;
3501 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3502 sayNO;
a0ed51b3 3503 locinput = PL_reginput;
02db2b7b 3504 REGCP_SET(lastcp);
0fe9bf95 3505 if (c1 != -1000) {
1aa99e6b 3506 char *e; /* Should not check after this */
0fe9bf95
IZ
3507 char *old = locinput;
3508
1aa99e6b 3509 if (n == REG_INFTY) {
0fe9bf95 3510 e = PL_regeol - 1;
1aa99e6b
IH
3511 if (do_utf8)
3512 while (UTF8_IS_CONTINUATION(*(U8*)e))
3513 e--;
3514 }
3515 else if (do_utf8) {
3516 int m = n - ln;
3517 for (e = locinput;
3518 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3519 e += UTF8SKIP(e);
3520 }
3521 else {
3522 e = locinput + n - ln;
3523 if (e >= PL_regeol)
3524 e = PL_regeol - 1;
3525 }
0fe9bf95 3526 while (1) {
1aa99e6b 3527 int count;
0fe9bf95 3528 /* Find place 'next' could work */
1aa99e6b
IH
3529 if (!do_utf8) {
3530 if (c1 == c2) {
a8e8ab15
JH
3531 while (locinput <= e &&
3532 UCHARAT(locinput) != c1)
1aa99e6b
IH
3533 locinput++;
3534 } else {
9041c2e3 3535 while (locinput <= e
a8e8ab15
JH
3536 && UCHARAT(locinput) != c1
3537 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3538 locinput++;
3539 }
3540 count = locinput - old;
3541 }
3542 else {
3543 STRLEN len;
3544 if (c1 == c2) {
3545 for (count = 0;
3546 locinput <= e &&
9041c2e3 3547 utf8_to_uvchr((U8*)locinput, &len) != c1;
1aa99e6b
IH
3548 count++)
3549 locinput += len;
3550
3551 } else {
3552 for (count = 0; locinput <= e; count++) {
9041c2e3 3553 UV c = utf8_to_uvchr((U8*)locinput, &len);
1aa99e6b
IH
3554 if (c == c1 || c == c2)
3555 break;
9041c2e3 3556 locinput += len;
1aa99e6b
IH
3557 }
3558 }
0fe9bf95 3559 }
9041c2e3 3560 if (locinput > e)
0fe9bf95
IZ
3561 sayNO;
3562 /* PL_reginput == old now */
3563