This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
another Storable test (Re: perl@16005)
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
be3c0a43 70 **** Copyright (c) 1991-2002, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
90#define UTF (PL_reg_flags & RF_utf8)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
ba7b4546 99#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 100
c277df42
IZ
101/*
102 * Forwards.
103 */
104
33b8afdf 105#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 106#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 107
dfe13c55
GS
108#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
109#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
110#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
111#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
112#define HOPc(pos,off) ((char*)HOP(pos,off))
113#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 114
efb30f32 115#define HOPBACK(pos, off) ( \
e54858b0 116 (PL_reg_match_utf8) \
efb30f32
HS
117 ? reghopmaybe((U8*)pos, -off) \
118 : (pos - off >= PL_bostr) \
119 ? (U8*)(pos - off) \
120 : (U8*)NULL \
121)
122#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
123
1aa99e6b
IH
124#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
125#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
127#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
130
977e4b84 131#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 132
5f80c4cf 133/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
134#define JUMPABLE(rn) ( \
135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
137 OP(rn) == PLUS || OP(rn) == MINMOD || \
138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
139)
140
cca55fe3
JP
141#define HAS_TEXT(rn) ( \
142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
143)
e2d8ce26 144
a84d97b6
HS
145/*
146 Search for mandatory following text node; for lookahead, the text must
147 follow but for lookbehind (rn->flags != 0) we skip to the next step.
148*/
cca55fe3 149#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 150 while (JUMPABLE(rn)) \
a84d97b6 151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 152 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
153 else if (OP(rn) == PLUS) \
154 rn = NEXTOPER(rn); \
a84d97b6
HS
155 else if (OP(rn) == IFMATCH) \
156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 157 else rn += NEXT_OFF(rn); \
5f80c4cf 158} STMT_END
74750237 159
acfe0abc 160static void restore_pos(pTHX_ void *arg);
51371543 161
76e3520e 162STATIC CHECKPOINT
cea2e8a9 163S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 164{
3280af22 165 int retval = PL_savestack_ix;
b1ce53c5
JH
166#define REGCP_PAREN_ELEMS 4
167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
168 int p;
169
e49a9654
IH
170 if (paren_elems_to_push < 0)
171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
172
a01268b5 173#define REGCP_OTHER_ELEMS 6
b1ce53c5 174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 175 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 176/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
177 SSPUSHINT(PL_regendp[p]);
178 SSPUSHINT(PL_regstartp[p]);
3280af22 179 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
180 SSPUSHINT(p);
181 }
b1ce53c5 182/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
183 SSPUSHINT(PL_regsize);
184 SSPUSHINT(*PL_reglastparen);
a01268b5 185 SSPUSHINT(*PL_reglastcloseparen);
3280af22 186 SSPUSHPTR(PL_reginput);
41123dfd
JH
187#define REGCP_FRAME_ELEMS 2
188/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
189 * are needed for the regexp context stack bookkeeping. */
190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 192
a0d0e21e
LW
193 return retval;
194}
195
c277df42 196/* These are needed since we do not localize EVAL nodes: */
02db2b7b 197# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 199 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 200
02db2b7b 201# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 202 PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 205
76e3520e 206STATIC char *
cea2e8a9 207S_regcppop(pTHX)
a0d0e21e 208{
b1ce53c5 209 I32 i;
a0d0e21e
LW
210 U32 paren = 0;
211 char *input;
cf93c79d 212 I32 tmps;
b1ce53c5
JH
213
214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 215 i = SSPOPINT;
b1ce53c5
JH
216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
217 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 218 input = (char *) SSPOPPTR;
a01268b5 219 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
220 *PL_reglastparen = SSPOPINT;
221 PL_regsize = SSPOPINT;
b1ce53c5
JH
222
223 /* Now restore the parentheses context. */
41123dfd
JH
224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
225 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 226 paren = (U32)SSPOPINT;
3280af22 227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
228 PL_regstartp[paren] = SSPOPINT;
229 tmps = SSPOPINT;
3280af22
NIS
230 if (paren <= *PL_reglastparen)
231 PL_regendp[paren] = tmps;
c277df42 232 DEBUG_r(
c3464db5 233 PerlIO_printf(Perl_debug_log,
b900a521 234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 235 (UV)paren, (IV)PL_regstartp[paren],
b900a521 236 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 237 (IV)PL_regendp[paren],
3280af22 238 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 239 );
a0d0e21e 240 }
c277df42 241 DEBUG_r(
3280af22 242 if (*PL_reglastparen + 1 <= PL_regnpar) {
c3464db5 243 PerlIO_printf(Perl_debug_log,
faccc32b
JH
244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
246 }
247 );
daf18116 248#if 1
dafc8851
JH
249 /* It would seem that the similar code in regtry()
250 * already takes care of this, and in fact it is in
251 * a better location to since this code can #if 0-ed out
252 * but the code in regtry() is needed or otherwise tests
253 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
254 * (as of patchlevel 7877) will fail. Then again,
255 * this code seems to be necessary or otherwise
256 * building DynaLoader will fail:
257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
258 * --jhi */
3280af22
NIS
259 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
260 if (paren > PL_regsize)
cf93c79d
IZ
261 PL_regstartp[paren] = -1;
262 PL_regendp[paren] = -1;
a0d0e21e 263 }
dafc8851 264#endif
a0d0e21e
LW
265 return input;
266}
267
0f5d15d6 268STATIC char *
cea2e8a9 269S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
270{
271 I32 tmp = PL_savestack_ix;
272
273 PL_savestack_ix = ss;
274 regcppop();
275 PL_savestack_ix = tmp;
942e002e 276 return Nullch;
0f5d15d6
IZ
277}
278
279typedef struct re_cc_state
280{
281 I32 ss;
282 regnode *node;
283 struct re_cc_state *prev;
284 CURCUR *cc;
285 regexp *re;
286} re_cc_state;
287
02db2b7b 288#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 289
29d1e993
HS
290#define TRYPAREN(paren, n, input) { \
291 if (paren) { \
292 if (n) { \
293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
294 PL_regendp[paren] = input - PL_bostr; \
295 } \
296 else \
297 PL_regendp[paren] = -1; \
298 } \
299 if (regmatch(next)) \
300 sayYES; \
301 if (paren && n) \
302 PL_regendp[paren] = -1; \
303}
304
305
a687059c 306/*
e50aee73 307 * pregexec and friends
a687059c
LW
308 */
309
310/*
c277df42 311 - pregexec - match a regexp against a string
a687059c 312 */
c277df42 313I32
864dbfa3 314Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 315 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
316/* strend: pointer to null at end of string */
317/* strbeg: real beginning of string */
318/* minend: end of match must be >=minend after stringarg. */
319/* nosave: For optimizations. */
320{
321 return
9041c2e3 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
323 nosave ? 0 : REXEC_COPY_STR);
324}
0f5d15d6
IZ
325
326STATIC void
cea2e8a9 327S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
328{
329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
330#ifdef DEBUGGING
331 PL_regprogram = prog->program;
332#endif
333 PL_regnpar = prog->nparens;
9041c2e3
NIS
334 PL_regdata = prog->data;
335 PL_reg_re = prog;
0f5d15d6 336}
22e551b9 337
9041c2e3 338/*
cad2e5aa
JH
339 * Need to implement the following flags for reg_anch:
340 *
341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
342 * USE_INTUIT_ML
343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
344 * INTUIT_AUTORITATIVE_ML
345 * INTUIT_ONCE_NOML - Intuit can match in one location only.
346 * INTUIT_ONCE_ML
347 *
348 * Another flag for this function: SECOND_TIME (so that float substrs
349 * with giant delta may be not rechecked).
350 */
351
352/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
353
2c2d71f5 354/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
355 Otherwise, only SvCUR(sv) is used to get strbeg. */
356
357/* XXXX We assume that strpos is strbeg unless sv. */
358
6eb5f6b9
JH
359/* XXXX Some places assume that there is a fixed substring.
360 An update may be needed if optimizer marks as "INTUITable"
361 RExen without fixed substrings. Similarly, it is assumed that
362 lengths of all the strings are no more than minlen, thus they
363 cannot come from lookahead.
364 (Or minlen should take into account lookahead.) */
365
2c2d71f5
JH
366/* A failure to find a constant substring means that there is no need to make
367 an expensive call to REx engine, thus we celebrate a failure. Similarly,
368 finding a substring too deep into the string means that less calls to
30944b6d
IZ
369 regtry() should be needed.
370
371 REx compiler's optimizer found 4 possible hints:
372 a) Anchored substring;
373 b) Fixed substring;
374 c) Whether we are anchored (beginning-of-line or \G);
375 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 376 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
377 string which does not contradict any of them.
378 */
2c2d71f5 379
6eb5f6b9
JH
380/* Most of decisions we do here should have been done at compile time.
381 The nodes of the REx which we used for the search should have been
382 deleted from the finite automaton. */
383
cad2e5aa
JH
384char *
385Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
386 char *strend, U32 flags, re_scream_pos_data *data)
387{
b7953727 388 register I32 start_shift = 0;
cad2e5aa 389 /* Should be nonnegative! */
b7953727 390 register I32 end_shift = 0;
2c2d71f5
JH
391 register char *s;
392 register SV *check;
a1933d95 393 char *strbeg;
cad2e5aa 394 char *t;
33b8afdf 395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 396 I32 ml_anch;
6eb5f6b9 397 register char *other_last = Nullch; /* other substr checked before this */
b7953727 398 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
399#ifdef DEBUGGING
400 char *i_strpos = strpos;
ce333219 401 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 402#endif
cad2e5aa 403
b8d68ded
JH
404 if (prog->reganch & ROPT_UTF8) {
405 DEBUG_r(PerlIO_printf(Perl_debug_log,
406 "UTF-8 regex...\n"));
407 PL_reg_flags |= RF_utf8;
408 }
409
2a782b5b 410 DEBUG_r({
b8d68ded 411 char *s = PL_reg_match_utf8 ?
c728cb41
JH
412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
413 strpos;
b8d68ded
JH
414 int len = PL_reg_match_utf8 ?
415 strlen(s) : strend - strpos;
2a782b5b
JH
416 if (!PL_colorset)
417 reginitcolors();
b8d68ded
JH
418 if (PL_reg_match_utf8)
419 DEBUG_r(PerlIO_printf(Perl_debug_log,
420 "UTF-8 target...\n"));
2a782b5b
JH
421 PerlIO_printf(Perl_debug_log,
422 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
423 PL_colors[4],PL_colors[5],PL_colors[0],
424 prog->precomp,
425 PL_colors[1],
426 (strlen(prog->precomp) > 60 ? "..." : ""),
427 PL_colors[0],
428 (int)(len > 60 ? 60 : len),
429 s, PL_colors[1],
430 (len > 60 ? "..." : "")
431 );
432 });
cad2e5aa 433
1aa99e6b 434 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
a72c7584
JH
435 DEBUG_r(PerlIO_printf(Perl_debug_log,
436 "String too short... [re_intuit_start]\n"));
cad2e5aa 437 goto fail;
2c2d71f5 438 }
a1933d95 439 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 440 PL_regeol = strend;
33b8afdf
JH
441 if (do_utf8) {
442 if (!prog->check_utf8 && prog->check_substr)
443 to_utf8_substr(prog);
444 check = prog->check_utf8;
445 } else {
446 if (!prog->check_substr && prog->check_utf8)
447 to_byte_substr(prog);
448 check = prog->check_substr;
449 }
450 if (check == &PL_sv_undef) {
451 DEBUG_r(PerlIO_printf(Perl_debug_log,
452 "Non-utf string cannot match utf check string\n"));
453 goto fail;
454 }
2c2d71f5 455 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
456 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
457 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 458 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 459
7e25d62c
JH
460 if (!ml_anch) {
461 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
462 | ROPT_IMPLICIT)) /* not a real BOL */
463 /* SvCUR is not set on references: SvRV and SvPVX overlap */
464 && sv && !SvROK(sv)
465 && (strpos != strbeg)) {
466 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
467 goto fail;
468 }
469 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 470 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 471 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
472 I32 slen;
473
1aa99e6b 474 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
475 if (SvTAIL(check)) {
476 slen = SvCUR(check); /* >= 1 */
cad2e5aa 477
9041c2e3 478 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
479 || (strend - s == slen && strend[-1] != '\n')) {
480 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
481 goto fail_finish;
cad2e5aa
JH
482 }
483 /* Now should match s[0..slen-2] */
484 slen--;
653099ff 485 if (slen && (*SvPVX(check) != *s
cad2e5aa 486 || (slen > 1
653099ff 487 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
488 report_neq:
489 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
490 goto fail_finish;
491 }
cad2e5aa 492 }
653099ff
GS
493 else if (*SvPVX(check) != *s
494 || ((slen = SvCUR(check)) > 1
495 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
496 goto report_neq;
497 goto success_at_start;
7e25d62c 498 }
cad2e5aa 499 }
2c2d71f5 500 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 501 s = strpos;
2c2d71f5 502 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 503 end_shift = prog->minlen - start_shift -
653099ff 504 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 505 if (!ml_anch) {
653099ff
GS
506 I32 end = prog->check_offset_max + CHR_SVLEN(check)
507 - (SvTAIL(check) != 0);
1aa99e6b 508 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
509
510 if (end_shift < eshift)
511 end_shift = eshift;
512 }
cad2e5aa 513 }
2c2d71f5 514 else { /* Can match at random position */
cad2e5aa
JH
515 ml_anch = 0;
516 s = strpos;
2c2d71f5
JH
517 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
518 /* Should be nonnegative! */
519 end_shift = prog->minlen - start_shift -
653099ff 520 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
521 }
522
2c2d71f5 523#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 524 if (end_shift < 0)
6bbae5e6 525 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
526#endif
527
2c2d71f5
JH
528 restart:
529 /* Find a possible match in the region s..strend by looking for
530 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 531 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
532 I32 p = -1; /* Internal iterator of scream. */
533 I32 *pp = data ? data->scream_pos : &p;
534
2c2d71f5
JH
535 if (PL_screamfirst[BmRARE(check)] >= 0
536 || ( BmRARE(check) == '\n'
537 && (BmPREVIOUS(check) == SvCUR(check) - 1)
538 && SvTAIL(check) ))
9041c2e3 539 s = screaminstr(sv, check,
2c2d71f5 540 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 541 else
2c2d71f5 542 goto fail_finish;
cad2e5aa
JH
543 if (data)
544 *data->scream_olds = s;
545 }
f33976b4 546 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
547 s = fbm_instr((U8*)(s + start_shift),
548 (U8*)(strend - end_shift),
549 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 550 else
1aa99e6b
IH
551 s = fbm_instr(HOP3(s, start_shift, strend),
552 HOP3(strend, -end_shift, strbeg),
2c2d71f5 553 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
554
555 /* Update the count-of-usability, remove useless subpatterns,
556 unshift s. */
2c2d71f5
JH
557
558 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
559 (s ? "Found" : "Did not find"),
33b8afdf 560 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 561 PL_colors[0],
7b0972df
JH
562 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
563 SvPVX(check),
2c2d71f5
JH
564 PL_colors[1], (SvTAIL(check) ? "$" : ""),
565 (s ? " at offset " : "...\n") ) );
566
567 if (!s)
568 goto fail_finish;
569
6eb5f6b9
JH
570 check_at = s;
571
2c2d71f5 572 /* Finish the diagnostic message */
30944b6d 573 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
574
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
578 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
581 */
582
33b8afdf 583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 584 /* Take into account the "other" substring. */
2c2d71f5
JH
585 /* XXXX May be hopelessly wrong for UTF... */
586 if (!other_last)
6eb5f6b9 587 other_last = strpos;
33b8afdf 588 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
589 do_other_anchored:
590 {
1aa99e6b 591 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 592 char *s1 = s;
33b8afdf 593 SV* must;
2c2d71f5 594
2c2d71f5
JH
595 t = s - prog->check_offset_max;
596 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
597 && (!(prog->reganch & ROPT_UTF8)
1aa99e6b 598 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 599 && t > strpos)))
30944b6d 600 /* EMPTY */;
2c2d71f5
JH
601 else
602 t = strpos;
1aa99e6b 603 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
604 if (t < other_last) /* These positions already checked */
605 t = other_last;
1aa99e6b 606 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
607 if (last < last1)
608 last1 = last;
609 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
610 /* On end-of-str: see comment below. */
33b8afdf
JH
611 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
612 if (must == &PL_sv_undef) {
613 s = (char*)NULL;
614 DEBUG_r(must = prog->anchored_utf8); /* for debug */
615 }
616 else
617 s = fbm_instr(
618 (unsigned char*)t,
619 HOP3(HOP3(last1, prog->anchored_offset, strend)
620 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
621 must,
622 PL_multiline ? FBMrf_MULTILINE : 0
623 );
1aa99e6b
IH
624 DEBUG_r(PerlIO_printf(Perl_debug_log,
625 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
626 (s ? "Found" : "Contradicts"),
627 PL_colors[0],
33b8afdf
JH
628 (int)(SvCUR(must)
629 - (SvTAIL(must)!=0)),
630 SvPVX(must),
631 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
632 if (!s) {
633 if (last1 >= last2) {
634 DEBUG_r(PerlIO_printf(Perl_debug_log,
635 ", giving up...\n"));
636 goto fail_finish;
637 }
638 DEBUG_r(PerlIO_printf(Perl_debug_log,
639 ", trying floating at offset %ld...\n",
1aa99e6b
IH
640 (long)(HOP3c(s1, 1, strend) - i_strpos)));
641 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
642 s = HOP3c(last, 1, strend);
2c2d71f5
JH
643 goto restart;
644 }
645 else {
646 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 647 (long)(s - i_strpos)));
1aa99e6b
IH
648 t = HOP3c(s, -prog->anchored_offset, strbeg);
649 other_last = HOP3c(s, 1, strend);
30944b6d 650 s = s1;
2c2d71f5
JH
651 if (t == strpos)
652 goto try_at_start;
2c2d71f5
JH
653 goto try_at_offset;
654 }
30944b6d 655 }
2c2d71f5
JH
656 }
657 else { /* Take into account the floating substring. */
33b8afdf
JH
658 char *last, *last1;
659 char *s1 = s;
660 SV* must;
661
662 t = HOP3c(s, -start_shift, strbeg);
663 last1 = last =
664 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
665 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
666 last = HOP3c(t, prog->float_max_offset, strend);
667 s = HOP3c(t, prog->float_min_offset, strend);
668 if (s < other_last)
669 s = other_last;
2c2d71f5 670 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
671 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
672 /* fbm_instr() takes into account exact value of end-of-str
673 if the check is SvTAIL(ed). Since false positives are OK,
674 and end-of-str is not later than strend we are OK. */
675 if (must == &PL_sv_undef) {
676 s = (char*)NULL;
677 DEBUG_r(must = prog->float_utf8); /* for debug message */
678 }
679 else
2c2d71f5 680 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
681 (unsigned char*)last + SvCUR(must)
682 - (SvTAIL(must)!=0),
683 must, PL_multiline ? FBMrf_MULTILINE : 0);
684 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
685 (s ? "Found" : "Contradicts"),
686 PL_colors[0],
687 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
688 SvPVX(must),
689 PL_colors[1], (SvTAIL(must) ? "$" : "")));
690 if (!s) {
691 if (last1 == last) {
2c2d71f5 692 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
693 ", giving up...\n"));
694 goto fail_finish;
2c2d71f5 695 }
33b8afdf
JH
696 DEBUG_r(PerlIO_printf(Perl_debug_log,
697 ", trying anchored starting at offset %ld...\n",
698 (long)(s1 + 1 - i_strpos)));
699 other_last = last;
700 s = HOP3c(t, 1, strend);
701 goto restart;
702 }
703 else {
704 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
705 (long)(s - i_strpos)));
706 other_last = s; /* Fix this later. --Hugo */
707 s = s1;
708 if (t == strpos)
709 goto try_at_start;
710 goto try_at_offset;
711 }
2c2d71f5 712 }
cad2e5aa 713 }
2c2d71f5
JH
714
715 t = s - prog->check_offset_max;
2c2d71f5
JH
716 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
717 && (!(prog->reganch & ROPT_UTF8)
1aa99e6b
IH
718 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
719 && t > strpos))) {
2c2d71f5
JH
720 /* Fixed substring is found far enough so that the match
721 cannot start at strpos. */
722 try_at_offset:
cad2e5aa 723 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
724 /* Eventually fbm_*() should handle this, but often
725 anchored_offset is not 0, so this check will not be wasted. */
726 /* XXXX In the code below we prefer to look for "^" even in
727 presence of anchored substrings. And we search even
728 beyond the found float position. These pessimizations
729 are historical artefacts only. */
730 find_anchor:
2c2d71f5 731 while (t < strend - prog->minlen) {
cad2e5aa 732 if (*t == '\n') {
4ee3650e 733 if (t < check_at - prog->check_offset_min) {
33b8afdf 734 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
735 /* Since we moved from the found position,
736 we definitely contradict the found anchored
30944b6d
IZ
737 substr. Due to the above check we do not
738 contradict "check" substr.
739 Thus we can arrive here only if check substr
740 is float. Redo checking for "other"=="fixed".
741 */
9041c2e3 742 strpos = t + 1;
30944b6d
IZ
743 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
744 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
745 goto do_other_anchored;
746 }
4ee3650e
GS
747 /* We don't contradict the found floating substring. */
748 /* XXXX Why not check for STCLASS? */
cad2e5aa 749 s = t + 1;
2c2d71f5 750 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 751 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
752 goto set_useful;
753 }
4ee3650e
GS
754 /* Position contradicts check-string */
755 /* XXXX probably better to look for check-string
756 than for "\n", so one should lower the limit for t? */
757 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 758 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 759 other_last = strpos = s = t + 1;
cad2e5aa
JH
760 goto restart;
761 }
762 t++;
763 }
2c2d71f5
JH
764 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
765 PL_colors[0],PL_colors[1]));
766 goto fail_finish;
cad2e5aa 767 }
f5952150
GS
768 else {
769 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
770 PL_colors[0],PL_colors[1]));
771 }
cad2e5aa
JH
772 s = t;
773 set_useful:
33b8afdf 774 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
775 }
776 else {
f5952150 777 /* The found string does not prohibit matching at strpos,
2c2d71f5 778 - no optimization of calling REx engine can be performed,
f5952150
GS
779 unless it was an MBOL and we are not after MBOL,
780 or a future STCLASS check will fail this. */
2c2d71f5
JH
781 try_at_start:
782 /* Even in this situation we may use MBOL flag if strpos is offset
783 wrt the start of the string. */
05b4157f 784 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 785 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
786 /* May be due to an implicit anchor of m{.*foo} */
787 && !(prog->reganch & ROPT_IMPLICIT))
788 {
cad2e5aa
JH
789 t = strpos;
790 goto find_anchor;
791 }
30944b6d 792 DEBUG_r( if (ml_anch)
f5952150
GS
793 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
794 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 795 );
2c2d71f5 796 success_at_start:
30944b6d 797 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
798 && (do_utf8 ? (
799 prog->check_utf8 /* Could be deleted already */
800 && --BmUSEFUL(prog->check_utf8) < 0
801 && (prog->check_utf8 == prog->float_utf8)
802 ) : (
803 prog->check_substr /* Could be deleted already */
804 && --BmUSEFUL(prog->check_substr) < 0
805 && (prog->check_substr == prog->float_substr)
806 )))
66e933ab 807 {
cad2e5aa 808 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 809 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
810 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
811 if (do_utf8 ? prog->check_substr : prog->check_utf8)
812 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
813 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
814 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 815 check = Nullsv; /* abort */
cad2e5aa 816 s = strpos;
3cf5c195
IZ
817 /* XXXX This is a remnant of the old implementation. It
818 looks wasteful, since now INTUIT can use many
6eb5f6b9 819 other heuristics. */
cad2e5aa
JH
820 prog->reganch &= ~RE_USE_INTUIT;
821 }
822 else
823 s = strpos;
824 }
825
6eb5f6b9
JH
826 /* Last resort... */
827 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
828 if (prog->regstclass) {
829 /* minlen == 0 is possible if regstclass is \b or \B,
830 and the fixed substr is ''$.
831 Since minlen is already taken into account, s+1 is before strend;
832 accidentally, minlen >= 1 guaranties no false positives at s + 1
833 even for \b or \B. But (minlen? 1 : 0) below assumes that
834 regstclass does not come from lookahead... */
835 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
836 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 837 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 838 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 839 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 840 : 1);
33b8afdf 841 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 842 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 843 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
844 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
845 cl_l, strend)
846 : strend);
a1933d95 847 char *startpos = strbeg;
6eb5f6b9
JH
848
849 t = s;
76384e4a 850 if (prog->reganch & ROPT_UTF8) {
ffc61ed2 851 PL_regdata = prog->data;
76384e4a
GS
852 PL_bostr = startpos;
853 }
f33976b4 854 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
855 if (!s) {
856#ifdef DEBUGGING
b7953727 857 char *what = 0;
6eb5f6b9
JH
858#endif
859 if (endpos == strend) {
860 DEBUG_r( PerlIO_printf(Perl_debug_log,
861 "Could not match STCLASS...\n") );
862 goto fail;
863 }
66e933ab
GS
864 DEBUG_r( PerlIO_printf(Perl_debug_log,
865 "This position contradicts STCLASS...\n") );
653099ff
GS
866 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
867 goto fail;
6eb5f6b9 868 /* Contradict one of substrings */
33b8afdf
JH
869 if (prog->anchored_substr || prog->anchored_utf8) {
870 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
871 DEBUG_r( what = "anchored" );
872 hop_and_restart:
1aa99e6b 873 s = HOP3c(t, 1, strend);
66e933ab
GS
874 if (s + start_shift + end_shift > strend) {
875 /* XXXX Should be taken into account earlier? */
876 DEBUG_r( PerlIO_printf(Perl_debug_log,
877 "Could not match STCLASS...\n") );
878 goto fail;
879 }
5e39e1e5
HS
880 if (!check)
881 goto giveup;
6eb5f6b9 882 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 883 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
884 what, (long)(s + start_shift - i_strpos)) );
885 goto restart;
886 }
66e933ab 887 /* Have both, check_string is floating */
6eb5f6b9
JH
888 if (t + start_shift >= check_at) /* Contradicts floating=check */
889 goto retry_floating_check;
890 /* Recheck anchored substring, but not floating... */
9041c2e3 891 s = check_at;
5e39e1e5
HS
892 if (!check)
893 goto giveup;
6eb5f6b9 894 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 895 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
896 (long)(other_last - i_strpos)) );
897 goto do_other_anchored;
898 }
60e71179
GS
899 /* Another way we could have checked stclass at the
900 current position only: */
901 if (ml_anch) {
902 s = t = t + 1;
5e39e1e5
HS
903 if (!check)
904 goto giveup;
60e71179 905 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
906 "Looking for /%s^%s/m starting at offset %ld...\n",
907 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 908 goto try_at_offset;
66e933ab 909 }
33b8afdf 910 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 911 goto fail;
6eb5f6b9
JH
912 /* Check is floating subtring. */
913 retry_floating_check:
914 t = check_at - start_shift;
915 DEBUG_r( what = "floating" );
916 goto hop_and_restart;
917 }
b7953727
JH
918 if (t != s) {
919 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 920 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
921 (long)(t - i_strpos), (long)(s - i_strpos))
922 );
923 }
924 else {
925 DEBUG_r(PerlIO_printf(Perl_debug_log,
926 "Does not contradict STCLASS...\n");
927 );
928 }
6eb5f6b9 929 }
5e39e1e5
HS
930 giveup:
931 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
932 PL_colors[4], (check ? "Guessed" : "Giving up"),
933 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 934 return s;
2c2d71f5
JH
935
936 fail_finish: /* Substring not found */
33b8afdf
JH
937 if (prog->check_substr || prog->check_utf8) /* could be removed already */
938 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 939 fail:
2c2d71f5 940 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
941 PL_colors[4],PL_colors[5]));
942 return Nullch;
943}
9661b544 944
6eb5f6b9 945/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
946STATIC char *
947S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 948{
6eb5f6b9
JH
949 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
950 char *m;
d8093b23 951 STRLEN ln;
952 unsigned int c1;
953 unsigned int c2;
6eb5f6b9
JH
954 char *e;
955 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 956 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 957
6eb5f6b9
JH
958 /* We know what class it must start with. */
959 switch (OP(c)) {
6eb5f6b9 960 case ANYOF:
a0ed51b3 961 while (s < strend) {
7d3e948e
JH
962 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
963
964 if (do_utf8 ?
ba7b4546 965 reginclass(c, (U8*)s, 0, do_utf8) :
7d3e948e 966 REGINCLASS(c, (U8*)s) ||
ebc501f0
JH
967 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
968 /* The assignment of 2 is intentional:
969 * for the sharp s, the skip is 2. */
970 (skip = SHARP_S_SKIP)
971 )) {
6eb5f6b9
JH
972 if (tmp && (norun || regtry(prog, s)))
973 goto got_it;
974 else
975 tmp = doevery;
a0ed51b3 976 }
e0f9d4a8 977 else
7d3e948e 978 tmp = 1;
e0f9d4a8 979 s += skip;
a0d0e21e 980 }
6eb5f6b9 981 break;
f33976b4
DB
982 case CANY:
983 while (s < strend) {
984 if (tmp && (norun || regtry(prog, s)))
985 goto got_it;
986 else
987 tmp = doevery;
988 s++;
989 }
990 break;
6eb5f6b9
JH
991 case EXACTF:
992 m = STRING(c);
993 ln = STR_LEN(c);
1aa99e6b 994 if (UTF) {
a2a2844f 995 STRLEN ulen1, ulen2;
e7ae6809
JH
996 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
997 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
998
999 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1000 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1001
872c91ae
JH
1002 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1003 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1004 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1005 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1006 }
1007 else {
1008 c1 = *(U8*)m;
1009 c2 = PL_fold[c1];
1010 }
6eb5f6b9
JH
1011 goto do_exactf;
1012 case EXACTFL:
1013 m = STRING(c);
1014 ln = STR_LEN(c);
d8093b23 1015 c1 = *(U8*)m;
6eb5f6b9
JH
1016 c2 = PL_fold_locale[c1];
1017 do_exactf:
6795a346 1018 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1019
6eb5f6b9
JH
1020 if (norun && e < s)
1021 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1022
60a8b682
JH
1023 /* The idea in the EXACTF* cases is to first find the
1024 * first character of the EXACTF* node and then, if
1025 * necessary, case-insensitively compare the full
1026 * text of the node. The c1 and c2 are the first
1027 * characters (though in Unicode it gets a bit
1028 * more complicated because there are more cases
7f16dd3d
JH
1029 * than just upper and lower: one needs to use
1030 * the so-called folding case for case-insensitive
1031 * matching (called "loose matching" in Unicode).
1032 * ibcmp_utf8() will do just that. */
60a8b682 1033
1aa99e6b 1034 if (do_utf8) {
575cac57
JH
1035 UV c, f;
1036 U8 tmpbuf [UTF8_MAXLEN+1];
1037 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1038 STRLEN len, foldlen;
d7f013c8 1039
09091399 1040 if (c1 == c2) {
1aa99e6b 1041 while (s <= e) {
872c91ae
JH
1042 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1043 ckWARN(WARN_UTF8) ?
1044 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1045 if ( c == c1
1046 && (ln == len ||
66423254
JH
1047 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1048 m, (char **)0, ln, UTF))
55da9344 1049 && (norun || regtry(prog, s)) )
1aa99e6b 1050 goto got_it;
80aecb99
JH
1051 else {
1052 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1053 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1054 if ( f != c
1055 && (f == c1 || f == c2)
1056 && (ln == foldlen ||
66423254
JH
1057 !ibcmp_utf8((char *) foldbuf,
1058 (char **)0, foldlen, do_utf8,
d07ddd77 1059 m,
66423254 1060 (char **)0, ln, UTF))
80aecb99
JH
1061 && (norun || regtry(prog, s)) )
1062 goto got_it;
1063 }
1aa99e6b
IH
1064 s += len;
1065 }
09091399
JH
1066 }
1067 else {
1aa99e6b 1068 while (s <= e) {
872c91ae
JH
1069 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1070 ckWARN(WARN_UTF8) ?
1071 0 : UTF8_ALLOW_ANY);
80aecb99 1072
60a8b682 1073 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1074 * Note that not all the possible combinations
1075 * are handled here: some of them are handled
1076 * by the standard folding rules, and some of
1077 * them (the character class or ANYOF cases)
1078 * are handled during compiletime in
1079 * regexec.c:S_regclass(). */
880bd946
JH
1080 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1081 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1082 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1083
1084 if ( (c == c1 || c == c2)
1085 && (ln == len ||
66423254
JH
1086 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1087 m, (char **)0, ln, UTF))
55da9344 1088 && (norun || regtry(prog, s)) )
1aa99e6b 1089 goto got_it;
80aecb99
JH
1090 else {
1091 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1092 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1093 if ( f != c
1094 && (f == c1 || f == c2)
1095 && (ln == foldlen ||
a6872d42 1096 !ibcmp_utf8((char *) foldbuf,
66423254 1097 (char **)0, foldlen, do_utf8,
d07ddd77 1098 m,
66423254 1099 (char **)0, ln, UTF))
80aecb99
JH
1100 && (norun || regtry(prog, s)) )
1101 goto got_it;
1102 }
1aa99e6b
IH
1103 s += len;
1104 }
09091399 1105 }
1aa99e6b
IH
1106 }
1107 else {
1108 if (c1 == c2)
1109 while (s <= e) {
1110 if ( *(U8*)s == c1
1111 && (ln == 1 || !(OP(c) == EXACTF
1112 ? ibcmp(s, m, ln)
1113 : ibcmp_locale(s, m, ln)))
1114 && (norun || regtry(prog, s)) )
1115 goto got_it;
1116 s++;
1117 }
1118 else
1119 while (s <= e) {
1120 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1121 && (ln == 1 || !(OP(c) == EXACTF
1122 ? ibcmp(s, m, ln)
1123 : ibcmp_locale(s, m, ln)))
1124 && (norun || regtry(prog, s)) )
1125 goto got_it;
1126 s++;
1127 }
b3c9acc1
IZ
1128 }
1129 break;
bbce6d69 1130 case BOUNDL:
3280af22 1131 PL_reg_flags |= RF_tainted;
bbce6d69 1132 /* FALL THROUGH */
a0d0e21e 1133 case BOUND:
ffc61ed2 1134 if (do_utf8) {
12d33761 1135 if (s == PL_bostr)
ffc61ed2
JH
1136 tmp = '\n';
1137 else {
b4f7163a 1138 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1139
b4f7163a 1140 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1141 }
1142 tmp = ((OP(c) == BOUND ?
9041c2e3 1143 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1144 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1145 while (s < strend) {
1146 if (tmp == !(OP(c) == BOUND ?
3568d838 1147 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1148 isALNUM_LC_utf8((U8*)s)))
1149 {
1150 tmp = !tmp;
1151 if ((norun || regtry(prog, s)))
1152 goto got_it;
1153 }
1154 s += UTF8SKIP(s);
a687059c 1155 }
a0d0e21e 1156 }
667bb95a 1157 else {
12d33761 1158 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1159 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1160 while (s < strend) {
1161 if (tmp ==
1162 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1163 tmp = !tmp;
1164 if ((norun || regtry(prog, s)))
1165 goto got_it;
1166 }
1167 s++;
a0ed51b3 1168 }
a0ed51b3 1169 }
6eb5f6b9 1170 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1171 goto got_it;
1172 break;
bbce6d69 1173 case NBOUNDL:
3280af22 1174 PL_reg_flags |= RF_tainted;
bbce6d69 1175 /* FALL THROUGH */
a0d0e21e 1176 case NBOUND:
ffc61ed2 1177 if (do_utf8) {
12d33761 1178 if (s == PL_bostr)
ffc61ed2
JH
1179 tmp = '\n';
1180 else {
b4f7163a 1181 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1182
b4f7163a 1183 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1184 }
1185 tmp = ((OP(c) == NBOUND ?
9041c2e3 1186 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1187 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1188 while (s < strend) {
1189 if (tmp == !(OP(c) == NBOUND ?
3568d838 1190 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1191 isALNUM_LC_utf8((U8*)s)))
1192 tmp = !tmp;
1193 else if ((norun || regtry(prog, s)))
1194 goto got_it;
1195 s += UTF8SKIP(s);
1196 }
a0d0e21e 1197 }
667bb95a 1198 else {
12d33761 1199 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1200 tmp = ((OP(c) == NBOUND ?
1201 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1202 while (s < strend) {
1203 if (tmp ==
1204 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1205 tmp = !tmp;
1206 else if ((norun || regtry(prog, s)))
1207 goto got_it;
1208 s++;
1209 }
a0ed51b3 1210 }
6eb5f6b9 1211 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1212 goto got_it;
1213 break;
a0d0e21e 1214 case ALNUM:
ffc61ed2 1215 if (do_utf8) {
8269fa76 1216 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1217 while (s < strend) {
3568d838 1218 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1219 if (tmp && (norun || regtry(prog, s)))
1220 goto got_it;
1221 else
1222 tmp = doevery;
1223 }
bbce6d69 1224 else
ffc61ed2
JH
1225 tmp = 1;
1226 s += UTF8SKIP(s);
bbce6d69 1227 }
bbce6d69 1228 }
ffc61ed2
JH
1229 else {
1230 while (s < strend) {
1231 if (isALNUM(*s)) {
1232 if (tmp && (norun || regtry(prog, s)))
1233 goto got_it;
1234 else
1235 tmp = doevery;
1236 }
a0ed51b3 1237 else
ffc61ed2
JH
1238 tmp = 1;
1239 s++;
a0ed51b3 1240 }
a0ed51b3
LW
1241 }
1242 break;
bbce6d69 1243 case ALNUML:
3280af22 1244 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1245 if (do_utf8) {
1246 while (s < strend) {
1247 if (isALNUM_LC_utf8((U8*)s)) {
1248 if (tmp && (norun || regtry(prog, s)))
1249 goto got_it;
1250 else
1251 tmp = doevery;
1252 }
a687059c 1253 else
ffc61ed2
JH
1254 tmp = 1;
1255 s += UTF8SKIP(s);
a0d0e21e 1256 }
a0d0e21e 1257 }
ffc61ed2
JH
1258 else {
1259 while (s < strend) {
1260 if (isALNUM_LC(*s)) {
1261 if (tmp && (norun || regtry(prog, s)))
1262 goto got_it;
1263 else
1264 tmp = doevery;
1265 }
a0ed51b3 1266 else
ffc61ed2
JH
1267 tmp = 1;
1268 s++;
a0ed51b3 1269 }
a0ed51b3
LW
1270 }
1271 break;
a0d0e21e 1272 case NALNUM:
ffc61ed2 1273 if (do_utf8) {
8269fa76 1274 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1275 while (s < strend) {
3568d838 1276 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1277 if (tmp && (norun || regtry(prog, s)))
1278 goto got_it;
1279 else
1280 tmp = doevery;
1281 }
bbce6d69 1282 else
ffc61ed2
JH
1283 tmp = 1;
1284 s += UTF8SKIP(s);
bbce6d69 1285 }
bbce6d69 1286 }
ffc61ed2
JH
1287 else {
1288 while (s < strend) {
1289 if (!isALNUM(*s)) {
1290 if (tmp && (norun || regtry(prog, s)))
1291 goto got_it;
1292 else
1293 tmp = doevery;
1294 }
a0ed51b3 1295 else
ffc61ed2
JH
1296 tmp = 1;
1297 s++;
a0ed51b3 1298 }
a0ed51b3
LW
1299 }
1300 break;
bbce6d69 1301 case NALNUML:
3280af22 1302 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1303 if (do_utf8) {
1304 while (s < strend) {
1305 if (!isALNUM_LC_utf8((U8*)s)) {
1306 if (tmp && (norun || regtry(prog, s)))
1307 goto got_it;
1308 else
1309 tmp = doevery;
1310 }
a687059c 1311 else
ffc61ed2
JH
1312 tmp = 1;
1313 s += UTF8SKIP(s);
a687059c 1314 }
a0d0e21e 1315 }
ffc61ed2
JH
1316 else {
1317 while (s < strend) {
1318 if (!isALNUM_LC(*s)) {
1319 if (tmp && (norun || regtry(prog, s)))
1320 goto got_it;
1321 else
1322 tmp = doevery;
1323 }
a0ed51b3 1324 else
ffc61ed2
JH
1325 tmp = 1;
1326 s++;
a0ed51b3 1327 }
a0ed51b3
LW
1328 }
1329 break;
a0d0e21e 1330 case SPACE:
ffc61ed2 1331 if (do_utf8) {
8269fa76 1332 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1333 while (s < strend) {
3568d838 1334 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1335 if (tmp && (norun || regtry(prog, s)))
1336 goto got_it;
1337 else
1338 tmp = doevery;
1339 }
a0d0e21e 1340 else
ffc61ed2
JH
1341 tmp = 1;
1342 s += UTF8SKIP(s);
2304df62 1343 }
a0d0e21e 1344 }
ffc61ed2
JH
1345 else {
1346 while (s < strend) {
1347 if (isSPACE(*s)) {
1348 if (tmp && (norun || regtry(prog, s)))
1349 goto got_it;
1350 else
1351 tmp = doevery;
1352 }
a0ed51b3 1353 else
ffc61ed2
JH
1354 tmp = 1;
1355 s++;
a0ed51b3 1356 }
a0ed51b3
LW
1357 }
1358 break;
bbce6d69 1359 case SPACEL:
3280af22 1360 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1361 if (do_utf8) {
1362 while (s < strend) {
1363 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1364 if (tmp && (norun || regtry(prog, s)))
1365 goto got_it;
1366 else
1367 tmp = doevery;
1368 }
bbce6d69 1369 else
ffc61ed2
JH
1370 tmp = 1;
1371 s += UTF8SKIP(s);
bbce6d69 1372 }
bbce6d69 1373 }
ffc61ed2
JH
1374 else {
1375 while (s < strend) {
1376 if (isSPACE_LC(*s)) {
1377 if (tmp && (norun || regtry(prog, s)))
1378 goto got_it;
1379 else
1380 tmp = doevery;
1381 }
a0ed51b3 1382 else
ffc61ed2
JH
1383 tmp = 1;
1384 s++;
a0ed51b3 1385 }
a0ed51b3
LW
1386 }
1387 break;
a0d0e21e 1388 case NSPACE:
ffc61ed2 1389 if (do_utf8) {
8269fa76 1390 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1391 while (s < strend) {
3568d838 1392 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1393 if (tmp && (norun || regtry(prog, s)))
1394 goto got_it;
1395 else
1396 tmp = doevery;
1397 }
a0d0e21e 1398 else
ffc61ed2
JH
1399 tmp = 1;
1400 s += UTF8SKIP(s);
a687059c 1401 }
a0d0e21e 1402 }
ffc61ed2
JH
1403 else {
1404 while (s < strend) {
1405 if (!isSPACE(*s)) {
1406 if (tmp && (norun || regtry(prog, s)))
1407 goto got_it;
1408 else
1409 tmp = doevery;
1410 }
a0ed51b3 1411 else
ffc61ed2
JH
1412 tmp = 1;
1413 s++;
a0ed51b3 1414 }
a0ed51b3
LW
1415 }
1416 break;
bbce6d69 1417 case NSPACEL:
3280af22 1418 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1419 if (do_utf8) {
1420 while (s < strend) {
1421 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1422 if (tmp && (norun || regtry(prog, s)))
1423 goto got_it;
1424 else
1425 tmp = doevery;
1426 }
bbce6d69 1427 else
ffc61ed2
JH
1428 tmp = 1;
1429 s += UTF8SKIP(s);
bbce6d69 1430 }
bbce6d69 1431 }
ffc61ed2
JH
1432 else {
1433 while (s < strend) {
1434 if (!isSPACE_LC(*s)) {
1435 if (tmp && (norun || regtry(prog, s)))
1436 goto got_it;
1437 else
1438 tmp = doevery;
1439 }
a0ed51b3 1440 else
ffc61ed2
JH
1441 tmp = 1;
1442 s++;
a0ed51b3 1443 }
a0ed51b3
LW
1444 }
1445 break;
a0d0e21e 1446 case DIGIT:
ffc61ed2 1447 if (do_utf8) {
8269fa76 1448 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1449 while (s < strend) {
3568d838 1450 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1451 if (tmp && (norun || regtry(prog, s)))
1452 goto got_it;
1453 else
1454 tmp = doevery;
1455 }
a0d0e21e 1456 else
ffc61ed2
JH
1457 tmp = 1;
1458 s += UTF8SKIP(s);
2b69d0c2 1459 }
a0d0e21e 1460 }
ffc61ed2
JH
1461 else {
1462 while (s < strend) {
1463 if (isDIGIT(*s)) {
1464 if (tmp && (norun || regtry(prog, s)))
1465 goto got_it;
1466 else
1467 tmp = doevery;
1468 }
a0ed51b3 1469 else
ffc61ed2
JH
1470 tmp = 1;
1471 s++;
a0ed51b3 1472 }
a0ed51b3
LW
1473 }
1474 break;
b8c5462f
JH
1475 case DIGITL:
1476 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1477 if (do_utf8) {
1478 while (s < strend) {
1479 if (isDIGIT_LC_utf8((U8*)s)) {
1480 if (tmp && (norun || regtry(prog, s)))
1481 goto got_it;
1482 else
1483 tmp = doevery;
1484 }
b8c5462f 1485 else
ffc61ed2
JH
1486 tmp = 1;
1487 s += UTF8SKIP(s);
b8c5462f 1488 }
b8c5462f 1489 }
ffc61ed2
JH
1490 else {
1491 while (s < strend) {
1492 if (isDIGIT_LC(*s)) {
1493 if (tmp && (norun || regtry(prog, s)))
1494 goto got_it;
1495 else
1496 tmp = doevery;
1497 }
b8c5462f 1498 else
ffc61ed2
JH
1499 tmp = 1;
1500 s++;
b8c5462f 1501 }
b8c5462f
JH
1502 }
1503 break;
a0d0e21e 1504 case NDIGIT:
ffc61ed2 1505 if (do_utf8) {
8269fa76 1506 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1507 while (s < strend) {
3568d838 1508 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1509 if (tmp && (norun || regtry(prog, s)))
1510 goto got_it;
1511 else
1512 tmp = doevery;
1513 }
a0d0e21e 1514 else
ffc61ed2
JH
1515 tmp = 1;
1516 s += UTF8SKIP(s);
a687059c 1517 }
a0d0e21e 1518 }
ffc61ed2
JH
1519 else {
1520 while (s < strend) {
1521 if (!isDIGIT(*s)) {
1522 if (tmp && (norun || regtry(prog, s)))
1523 goto got_it;
1524 else
1525 tmp = doevery;
1526 }
a0ed51b3 1527 else
ffc61ed2
JH
1528 tmp = 1;
1529 s++;
a0ed51b3 1530 }
a0ed51b3
LW
1531 }
1532 break;
b8c5462f
JH
1533 case NDIGITL:
1534 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1535 if (do_utf8) {
1536 while (s < strend) {
1537 if (!isDIGIT_LC_utf8((U8*)s)) {
1538 if (tmp && (norun || regtry(prog, s)))
1539 goto got_it;
1540 else
1541 tmp = doevery;
1542 }
b8c5462f 1543 else
ffc61ed2
JH
1544 tmp = 1;
1545 s += UTF8SKIP(s);
b8c5462f 1546 }
a0ed51b3 1547 }
ffc61ed2
JH
1548 else {
1549 while (s < strend) {
1550 if (!isDIGIT_LC(*s)) {
1551 if (tmp && (norun || regtry(prog, s)))
1552 goto got_it;
1553 else
1554 tmp = doevery;
1555 }
cf93c79d 1556 else
ffc61ed2
JH
1557 tmp = 1;
1558 s++;
b8c5462f 1559 }
b8c5462f
JH
1560 }
1561 break;
b3c9acc1 1562 default:
3c3eec57
GS
1563 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1564 break;
d6a28714 1565 }
6eb5f6b9
JH
1566 return 0;
1567 got_it:
1568 return s;
1569}
1570
1571/*
1572 - regexec_flags - match a regexp against a string
1573 */
1574I32
1575Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1576 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1577/* strend: pointer to null at end of string */
1578/* strbeg: real beginning of string */
1579/* minend: end of match must be >=minend after stringarg. */
1580/* data: May be used for some additional optimizations. */
1581/* nosave: For optimizations. */
1582{
6eb5f6b9
JH
1583 register char *s;
1584 register regnode *c;
1585 register char *startpos = stringarg;
6eb5f6b9
JH
1586 I32 minlen; /* must match at least this many chars */
1587 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1588 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1589 constant substr. */ /* CC */
1590 I32 end_shift = 0; /* Same for the end. */ /* CC */
1591 I32 scream_pos = -1; /* Internal iterator of scream. */
1592 char *scream_olds;
1593 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1594 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1595#ifdef DEBUGGING
9e55ce06
JH
1596 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1597 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1598#endif
6eb5f6b9
JH
1599
1600 PL_regcc = 0;
1601
1602 cache_re(prog);
1603#ifdef DEBUGGING
aea4f609 1604 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1605#endif
1606
1607 /* Be paranoid... */
1608 if (prog == NULL || startpos == NULL) {
1609 Perl_croak(aTHX_ "NULL regexp parameter");
1610 return 0;
1611 }
1612
1613 minlen = prog->minlen;
61a36c01 1614 if (strend - startpos < minlen) {
a72c7584
JH
1615 DEBUG_r(PerlIO_printf(Perl_debug_log,
1616 "String too short [regexec_flags]...\n"));
1617 goto phooey;
1aa99e6b 1618 }
6eb5f6b9 1619
6eb5f6b9
JH
1620 /* Check validity of program. */
1621 if (UCHARAT(prog->program) != REG_MAGIC) {
1622 Perl_croak(aTHX_ "corrupted regexp program");
1623 }
1624
1625 PL_reg_flags = 0;
1626 PL_reg_eval_set = 0;
1627 PL_reg_maxiter = 0;
1628
1629 if (prog->reganch & ROPT_UTF8)
1630 PL_reg_flags |= RF_utf8;
1631
1632 /* Mark beginning of line for ^ and lookbehind. */
1633 PL_regbol = startpos;
1634 PL_bostr = strbeg;
1635 PL_reg_sv = sv;
1636
1637 /* Mark end of line for $ (and such) */
1638 PL_regeol = strend;
1639
1640 /* see how far we have to get to not match where we matched before */
1641 PL_regtill = startpos+minend;
1642
1643 /* We start without call_cc context. */
1644 PL_reg_call_cc = 0;
1645
1646 /* If there is a "must appear" string, look for it. */
1647 s = startpos;
1648
1649 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1650 MAGIC *mg;
1651
1652 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1653 PL_reg_ganch = startpos;
1654 else if (sv && SvTYPE(sv) >= SVt_PVMG
1655 && SvMAGIC(sv)
14befaf4
DM
1656 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1657 && mg->mg_len >= 0) {
6eb5f6b9
JH
1658 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1659 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1660 if (s > PL_reg_ganch)
6eb5f6b9
JH
1661 goto phooey;
1662 s = PL_reg_ganch;
1663 }
1664 }
1665 else /* pos() not defined */
1666 PL_reg_ganch = strbeg;
1667 }
1668
33b8afdf 1669 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1670 re_scream_pos_data d;
1671
1672 d.scream_olds = &scream_olds;
1673 d.scream_pos = &scream_pos;
1674 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1675 if (!s) {
1676 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1677 goto phooey; /* not present */
3fa9c3d7 1678 }
6eb5f6b9
JH
1679 }
1680
2a782b5b 1681 DEBUG_r({
9e55ce06
JH
1682 char *s0 = UTF ?
1683 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1684 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1685 prog->precomp;
1686 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1687 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1688 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1689 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1690 if (!PL_colorset)
1691 reginitcolors();
1692 PerlIO_printf(Perl_debug_log,
9e55ce06 1693 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1694 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1695 len0, len0, s0,
2a782b5b 1696 PL_colors[1],
9e55ce06 1697 len0 > 60 ? "..." : "",
2a782b5b 1698 PL_colors[0],
9e55ce06
JH
1699 (int)(len1 > 60 ? 60 : len1),
1700 s1, PL_colors[1],
1701 (len1 > 60 ? "..." : "")
2a782b5b
JH
1702 );
1703 });
6eb5f6b9
JH
1704
1705 /* Simplest case: anchored match need be tried only once. */
1706 /* [unless only anchor is BOL and multiline is set] */
1707 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1708 if (s == startpos && regtry(prog, startpos))
1709 goto got_it;
1710 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1711 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1712 {
1713 char *end;
1714
1715 if (minlen)
1716 dontbother = minlen - 1;
1aa99e6b 1717 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1718 /* for multiline we only have to try after newlines */
33b8afdf 1719 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1720 if (s == startpos)
1721 goto after_try;
1722 while (1) {
1723 if (regtry(prog, s))
1724 goto got_it;
1725 after_try:
1726 if (s >= end)
1727 goto phooey;
1728 if (prog->reganch & RE_USE_INTUIT) {
1729 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1730 if (!s)
1731 goto phooey;
1732 }
1733 else
1734 s++;
1735 }
1736 } else {
1737 if (s > startpos)
1738 s--;
1739 while (s < end) {
1740 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1741 if (regtry(prog, s))
1742 goto got_it;
1743 }
1744 }
1745 }
1746 }
1747 goto phooey;
1748 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1749 if (regtry(prog, PL_reg_ganch))
1750 goto got_it;
1751 goto phooey;
1752 }
1753
1754 /* Messy cases: unanchored match. */
33b8afdf 1755 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1756 /* we have /x+whatever/ */
1757 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1758 char ch;
bf93d4cc
GS
1759#ifdef DEBUGGING
1760 int did_match = 0;
1761#endif
33b8afdf
JH
1762 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1763 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1764 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1765
1aa99e6b 1766 if (do_utf8) {
6eb5f6b9
JH
1767 while (s < strend) {
1768 if (*s == ch) {
bf93d4cc 1769 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1770 if (regtry(prog, s)) goto got_it;
1771 s += UTF8SKIP(s);
1772 while (s < strend && *s == ch)
1773 s += UTF8SKIP(s);
1774 }
1775 s += UTF8SKIP(s);
1776 }
1777 }
1778 else {
1779 while (s < strend) {
1780 if (*s == ch) {
bf93d4cc 1781 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1782 if (regtry(prog, s)) goto got_it;
1783 s++;
1784 while (s < strend && *s == ch)
1785 s++;
1786 }
1787 s++;
1788 }
1789 }
b7953727 1790 DEBUG_r(if (!did_match)
bf93d4cc 1791 PerlIO_printf(Perl_debug_log,
b7953727
JH
1792 "Did not find anchored character...\n")
1793 );
6eb5f6b9
JH
1794 }
1795 /*SUPPRESS 560*/
33b8afdf
JH
1796 else if (prog->anchored_substr != Nullsv
1797 || prog->anchored_utf8 != Nullsv
1798 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1799 && prog->float_max_offset < strend - s)) {
1800 SV *must;
1801 I32 back_max;
1802 I32 back_min;
1803 char *last;
6eb5f6b9 1804 char *last1; /* Last position checked before */
bf93d4cc
GS
1805#ifdef DEBUGGING
1806 int did_match = 0;
1807#endif
33b8afdf
JH
1808 if (prog->anchored_substr || prog->anchored_utf8) {
1809 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1810 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1811 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1812 back_max = back_min = prog->anchored_offset;
1813 } else {
1814 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1815 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1816 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1817 back_max = prog->float_max_offset;
1818 back_min = prog->float_min_offset;
1819 }
1820 if (must == &PL_sv_undef)
1821 /* could not downgrade utf8 check substring, so must fail */
1822 goto phooey;
1823
1824 last = HOP3c(strend, /* Cannot start after this */
1825 -(I32)(CHR_SVLEN(must)
1826 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1827
1828 if (s > PL_bostr)
1829 last1 = HOPc(s, -1);
1830 else
1831 last1 = s - 1; /* bogus */
1832
1833 /* XXXX check_substr already used to find `s', can optimize if
1834 check_substr==must. */
1835 scream_pos = -1;
1836 dontbother = end_shift;
1837 strend = HOPc(strend, -dontbother);
1838 while ( (s <= last) &&
9041c2e3 1839 ((flags & REXEC_SCREAM)
1aa99e6b 1840 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1841 end_shift, &scream_pos, 0))
1aa99e6b 1842 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1843 (unsigned char*)strend, must,
6eb5f6b9 1844 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
bf93d4cc 1845 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1846 if (HOPc(s, -back_max) > last1) {
1847 last1 = HOPc(s, -back_min);
1848 s = HOPc(s, -back_max);
1849 }
1850 else {
1851 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1852
1853 last1 = HOPc(s, -back_min);
1854 s = t;
1855 }
1aa99e6b 1856 if (do_utf8) {
6eb5f6b9
JH
1857 while (s <= last1) {
1858 if (regtry(prog, s))
1859 goto got_it;
1860 s += UTF8SKIP(s);
1861 }
1862 }
1863 else {
1864 while (s <= last1) {
1865 if (regtry(prog, s))
1866 goto got_it;
1867 s++;
1868 }
1869 }
1870 }
b7953727
JH
1871 DEBUG_r(if (!did_match)
1872 PerlIO_printf(Perl_debug_log,
1873 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1874 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1875 ? "anchored" : "floating"),
1876 PL_colors[0],
1877 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1878 SvPVX(must),
b7953727
JH
1879 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1880 );
6eb5f6b9
JH
1881 goto phooey;
1882 }
155aba94 1883 else if ((c = prog->regstclass)) {
66e933ab
GS
1884 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1885 /* don't bother with what can't match */
6eb5f6b9 1886 strend = HOPc(strend, -(minlen - 1));
ffc61ed2
JH
1887 DEBUG_r({
1888 SV *prop = sv_newmortal();
9e55ce06
JH
1889 char *s0;
1890 char *s1;
1891 int len0;
1892 int len1;
1893
ffc61ed2 1894 regprop(prop, c);
9e55ce06
JH
1895 s0 = UTF ?
1896 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1897 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1898 SvPVX(prop);
1899 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1900 s1 = UTF ?
c728cb41 1901 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1902 len1 = UTF ? SvCUR(dsv1) : strend - s;
1903 PerlIO_printf(Perl_debug_log,
1904 "Matching stclass `%*.*s' against `%*.*s'\n",
1905 len0, len0, s0,
1906 len1, len1, s1);
ffc61ed2 1907 });
6eb5f6b9
JH
1908 if (find_byclass(prog, c, s, strend, startpos, 0))
1909 goto got_it;
bf93d4cc 1910 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1911 }
1912 else {
1913 dontbother = 0;
33b8afdf
JH
1914 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1915 /* Trim the end. */
d6a28714 1916 char *last;
33b8afdf
JH
1917 SV* float_real;
1918
1919 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1920 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1921 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1922
1923 if (flags & REXEC_SCREAM) {
33b8afdf 1924 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1925 end_shift, &scream_pos, 1); /* last one */
1926 if (!last)
ffc61ed2 1927 last = scream_olds; /* Only one occurrence. */
b8c5462f 1928 }
d6a28714
JH
1929 else {
1930 STRLEN len;
33b8afdf 1931 char *little = SvPV(float_real, len);
d6a28714 1932
33b8afdf 1933 if (SvTAIL(float_real)) {
d6a28714
JH
1934 if (memEQ(strend - len + 1, little, len - 1))
1935 last = strend - len + 1;
1936 else if (!PL_multiline)
9041c2e3 1937 last = memEQ(strend - len, little, len)
d6a28714 1938 ? strend - len : Nullch;
b8c5462f 1939 else
d6a28714
JH
1940 goto find_last;
1941 } else {
1942 find_last:
9041c2e3 1943 if (len)
d6a28714 1944 last = rninstr(s, strend, little, little + len);
b8c5462f 1945 else
d6a28714 1946 last = strend; /* matching `$' */
b8c5462f 1947 }
b8c5462f 1948 }
bf93d4cc
GS
1949 if (last == NULL) {
1950 DEBUG_r(PerlIO_printf(Perl_debug_log,
1951 "%sCan't trim the tail, match fails (should not happen)%s\n",
1952 PL_colors[4],PL_colors[5]));
1953 goto phooey; /* Should not happen! */
1954 }
d6a28714
JH
1955 dontbother = strend - last + prog->float_min_offset;
1956 }
1957 if (minlen && (dontbother < minlen))
1958 dontbother = minlen - 1;
1959 strend -= dontbother; /* this one's always in bytes! */
1960 /* We don't know much -- general case. */
1aa99e6b 1961 if (do_utf8) {
d6a28714
JH
1962 for (;;) {
1963 if (regtry(prog, s))
1964 goto got_it;
1965 if (s >= strend)
1966 break;
b8c5462f 1967 s += UTF8SKIP(s);
d6a28714
JH
1968 };
1969 }
1970 else {
1971 do {
1972 if (regtry(prog, s))
1973 goto got_it;
1974 } while (s++ < strend);
1975 }
1976 }
1977
1978 /* Failure. */
1979 goto phooey;
1980
1981got_it:
1982 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1983
1984 if (PL_reg_eval_set) {
1985 /* Preserve the current value of $^R */
1986 if (oreplsv != GvSV(PL_replgv))
1987 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1988 restored, the value remains
1989 the same. */
acfe0abc 1990 restore_pos(aTHX_ 0);
d6a28714
JH
1991 }
1992
1993 /* make sure $`, $&, $', and $digit will work later */
1994 if ( !(flags & REXEC_NOT_FIRST) ) {
1995 if (RX_MATCH_COPIED(prog)) {
1996 Safefree(prog->subbeg);
1997 RX_MATCH_COPIED_off(prog);
1998 }
1999 if (flags & REXEC_COPY_STR) {
2000 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2001
2002 s = savepvn(strbeg, i);
2003 prog->subbeg = s;
2004 prog->sublen = i;
2005 RX_MATCH_COPIED_on(prog);
2006 }
2007 else {
2008 prog->subbeg = strbeg;
2009 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2010 }
2011 }
9041c2e3 2012
d6a28714
JH
2013 return 1;
2014
2015phooey:
bf93d4cc
GS
2016 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2017 PL_colors[4],PL_colors[5]));
d6a28714 2018 if (PL_reg_eval_set)
acfe0abc 2019 restore_pos(aTHX_ 0);
d6a28714
JH
2020 return 0;
2021}
2022
2023/*
2024 - regtry - try match at specific point
2025 */
2026STATIC I32 /* 0 failure, 1 success */
2027S_regtry(pTHX_ regexp *prog, char *startpos)
2028{
d6a28714
JH
2029 register I32 i;
2030 register I32 *sp;
2031 register I32 *ep;
2032 CHECKPOINT lastcp;
2033
02db2b7b
IZ
2034#ifdef DEBUGGING
2035 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2036#endif
d6a28714
JH
2037 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2038 MAGIC *mg;
2039
2040 PL_reg_eval_set = RS_init;
2041 DEBUG_r(DEBUG_s(
b900a521
JH
2042 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2043 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2044 ));
e8347627 2045 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2046 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2047 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2048 SAVETMPS;
2049 /* Apparently this is not needed, judging by wantarray. */
e8347627 2050 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2051 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2052
2053 if (PL_reg_sv) {
2054 /* Make $_ available to executed code. */
2055 if (PL_reg_sv != DEFSV) {
4d1ff10f 2056 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2057 SAVESPTR(DEFSV);
2058 DEFSV = PL_reg_sv;
b8c5462f 2059 }
d6a28714 2060
9041c2e3 2061 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2062 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2063 /* prepare for quick setting of pos */
14befaf4
DM
2064 sv_magic(PL_reg_sv, (SV*)0,
2065 PERL_MAGIC_regex_global, Nullch, 0);
2066 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2067 mg->mg_len = -1;
b8c5462f 2068 }
d6a28714
JH
2069 PL_reg_magic = mg;
2070 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2071 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2072 }
09687e5a 2073 if (!PL_reg_curpm) {
0f79a09d 2074 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2075#ifdef USE_ITHREADS
2076 {
2077 SV* repointer = newSViv(0);
577e12cc 2078 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2079 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2080 av_push(PL_regex_padav,repointer);
2081 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2082 PL_regex_pad = AvARRAY(PL_regex_padav);
2083 }
2084#endif
2085 }
aaa362c4 2086 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2087 PL_reg_oldcurpm = PL_curpm;
2088 PL_curpm = PL_reg_curpm;
2089 if (RX_MATCH_COPIED(prog)) {
2090 /* Here is a serious problem: we cannot rewrite subbeg,
2091 since it may be needed if this match fails. Thus
2092 $` inside (?{}) could fail... */
2093 PL_reg_oldsaved = prog->subbeg;
2094 PL_reg_oldsavedlen = prog->sublen;
2095 RX_MATCH_COPIED_off(prog);
2096 }
2097 else
2098 PL_reg_oldsaved = Nullch;
2099 prog->subbeg = PL_bostr;
2100 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2101 }
2102 prog->startp[0] = startpos - PL_bostr;
2103 PL_reginput = startpos;
2104 PL_regstartp = prog->startp;
2105 PL_regendp = prog->endp;
2106 PL_reglastparen = &prog->lastparen;
a01268b5 2107 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714
JH
2108 prog->lastparen = 0;
2109 PL_regsize = 0;
2110 DEBUG_r(PL_reg_starttry = startpos);
2111 if (PL_reg_start_tmpl <= prog->nparens) {
2112 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2113 if(PL_reg_start_tmp)
2114 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2115 else
2116 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2117 }
2118
2119 /* XXXX What this code is doing here?!!! There should be no need
2120 to do this again and again, PL_reglastparen should take care of
3dd2943c 2121 this! --ilya*/
dafc8851
JH
2122
2123 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2124 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2125 * PL_reglastparen), is not needed at all by the test suite
2126 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2127 * enough, for building DynaLoader, or otherwise this
2128 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2129 * will happen. Meanwhile, this code *is* needed for the
2130 * above-mentioned test suite tests to succeed. The common theme
2131 * on those tests seems to be returning null fields from matches.
2132 * --jhi */
dafc8851 2133#if 1
d6a28714
JH
2134 sp = prog->startp;
2135 ep = prog->endp;
2136 if (prog->nparens) {
09e8ae3b 2137 for (i = prog->nparens; i > *PL_reglastparen; i--) {
d6a28714
JH
2138 *++sp = -1;
2139 *++ep = -1;
2140 }
2141 }
dafc8851 2142#endif
02db2b7b 2143 REGCP_SET(lastcp);
d6a28714
JH
2144 if (regmatch(prog->program + 1)) {
2145 prog->endp[0] = PL_reginput - PL_bostr;
2146 return 1;
2147 }
02db2b7b 2148 REGCP_UNWIND(lastcp);
d6a28714
JH
2149 return 0;
2150}
2151
02db2b7b
IZ
2152#define RE_UNWIND_BRANCH 1
2153#define RE_UNWIND_BRANCHJ 2
2154
2155union re_unwind_t;
2156
2157typedef struct { /* XX: makes sense to enlarge it... */
2158 I32 type;
2159 I32 prev;
2160 CHECKPOINT lastcp;
2161} re_unwind_generic_t;
2162
2163typedef struct {
2164 I32 type;
2165 I32 prev;
2166 CHECKPOINT lastcp;
2167 I32 lastparen;
2168 regnode *next;
2169 char *locinput;
2170 I32 nextchr;
2171#ifdef DEBUGGING
2172 int regindent;
2173#endif
2174} re_unwind_branch_t;
2175
2176typedef union re_unwind_t {
2177 I32 type;
2178 re_unwind_generic_t generic;
2179 re_unwind_branch_t branch;
2180} re_unwind_t;
2181
8ba1375e
MJD
2182#define sayYES goto yes
2183#define sayNO goto no
e0f9d4a8 2184#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2185#define sayYES_FINAL goto yes_final
2186#define sayYES_LOUD goto yes_loud
2187#define sayNO_FINAL goto no_final
2188#define sayNO_SILENT goto do_no
2189#define saySAME(x) if (x) goto yes; else goto no
2190
2191#define REPORT_CODE_OFF 24
2192
d6a28714
JH
2193/*
2194 - regmatch - main matching routine
2195 *
2196 * Conceptually the strategy is simple: check to see whether the current
2197 * node matches, call self recursively to see whether the rest matches,
2198 * and then act accordingly. In practice we make some effort to avoid
2199 * recursion, in particular by going through "ordinary" nodes (that don't
2200 * need to know whether the rest of the match failed) by a loop instead of
2201 * by recursion.
2202 */
2203/* [lwall] I've hoisted the register declarations to the outer block in order to
2204 * maybe save a little bit of pushing and popping on the stack. It also takes
2205 * advantage of machines that use a register save mask on subroutine entry.
2206 */
2207STATIC I32 /* 0 failure, 1 success */
2208S_regmatch(pTHX_ regnode *prog)
2209{
d6a28714
JH
2210 register regnode *scan; /* Current node. */
2211 regnode *next; /* Next node. */
2212 regnode *inner; /* Next node in internal branch. */
2213 register I32 nextchr; /* renamed nextchr - nextchar colides with
2214 function of same name */
2215 register I32 n; /* no or next */
b7953727
JH
2216 register I32 ln = 0; /* len or last */
2217 register char *s = Nullch; /* operand or save */
d6a28714 2218 register char *locinput = PL_reginput;
b7953727 2219 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2220 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2221 I32 unwind = 0;
b7953727 2222#if 0
02db2b7b 2223 I32 firstcp = PL_savestack_ix;
b7953727 2224#endif
53c4c00c 2225 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2226#ifdef DEBUGGING
ce333219
JH
2227 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2228 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2229 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2230#endif
02db2b7b 2231
d6a28714
JH
2232#ifdef DEBUGGING
2233 PL_regindent++;
2234#endif
2235
2236 /* Note that nextchr is a byte even in UTF */
2237 nextchr = UCHARAT(locinput);
2238 scan = prog;
2239 while (scan != NULL) {
8ba1375e 2240
2a782b5b 2241 DEBUG_r( {
d6a28714
JH
2242 SV *prop = sv_newmortal();
2243 int docolor = *PL_colors[0];
2244 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2245 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2246 /* The part of the string before starttry has one color
2247 (pref0_len chars), between starttry and current
2248 position another one (pref_len - pref0_len chars),
2249 after the current position the third one.
2250 We assume that pref0_len <= pref_len, otherwise we
2251 decrease pref0_len. */
9041c2e3 2252 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2253 ? (5 + taill) - l : locinput - PL_bostr;
2254 int pref0_len;
d6a28714 2255
df1ffd02 2256 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2257 pref_len++;
2258 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2259 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2260 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2261 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2262 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2263 l--;
d6a28714
JH
2264 if (pref0_len < 0)
2265 pref0_len = 0;
2266 if (pref0_len > pref_len)
2267 pref0_len = pref_len;
2268 regprop(prop, scan);
2a782b5b
JH
2269 {
2270 char *s0 =
df1ffd02 2271 do_utf8 ?
2a782b5b 2272 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2273 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2274 locinput - pref_len;
df1ffd02
JH
2275 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2276 char *s1 = do_utf8 ?
2a782b5b 2277 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2278 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2279 locinput - pref_len + pref0_len;
df1ffd02
JH
2280 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2281 char *s2 = do_utf8 ?
2a782b5b 2282 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2283 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2284 locinput;
df1ffd02 2285 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2286 PerlIO_printf(Perl_debug_log,
2287 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2288 (IV)(locinput - PL_bostr),
2289 PL_colors[4],
2290 len0, s0,
2291 PL_colors[5],
2292 PL_colors[2],
2293 len1, s1,
2294 PL_colors[3],
2295 (docolor ? "" : "> <"),
2296 PL_colors[0],
2297 len2, s2,
2298 PL_colors[1],
2299 15 - l - pref_len + 1,
2300 "",
2301 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2302 SvPVX(prop));
2303 }
2304 });
d6a28714
JH
2305
2306 next = scan + NEXT_OFF(scan);
2307 if (next == scan)
2308 next = NULL;
2309
2310 switch (OP(scan)) {
2311 case BOL:
12d33761
HS
2312 if (locinput == PL_bostr || (PL_multiline &&
2313 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2314 {
2315 /* regtill = regbol; */
b8c5462f
JH
2316 break;
2317 }
d6a28714
JH
2318 sayNO;
2319 case MBOL:
12d33761
HS
2320 if (locinput == PL_bostr ||
2321 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2322 {
b8c5462f
JH
2323 break;
2324 }
d6a28714
JH
2325 sayNO;
2326 case SBOL:
c2a73568 2327 if (locinput == PL_bostr)
b8c5462f 2328 break;
d6a28714
JH
2329 sayNO;
2330 case GPOS:
2331 if (locinput == PL_reg_ganch)
2332 break;
2333 sayNO;
2334 case EOL:
2335 if (PL_multiline)
2336 goto meol;
2337 else
2338 goto seol;
2339 case MEOL:
2340 meol:
2341 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2342 sayNO;
b8c5462f 2343 break;
d6a28714
JH
2344 case SEOL:
2345 seol:
2346 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2347 sayNO;
d6a28714 2348 if (PL_regeol - locinput > 1)
b8c5462f 2349 sayNO;
b8c5462f 2350 break;
d6a28714
JH
2351 case EOS:
2352 if (PL_regeol != locinput)
b8c5462f 2353 sayNO;
d6a28714 2354 break;
ffc61ed2 2355 case SANY:
d6a28714 2356 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2357 sayNO;
f33976b4
DB
2358 if (do_utf8) {
2359 locinput += PL_utf8skip[nextchr];
2360 if (locinput > PL_regeol)
2361 sayNO;
2362 nextchr = UCHARAT(locinput);
2363 }
2364 else
2365 nextchr = UCHARAT(++locinput);
2366 break;
2367 case CANY:
2368 if (!nextchr && locinput >= PL_regeol)
2369 sayNO;
b8c5462f 2370 nextchr = UCHARAT(++locinput);
a0d0e21e 2371 break;
ffc61ed2 2372 case REG_ANY:
1aa99e6b
IH
2373 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2374 sayNO;
2375 if (do_utf8) {
b8c5462f 2376 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2377 if (locinput > PL_regeol)
2378 sayNO;
a0ed51b3 2379 nextchr = UCHARAT(locinput);
a0ed51b3 2380 }
1aa99e6b
IH
2381 else
2382 nextchr = UCHARAT(++locinput);
a0ed51b3 2383 break;
d6a28714 2384 case EXACT:
cd439c50
IZ
2385 s = STRING(scan);
2386 ln = STR_LEN(scan);
1aa99e6b 2387 if (do_utf8 != (UTF!=0)) {
bc517b45 2388 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2389 char *l = locinput;
2390 char *e = s + ln;
bc517b45 2391 STRLEN ulen;
a72c7584 2392
5ff6fc6d
JH
2393 if (do_utf8) {
2394 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2395 while (s < e) {
2396 if (l >= PL_regeol)
5ff6fc6d
JH
2397 sayNO;
2398 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2399 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2400 ckWARN(WARN_UTF8) ?
2401 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2402 sayNO;
bc517b45 2403 l += ulen;
5ff6fc6d 2404 s ++;
1aa99e6b 2405 }
5ff6fc6d
JH
2406 }
2407 else {
2408 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2409 while (s < e) {
2410 if (l >= PL_regeol)
2411 sayNO;
5ff6fc6d 2412 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2413 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2414 ckWARN(WARN_UTF8) ?
2415 0 : UTF8_ALLOW_ANY))
1aa99e6b 2416 sayNO;
bc517b45 2417 s += ulen;
a72c7584 2418 l ++;
1aa99e6b 2419 }
5ff6fc6d 2420 }
1aa99e6b
IH
2421 locinput = l;
2422 nextchr = UCHARAT(locinput);
2423 break;
2424 }
bc517b45 2425 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2426 /* Inline the first character, for speed. */
2427 if (UCHARAT(s) != nextchr)
2428 sayNO;
2429 if (PL_regeol - locinput < ln)
2430 sayNO;
2431 if (ln > 1 && memNE(s, locinput, ln))
2432 sayNO;
2433 locinput += ln;
2434 nextchr = UCHARAT(locinput);
2435 break;
2436 case EXACTFL:
b8c5462f
JH
2437 PL_reg_flags |= RF_tainted;
2438 /* FALL THROUGH */
d6a28714 2439 case EXACTF:
cd439c50
IZ
2440 s = STRING(scan);
2441 ln = STR_LEN(scan);
d6a28714 2442
d07ddd77
JH
2443 if (do_utf8 || UTF) {
2444 /* Either target or the pattern are utf8. */
d6a28714 2445 char *l = locinput;
d07ddd77 2446 char *e = PL_regeol;
bc517b45 2447
1feea2c7
JH
2448 if (ibcmp_utf8(s, 0, ln, UTF,
2449 l, &e, 0, do_utf8)) {
5486206c
JH
2450 /* One more case for the sharp s:
2451 * pack("U0U*", 0xDF) =~ /ss/i,
2452 * the 0xC3 0x9F are the UTF-8
2453 * byte sequence for the U+00DF. */
2454 if (!(do_utf8 &&
2455 toLOWER(s[0]) == 's' &&
2456 ln >= 2 &&
2457 toLOWER(s[1]) == 's' &&
2458 (U8)l[0] == 0xC3 &&
2459 e - l >= 2 &&
2460 (U8)l[1] == 0x9F))
2461 sayNO;
2462 }
d07ddd77
JH
2463 locinput = e;
2464 nextchr = UCHARAT(locinput);
2465 break;
a0ed51b3 2466 }
d6a28714 2467
bc517b45
JH
2468 /* Neither the target and the pattern are utf8. */
2469
d6a28714
JH
2470 /* Inline the first character, for speed. */
2471 if (UCHARAT(s) != nextchr &&
2472 UCHARAT(s) != ((OP(scan) == EXACTF)
2473 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2474 sayNO;
d6a28714 2475 if (PL_regeol - locinput < ln)
b8c5462f 2476 sayNO;
d6a28714
JH
2477 if (ln > 1 && (OP(scan) == EXACTF
2478 ? ibcmp(s, locinput, ln)
2479 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2480 sayNO;
d6a28714
JH
2481 locinput += ln;
2482 nextchr = UCHARAT(locinput);
a0d0e21e 2483 break;
d6a28714 2484 case ANYOF:
ffc61ed2 2485 if (do_utf8) {
9e55ce06
JH
2486 STRLEN inclasslen = PL_regeol - locinput;
2487
ba7b4546 2488 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2489 sayNO_ANYOF;
ffc61ed2
JH
2490 if (locinput >= PL_regeol)
2491 sayNO;
0f0076b4 2492 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2493 nextchr = UCHARAT(locinput);
e0f9d4a8 2494 break;
ffc61ed2
JH
2495 }
2496 else {
2497 if (nextchr < 0)
2498 nextchr = UCHARAT(locinput);
7d3e948e 2499 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2500 sayNO_ANYOF;
ffc61ed2
JH
2501 if (!nextchr && locinput >= PL_regeol)
2502 sayNO;
2503 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2504 break;
2505 }
2506 no_anyof:
2507 /* If we might have the case of the German sharp s
2508 * in a casefolding Unicode character class. */
2509
ebc501f0
JH
2510 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2511 locinput += SHARP_S_SKIP;
e0f9d4a8 2512 nextchr = UCHARAT(locinput);
ffc61ed2 2513 }
e0f9d4a8
JH
2514 else
2515 sayNO;
b8c5462f 2516 break;
d6a28714 2517 case ALNUML:
b8c5462f
JH
2518 PL_reg_flags |= RF_tainted;
2519 /* FALL THROUGH */
d6a28714 2520 case ALNUM:
b8c5462f 2521 if (!nextchr)
4633a7c4 2522 sayNO;
ffc61ed2 2523 if (do_utf8) {
ad24be35 2524 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2525 if (!(OP(scan) == ALNUM
3568d838 2526 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2527 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2528 {
2529 sayNO;
a0ed51b3 2530 }
b8c5462f 2531 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2532 nextchr = UCHARAT(locinput);
2533 break;
2534 }
ffc61ed2 2535 if (!(OP(scan) == ALNUM
d6a28714 2536 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2537 sayNO;
b8c5462f 2538 nextchr = UCHARAT(++locinput);
a0d0e21e 2539 break;
d6a28714 2540 case NALNUML:
b8c5462f
JH
2541 PL_reg_flags |= RF_tainted;
2542 /* FALL THROUGH */
d6a28714
JH
2543 case NALNUM:
2544 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2545 sayNO;
ffc61ed2 2546 if (do_utf8) {
8269fa76 2547 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2548 if (OP(scan) == NALNUM
3568d838 2549 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2550 : isALNUM_LC_utf8((U8*)locinput))
2551 {
b8c5462f 2552 sayNO;
d6a28714 2553 }
b8c5462f
JH
2554 locinput += PL_utf8skip[nextchr];
2555 nextchr = UCHARAT(locinput);
2556 break;
2557 }
ffc61ed2 2558 if (OP(scan) == NALNUM
d6a28714 2559 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2560 sayNO;
76e3520e 2561 nextchr = UCHARAT(++locinput);
a0d0e21e 2562 break;
d6a28714
JH
2563 case BOUNDL:
2564 case NBOUNDL:
3280af22 2565 PL_reg_flags |= RF_tainted;
bbce6d69 2566 /* FALL THROUGH */
d6a28714
JH
2567 case BOUND:
2568 case NBOUND:
2569 /* was last char in word? */
ffc61ed2 2570 if (do_utf8) {
12d33761
HS
2571 if (locinput == PL_bostr)
2572 ln = '\n';
ffc61ed2 2573 else {
b4f7163a 2574 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2575
b4f7163a 2576 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2577 }
2578 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2579 ln = isALNUM_uni(ln);
8269fa76 2580 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2581 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2582 }
2583 else {
9041c2e3 2584 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2585 n = isALNUM_LC_utf8((U8*)locinput);
2586 }
a0ed51b3 2587 }
d6a28714 2588 else {
12d33761
HS
2589 ln = (locinput != PL_bostr) ?
2590 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2591 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2592 ln = isALNUM(ln);
2593 n = isALNUM(nextchr);
2594 }
2595 else {
2596 ln = isALNUM_LC(ln);
2597 n = isALNUM_LC(nextchr);
2598 }
d6a28714 2599 }
ffc61ed2
JH
2600 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2601 OP(scan) == BOUNDL))
2602 sayNO;
a0ed51b3 2603 break;
d6a28714 2604 case SPACEL:
3280af22 2605 PL_reg_flags |= RF_tainted;
bbce6d69 2606 /* FALL THROUGH */
d6a28714 2607 case SPACE:
9442cb0e 2608 if (!nextchr)
4633a7c4 2609 sayNO;
1aa99e6b 2610 if (do_utf8) {
fd400ab9 2611 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2612 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2613 if (!(OP(scan) == SPACE
3568d838 2614 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2615 : isSPACE_LC_utf8((U8*)locinput)))
2616 {
2617 sayNO;
2618 }
2619 locinput += PL_utf8skip[nextchr];
2620 nextchr = UCHARAT(locinput);
2621 break;
d6a28714 2622 }
ffc61ed2
JH
2623 if (!(OP(scan) == SPACE
2624 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2625 sayNO;
2626 nextchr = UCHARAT(++locinput);
2627 }
2628 else {
2629 if (!(OP(scan) == SPACE
2630 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2631 sayNO;
2632 nextchr = UCHARAT(++locinput);
a0ed51b3 2633 }
a0ed51b3 2634 break;
d6a28714 2635 case NSPACEL:
3280af22 2636 PL_reg_flags |= RF_tainted;
bbce6d69 2637 /* FALL THROUGH */
d6a28714 2638 case NSPACE:
9442cb0e 2639 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2640 sayNO;
1aa99e6b 2641 if (do_utf8) {
8269fa76 2642 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2643 if (OP(scan) == NSPACE
3568d838 2644 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2645 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2646 {
2647 sayNO;
2648 }
2649 locinput += PL_utf8skip[nextchr];
2650 nextchr = UCHARAT(locinput);
2651 break;
a0ed51b3 2652 }
ffc61ed2 2653 if (OP(scan) == NSPACE
d6a28714 2654 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2655 sayNO;
76e3520e 2656 nextchr = UCHARAT(++locinput);
a0d0e21e 2657 break;
d6a28714 2658 case DIGITL:
a0ed51b3
LW
2659 PL_reg_flags |= RF_tainted;
2660 /* FALL THROUGH */
d6a28714 2661 case DIGIT:
9442cb0e 2662 if (!nextchr)
a0ed51b3 2663 sayNO;
1aa99e6b 2664 if (do_utf8) {
8269fa76 2665 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2666 if (!(OP(scan) == DIGIT
3568d838 2667 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2668 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2669 {
a0ed51b3 2670 sayNO;
dfe13c55 2671 }
6f06b55f 2672 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2673 nextchr = UCHARAT(locinput);
2674 break;
2675 }
ffc61ed2 2676 if (!(OP(scan) == DIGIT
9442cb0e 2677 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2678 sayNO;
2679 nextchr = UCHARAT(++locinput);
2680 break;
d6a28714 2681 case NDIGITL:
b8c5462f
JH
2682 PL_reg_flags |= RF_tainted;
2683 /* FALL THROUGH */
d6a28714 2684 case NDIGIT:
9442cb0e 2685 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2686 sayNO;
1aa99e6b 2687 if (do_utf8) {
8269fa76 2688 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2689 if (OP(scan) == NDIGIT
3568d838 2690 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2691 : isDIGIT_LC_utf8((U8*)locinput))
2692 {
a0ed51b3 2693 sayNO;
9442cb0e 2694 }
6f06b55f 2695 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2696 nextchr = UCHARAT(locinput);
2697 break;
2698 }
ffc61ed2 2699 if (OP(scan) == NDIGIT
9442cb0e 2700 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2701 sayNO;
2702 nextchr = UCHARAT(++locinput);
2703 break;
2704 case CLUMP:
b7c83a7e 2705 if (locinput >= PL_regeol)
a0ed51b3 2706 sayNO;
b7c83a7e
JH
2707 if (do_utf8) {
2708 LOAD_UTF8_CHARCLASS(mark,"~");
2709 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2710 sayNO;
2711 locinput += PL_utf8skip[nextchr];
2712 while (locinput < PL_regeol &&
2713 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2714 locinput += UTF8SKIP(locinput);
2715 if (locinput > PL_regeol)
2716 sayNO;
eb08e2da
JH
2717 }
2718 else
2719 locinput++;
a0ed51b3
LW
2720 nextchr = UCHARAT(locinput);
2721 break;
c8756f30 2722 case REFFL:
3280af22 2723 PL_reg_flags |= RF_tainted;
c8756f30 2724 /* FALL THROUGH */
c277df42 2725 case REF:
c8756f30 2726 case REFF:
c277df42 2727 n = ARG(scan); /* which paren pair */
cf93c79d 2728 ln = PL_regstartp[n];
2c2d71f5 2729 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
cf93c79d 2730 if (*PL_reglastparen < n || ln == -1)
af3f8c16 2731 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2732 if (ln == PL_regendp[n])
a0d0e21e 2733 break;
a0ed51b3 2734
cf93c79d 2735 s = PL_bostr + ln;
1aa99e6b 2736 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2737 char *l = locinput;
cf93c79d 2738 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2739 /*
2740 * Note that we can't do the "other character" lookup trick as
2741 * in the 8-bit case (no pun intended) because in Unicode we
2742 * have to map both upper and title case to lower case.
2743 */
2744 if (OP(scan) == REFF) {
a2a2844f 2745 STRLEN ulen1, ulen2;
e7ae6809
JH
2746 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2747 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2748 while (s < e) {
2749 if (l >= PL_regeol)
2750 sayNO;
a2a2844f
JH
2751 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2752 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2753 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2754 sayNO;
a2a2844f
JH
2755 s += ulen1;
2756 l += ulen2;
a0ed51b3
LW
2757 }
2758 }
2759 locinput = l;
2760 nextchr = UCHARAT(locinput);
2761 break;
2762 }
2763
a0d0e21e 2764 /* Inline the first character, for speed. */
76e3520e 2765 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2766 (OP(scan) == REF ||
2767 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2768 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2769 sayNO;
cf93c79d 2770 ln = PL_regendp[n] - ln;
3280af22 2771 if (locinput + ln > PL_regeol)
4633a7c4 2772 sayNO;
c8756f30
AK
2773 if (ln > 1 && (OP(scan) == REF
2774 ? memNE(s, locinput, ln)
2775 : (OP(scan) == REFF
2776 ? ibcmp(s, locinput, ln)
2777 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2778 sayNO;
a0d0e21e 2779 locinput += ln;
76e3520e 2780 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2781 break;
2782
2783 case NOTHING:
c277df42 2784 case TAIL:
a0d0e21e
LW
2785 break;
2786 case BACK:
2787 break;
c277df42
IZ
2788 case EVAL:
2789 {
2790 dSP;
533c011a 2791 OP_4tree *oop = PL_op;
3280af22
NIS
2792 COP *ocurcop = PL_curcop;
2793 SV **ocurpad = PL_curpad;
c277df42 2794 SV *ret;
9041c2e3 2795
c277df42 2796 n = ARG(scan);
533c011a 2797 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2798 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
dfad63ad 2799 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
cf93c79d 2800 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2801
8e5e9ebe
RGS
2802 {
2803 SV **before = SP;
2804 CALLRUNOPS(aTHX); /* Scalar context. */
2805 SPAGAIN;
2806 if (SP == before)
2807 ret = Nullsv; /* protect against empty (?{}) blocks. */
2808 else {
2809 ret = POPs;
2810 PUTBACK;
2811 }
2812 }
2813
0f5d15d6
IZ
2814 PL_op = oop;
2815 PL_curpad = ocurpad;
2816 PL_curcop = ocurcop;
c277df42 2817 if (logical) {
0f5d15d6
IZ
2818 if (logical == 2) { /* Postponed subexpression. */
2819 regexp *re;
22c35a8c 2820 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2821 re_cc_state state;
0f5d15d6
IZ
2822 CHECKPOINT cp, lastcp;
2823
2824 if(SvROK(ret) || SvRMAGICAL(ret)) {
2825 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2826
2827 if(SvMAGICAL(sv))
14befaf4 2828 mg = mg_find(sv, PERL_MAGIC_qr);
0f5d15d6
IZ
2829 }
2830 if (mg) {
2831 re = (regexp *)mg->mg_obj;
df0003d4 2832 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2833 }
2834 else {
2835 STRLEN len;
2836 char *t = SvPV(ret, len);
2837 PMOP pm;
2838 char *oprecomp = PL_regprecomp;
2839 I32 osize = PL_regsize;
2840 I32 onpar = PL_regnpar;
2841
5fcd1c1b 2842 Zero(&pm, 1, PMOP);
cea2e8a9 2843 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2844 if (!(SvFLAGS(ret)
0f5d15d6 2845 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
14befaf4
DM
2846 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2847 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2848 PL_regprecomp = oprecomp;
2849 PL_regsize = osize;
2850 PL_regnpar = onpar;
2851 }
2852 DEBUG_r(
9041c2e3 2853 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2854 "Entering embedded `%s%.60s%s%s'\n",
2855 PL_colors[0],
2856 re->precomp,
2857 PL_colors[1],
2858 (strlen(re->precomp) > 60 ? "..." : ""))
2859 );
2860 state.node = next;
2861 state.prev = PL_reg_call_cc;
2862 state.cc = PL_regcc;
2863 state.re = PL_reg_re;
2864
2ab05381 2865 PL_regcc = 0;
9041c2e3 2866
0f5d15d6 2867 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2868 REGCP_SET(lastcp);
0f5d15d6
IZ
2869 cache_re(re);
2870 state.ss = PL_savestack_ix;
2871 *PL_reglastparen = 0;
a01268b5 2872 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2873 PL_reg_call_cc = &state;
2874 PL_reginput = locinput;
2c2d71f5
JH
2875
2876 /* XXXX This is too dramatic a measure... */
2877 PL_reg_maxiter = 0;
2878
0f5d15d6 2879 if (regmatch(re->program + 1)) {
2c914db6
IZ
2880 /* Even though we succeeded, we need to restore
2881 global variables, since we may be wrapped inside
2882 SUSPEND, thus the match may be not finished yet. */
2883
2884 /* XXXX Do this only if SUSPENDed? */
2885 PL_reg_call_cc = state.prev;
2886 PL_regcc = state.cc;
2887 PL_reg_re = state.re;
2888 cache_re(PL_reg_re);
2889
2890 /* XXXX This is too dramatic a measure... */
2891 PL_reg_maxiter = 0;
2892
2893 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2894 ReREFCNT_dec(re);
2895 regcpblow(cp);
2896 sayYES;
2897 }
0f5d15d6 2898 ReREFCNT_dec(re);
02db2b7b 2899 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2900 regcppop();
2901 PL_reg_call_cc = state.prev;
2902 PL_regcc = state.cc;
2903 PL_reg_re = state.re;
d3790889 2904 cache_re(PL_reg_re);
2c2d71f5
JH
2905
2906 /* XXXX This is too dramatic a measure... */
2907 PL_reg_maxiter = 0;
2908
8e514ae6 2909 logical = 0;
0f5d15d6
IZ
2910 sayNO;
2911 }
c277df42 2912 sw = SvTRUE(ret);
0f5d15d6 2913 logical = 0;
a0ed51b3
LW
2914 }
2915 else
3280af22 2916 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2917 break;
2918 }
a0d0e21e 2919 case OPEN:
c277df42 2920 n = ARG(scan); /* which paren pair */
3280af22
NIS
2921 PL_reg_start_tmp[n] = locinput;
2922 if (n > PL_regsize)
2923 PL_regsize = n;
a0d0e21e
LW
2924 break;
2925 case CLOSE:
c277df42 2926 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2927 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2928 PL_regendp[n] = locinput - PL_bostr;
3280af22
NIS
2929 if (n > *PL_reglastparen)
2930 *PL_reglastparen = n;
a01268b5 2931 *PL_reglastcloseparen = n;
a0d0e21e 2932 break;
c277df42
IZ
2933 case GROUPP:
2934 n = ARG(scan); /* which paren pair */
cf93c79d 2935 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2936 break;
2937 case IFTHEN:
2c2d71f5 2938 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2939 if (sw)
2940 next = NEXTOPER(NEXTOPER(scan));
2941 else {
2942 next = scan + ARG(scan);
2943 if (OP(next) == IFTHEN) /* Fake one. */
2944 next = NEXTOPER(NEXTOPER(next));
2945 }
2946 break;
2947 case LOGICAL:
0f5d15d6 2948 logical = scan->flags;
c277df42 2949 break;
2ab05381
IZ
2950/*******************************************************************
2951 PL_regcc contains infoblock about the innermost (...)* loop, and
2952 a pointer to the next outer infoblock.
2953
2954 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2955
2956 1) After matching X, regnode for CURLYX is processed;
2957
9041c2e3 2958 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
2959 regmatch() recursively with the starting point at WHILEM node;
2960
2961 3) Each hit of WHILEM node tries to match A and Z (in the order
2962 depending on the current iteration, min/max of {min,max} and
2963 greediness). The information about where are nodes for "A"
2964 and "Z" is read from the infoblock, as is info on how many times "A"
2965 was already matched, and greediness.
2966
2967 4) After A matches, the same WHILEM node is hit again.
2968
2969 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2970 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2971 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2972 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2973 of the external loop.
2974
2975 Currently present infoblocks form a tree with a stem formed by PL_curcc
2976 and whatever it mentions via ->next, and additional attached trees
2977 corresponding to temporarily unset infoblocks as in "5" above.
2978
9041c2e3 2979 In the following picture infoblocks for outer loop of
2ab05381
IZ
2980 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2981 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2982 infoblocks are drawn below the "reset" infoblock.
2983
2984 In fact in the picture below we do not show failed matches for Z and T
2985 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2986 more obvious *why* one needs to *temporary* unset infoblocks.]
2987
2988 Matched REx position InfoBlocks Comment
2989 (Y(A)*?Z)*?T x
2990 Y(A)*?Z)*?T x <- O
2991 Y (A)*?Z)*?T x <- O
2992 Y A)*?Z)*?T x <- O <- I
2993 YA )*?Z)*?T x <- O <- I
2994 YA A)*?Z)*?T x <- O <- I
2995 YAA )*?Z)*?T x <- O <- I
2996 YAA Z)*?T x <- O # Temporary unset I
2997 I
2998
2999 YAAZ Y(A)*?Z)*?T x <- O
3000 I
3001
3002 YAAZY (A)*?Z)*?T x <- O
3003 I
3004
3005 YAAZY A)*?Z)*?T x <- O <- I
3006 I
3007
3008 YAAZYA )*?Z)*?T x <- O <- I
3009 I
3010
3011 YAAZYA Z)*?T x <- O # Temporary unset I
3012 I,I
3013
3014 YAAZYAZ )*?T x <- O
3015 I,I
3016
3017 YAAZYAZ T x # Temporary unset O
3018 O
3019 I,I
3020
3021 YAAZYAZT x
3022 O
3023 I,I
3024 *******************************************************************/
a0d0e21e
LW
3025 case CURLYX: {
3026 CURCUR cc;
3280af22 3027 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3028 /* No need to save/restore up to this paren */
3029 I32 parenfloor = scan->flags;
c277df42
IZ
3030
3031 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3032 next += ARG(next);
3280af22
NIS
3033 cc.oldcc = PL_regcc;
3034 PL_regcc = &cc;
cb434fcc
IZ
3035 /* XXXX Probably it is better to teach regpush to support
3036 parenfloor > PL_regsize... */
3037 if (parenfloor > *PL_reglastparen)
3038 parenfloor = *PL_reglastparen; /* Pessimization... */
3039 cc.parenfloor = parenfloor;
a0d0e21e
LW
3040 cc.cur = -1;
3041 cc.min = ARG1(scan);
3042 cc.max = ARG2(scan);
c277df42 3043 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3044 cc.next = next;
3045 cc.minmod = minmod;
3046 cc.lastloc = 0;
3280af22 3047 PL_reginput = locinput;
a0d0e21e
LW
3048 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3049 regcpblow(cp);
3280af22 3050 PL_regcc = cc.oldcc;
4633a7c4 3051 saySAME(n);
a0d0e21e
LW
3052 }
3053 /* NOT REACHED */
3054 case WHILEM: {
3055 /*
3056 * This is really hard to understand, because after we match
3057 * what we're trying to match, we must make sure the rest of
2c2d71f5 3058 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3059 * to go back UP the parse tree by recursing ever deeper. And
3060 * if it fails, we have to reset our parent's current state
3061 * that we can try again after backing off.
3062 */
3063
c277df42 3064 CHECKPOINT cp, lastcp;
3280af22 3065 CURCUR* cc = PL_regcc;
c277df42
IZ
3066 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3067
4633a7c4 3068 n = cc->cur + 1; /* how many we know we matched */
3280af22 3069 PL_reginput = locinput;
a0d0e21e 3070
c277df42 3071 DEBUG_r(
9041c2e3
NIS
3072 PerlIO_printf(Perl_debug_log,
3073 "%*s %ld out of %ld..%ld cc=%lx\n",
3280af22 3074 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3075 (long)n, (long)cc->min,
c277df42
IZ
3076 (long)cc->max, (long)cc)
3077 );
4633a7c4 3078
a0d0e21e
LW
3079 /* If degenerate scan matches "", assume scan done. */
3080
579cf2c3 3081 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3082 PL_regcc = cc->oldcc;
2ab05381
IZ
3083 if (PL_regcc)
3084 ln = PL_regcc->cur;
c277df42 3085 DEBUG_r(
c3464db5
DD
3086 PerlIO_printf(Perl_debug_log,
3087 "%*s empty match detected, try continuation...\n",
3280af22 3088 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3089 );
a0d0e21e 3090 if (regmatch(cc->next))
4633a7c4 3091 sayYES;
2ab05381
IZ
3092 if (PL_regcc)
3093 PL_regcc->cur = ln;
3280af22 3094 PL_regcc = cc;
4633a7c4 3095 sayNO;
a0d0e21e
LW
3096 }
3097
3098 /* First just match a string of min scans. */
3099
3100 if (n < cc->min) {
3101 cc->cur = n;
3102 cc->lastloc = locinput;
4633a7c4
LW
3103 if (regmatch(cc->scan))
3104 sayYES;
3105 cc->cur = n - 1;
c277df42 3106 cc->lastloc = lastloc;
4633a7c4 3107 sayNO;
a0d0e21e
LW
3108 }
3109
2c2d71f5
JH
3110 if (scan->flags) {
3111 /* Check whether we already were at this position.
3112 Postpone detection until we know the match is not
3113 *that* much linear. */
3114 if (!PL_reg_maxiter) {
3115 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3116 PL_reg_leftiter = PL_reg_maxiter;
3117 }
3118 if (PL_reg_leftiter-- == 0) {
3119 I32 size = (PL_reg_maxiter + 7)/8;
3120 if (PL_reg_poscache) {
3121 if (PL_reg_poscache_size < size) {
3122 Renew(PL_reg_poscache, size, char);
3123 PL_reg_poscache_size = size;
3124 }
3125 Zero(PL_reg_poscache, size, char);
3126 }
3127 else {
3128 PL_reg_poscache_size = size;
3129 Newz(29, PL_reg_poscache, size, char);
3130 }
3131 DEBUG_r(
3132 PerlIO_printf(Perl_debug_log,
3133 "%sDetected a super-linear match, switching on caching%s...\n",
3134 PL_colors[4], PL_colors[5])
3135 );
3136 }
3137 if (PL_reg_leftiter < 0) {
3138 I32 o = locinput - PL_bostr, b;
3139
3140 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3141 b = o % 8;
3142 o /= 8;
3143 if (PL_reg_poscache[o] & (1<<b)) {
3144 DEBUG_r(
3145 PerlIO_printf(Perl_debug_log,
3146 "%*s already tried at this position...\n",
3147 REPORT_CODE_OFF+PL_regindent*2, "")
3148 );
7821416a 3149 sayNO_SILENT;
2c2d71f5
JH
3150 }
3151 PL_reg_poscache[o] |= (1<<b);
3152 }
3153 }
3154
a0d0e21e
LW
3155 /* Prefer next over scan for minimal matching. */
3156
3157 if (cc->minmod) {
3280af22 3158 PL_regcc = cc->oldcc;
2ab05381
IZ
3159 if (PL_regcc)
3160 ln = PL_regcc->cur;
5f05dabc 3161 cp = regcppush(cc->parenfloor);
02db2b7b 3162 REGCP_SET(lastcp);
5f05dabc 3163 if (regmatch(cc->next)) {
c277df42 3164 regcpblow(cp);
4633a7c4 3165 sayYES; /* All done. */
5f05dabc 3166 }
02db2b7b 3167 REGCP_UNWIND(lastcp);
5f05dabc 3168 regcppop();
2ab05381
IZ
3169 if (PL_regcc)
3170 PL_regcc->cur = ln;
3280af22 3171 PL_regcc = cc;
a0d0e21e 3172
c277df42 3173 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3174 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3175 && !(PL_reg_flags & RF_warned)) {
3176 PL_reg_flags |= RF_warned;
9014280d 3177 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3178 "Complex regular subexpression recursion",
3179 REG_INFTY - 1);
c277df42 3180 }
4633a7c4 3181 sayNO;
c277df42 3182 }
a687059c 3183
c277df42 3184 DEBUG_r(
c3464db5
DD
3185 PerlIO_printf(Perl_debug_log,
3186 "%*s trying longer...\n",
3280af22 3187 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3188 );
a0d0e21e 3189 /* Try scanning more and see if it helps. */
3280af22 3190 PL_reginput = locinput;
a0d0e21e
LW
3191 cc->cur = n;
3192 cc->lastloc = locinput;
5f05dabc 3193 cp = regcppush(cc->parenfloor);
02db2b7b 3194 REGCP_SET(lastcp);
5f05dabc 3195 if (regmatch(cc->scan)) {
c277df42 3196 regcpblow(cp);
4633a7c4 3197 sayYES;
5f05dabc 3198 }
02db2b7b 3199 REGCP_UNWIND(lastcp);
5f05dabc 3200 regcppop();
4633a7c4 3201 cc->cur = n - 1;
c277df42 3202 cc->lastloc = lastloc;
4633a7c4 3203 sayNO;
a0d0e21e
LW
3204 }
3205
3206 /* Prefer scan over next for maximal matching. */
3207
3208 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3209 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3210 cc->cur = n;
3211 cc->lastloc = locinput;
02db2b7b 3212 REGCP_SET(lastcp);
5f05dabc 3213 if (regmatch(cc->scan)) {
c277df42 3214 regcpblow(cp);
4633a7c4 3215 sayYES;
5f05dabc 3216 }
02db2b7b 3217 REGCP_UNWIND(lastcp);
a0d0e21e 3218 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3219 PL_reginput = locinput;
c277df42 3220 DEBUG_r(
c3464db5
DD
3221 PerlIO_printf(Perl_debug_log,
3222 "%*s failed, try continuation...\n",
3280af22 3223 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3224 );
3225 }
9041c2e3 3226 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3227 && !(PL_reg_flags & RF_warned)) {
3280af22 3228 PL_reg_flags |= RF_warned;
9014280d 3229 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3230 "Complex regular subexpression recursion",
3231 REG_INFTY - 1);
a0d0e21e
LW
3232 }
3233
3234 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3235 PL_regcc = cc->oldcc;
2ab05381
IZ
3236 if (PL_regcc)
3237 ln = PL_regcc->cur;
a0d0e21e 3238 if (regmatch(cc->next))
4633a7c4 3239 sayYES;
2ab05381
IZ
3240 if (PL_regcc)
3241 PL_regcc->cur = ln;
3280af22 3242 PL_regcc = cc;
4633a7c4 3243 cc->cur = n - 1;
c277df42 3244 cc->lastloc = lastloc;
4633a7c4 3245 sayNO;
a0d0e21e
LW
3246 }
3247 /* NOT REACHED */
9041c2e3 3248 case BRANCHJ:
c277df42
IZ
3249 next = scan + ARG(scan);
3250 if (next == scan)
3251 next = NULL;
3252 inner = NEXTOPER(NEXTOPER(scan));
3253 goto do_branch;
9041c2e3 3254 case BRANCH:
c277df42
IZ
3255 inner = NEXTOPER(scan);
3256 do_branch:
3257 {
c277df42
IZ
3258 c1 = OP(scan);
3259 if (OP(next) != c1) /* No choice. */
3260 next = inner; /* Avoid recursion. */
a0d0e21e 3261 else {
02db2b7b
IZ
3262 I32 lastparen = *PL_reglastparen;
3263 I32 unwind1;
3264 re_unwind_branch_t *uw;
3265
3266 /* Put unwinding data on stack */
3267 unwind1 = SSNEWt(1,re_unwind_branch_t);
3268 uw = SSPTRt(unwind1,re_unwind_branch_t);
3269 uw->prev = unwind;
3270 unwind = unwind1;
3271 uw->type = ((c1 == BRANCH)
3272 ? RE_UNWIND_BRANCH
3273 : RE_UNWIND_BRANCHJ);
3274 uw->lastparen = lastparen;
3275 uw->next = next;
3276 uw->locinput = locinput;
3277 uw->nextchr = nextchr;
3278#ifdef DEBUGGING
3279 uw->regindent = ++PL_regindent;
3280#endif
c277df42 3281
02db2b7b
IZ
3282 REGCP_SET(uw->lastcp);
3283
3284 /* Now go into the first branch */
3285 next = inner;
a687059c 3286 }
a0d0e21e
LW
3287 }
3288 break;
3289 case MINMOD:
3290 minmod = 1;
3291 break;
c277df42
IZ
3292 case CURLYM:
3293 {
00db4c45 3294 I32 l = 0;
c277df42 3295 CHECKPOINT lastcp;
9041c2e3 3296
c277df42
IZ
3297 /* We suppose that the next guy does not need
3298 backtracking: in particular, it is of constant length,
3299 and has no parenths to influence future backrefs. */
3300 ln = ARG1(scan); /* min to match */
3301 n = ARG2(scan); /* max to match */
c277df42
IZ
3302 paren = scan->flags;
3303 if (paren) {
3280af22
NIS
3304 if (paren > PL_regsize)
3305 PL_regsize = paren;
3306 if (paren > *PL_reglastparen)
3307 *PL_reglastparen = paren;
c277df42 3308 }
dc45a647 3309 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3310 if (paren)
3311 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3312 PL_reginput = locinput;
c277df42
IZ
3313 if (minmod) {
3314 minmod = 0;
3315 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3316 sayNO;
f31a99c8
HS
3317 /* if we matched something zero-length we don't need to
3318 backtrack - capturing parens are already defined, so
3319 the caveat in the maximal case doesn't apply
3320
3321 XXXX if ln == 0, we can redo this check first time
3322 through the following loop
3323 */
3324 if (ln && l == 0)
3325 n = ln; /* don't backtrack */
3280af22 3326 locinput = PL_reginput;
cca55fe3 3327 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3328 regnode *text_node = next;
3329
cca55fe3 3330 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3331
cca55fe3 3332 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3333 else {
cca55fe3
JP
3334 if (PL_regkind[(U8)OP(text_node)] == REF) {
3335 I32 n, ln;
3336 n = ARG(text_node); /* which paren pair */
3337 ln = PL_regstartp[n];
3338 /* assume yes if we haven't seen CLOSEn */
3339 if (
3340 *PL_reglastparen < n ||
3341 ln == -1 ||
3342 ln == PL_regendp[n]
3343 ) {
3344 c1 = c2 = -1000;
3345 goto assume_ok_MM;
3346 }
3347 c1 = *(PL_bostr + ln);
3348 }
3349 else { c1 = (U8)*STRING(text_node); }
af5decee 3350 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3351 c2 = PL_fold[c1];
af5decee 3352 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3353 c2 = PL_fold_locale[c1];
3354 else
3355 c2 = c1;
3356 }
a0ed51b3
LW
3357 }
3358 else
c277df42 3359 c1 = c2 = -1000;
cca55fe3 3360 assume_ok_MM:
02db2b7b 3361 REGCP_SET(lastcp);
5f4b28b2 3362 /* This may be improved if l == 0. */
c277df42
IZ
3363 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3364 /* If it could work, try it. */
3365 if (c1 == -1000 ||
3280af22
NIS
3366 UCHARAT(PL_reginput) == c1 ||
3367 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3368 {
3369 if (paren) {
f31a99c8 3370 if (ln) {
cf93c79d
IZ
3371 PL_regstartp[paren] =
3372 HOPc(PL_reginput, -l) - PL_bostr;
3373 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3374 }
3375 else
cf93c79d 3376 PL_regendp[paren] = -1;
c277df42
IZ
3377 }
3378 if (regmatch(next))
3379 sayYES;
02db2b7b 3380 REGCP_UNWIND(lastcp);
c277df42
IZ
3381 }
3382 /* Couldn't or didn't -- move forward. */
3280af22 3383 PL_reginput = locinput;
c277df42
IZ
3384 if (regrepeat_hard(scan, 1, &l)) {
3385 ln++;
3280af22 3386 locinput = PL_reginput;
c277df42
IZ
3387 }
3388 else
3389 sayNO;
3390 }
a0ed51b3
LW
3391 }
3392 else {
c277df42 3393 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3394 /* if we matched something zero-length we don't need to
3395 backtrack, unless the minimum count is zero and we
3396 are capturing the result - in that case the capture
3397 being defined or not may affect later execution
3398 */
3399 if (n != 0 && l == 0 && !(paren && ln == 0))
3400 ln = n; /* don't backtrack */
3280af22 3401 locinput = PL_reginput;
c277df42 3402 DEBUG_r(
5c0ca799 3403 PerlIO_printf(Perl_debug_log,
faccc32b 3404 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3405 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3406 (IV) n, (IV)l)
c277df42
IZ
3407 );
3408 if (n >= ln) {
cca55fe3 3409 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3410 regnode *text_node = next;
3411
cca55fe3 3412 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3413
cca55fe3 3414 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3415 else {
cca55fe3
JP
3416 if (PL_regkind[(U8)OP(text_node)] == REF) {
3417 I32 n, ln;
3418 n = ARG(text_node); /* which paren pair */
3419 ln = PL_regstartp[n];
3420 /* assume yes if we haven't seen CLOSEn */
3421 if (
3422 *PL_reglastparen < n ||
3423 ln == -1 ||
3424 ln == PL_regendp[n]
3425 ) {
3426 c1 = c2 = -1000;
3427 goto assume_ok_REG;
3428 }
3429 c1 = *(PL_bostr + ln);
3430 }
3431 else { c1 = (U8)*STRING(text_node); }
3432
af5decee 3433 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3434 c2 = PL_fold[c1];
af5decee 3435 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3436 c2 = PL_fold_locale[c1];
3437 else
3438 c2 = c1;
3439 }
a0ed51b3
LW
3440 }
3441 else
c277df42
IZ
3442 c1 = c2 = -1000;
3443 }
cca55fe3 3444 assume_ok_REG:
02db2b7b 3445 REGCP_SET(lastcp);
c277df42
IZ
3446 while (n >= ln) {
3447 /* If it could work, try it. */
3448 if (c1 == -1000 ||
3280af22
NIS
3449 UCHARAT(PL_reginput) == c1 ||
3450 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3451 {
3452 DEBUG_r(
c3464db5 3453 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3454 "%*s trying tail with n=%"IVdf"...\n",
3455 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3456 );
3457 if (paren) {
3458 if (n) {
cf93c79d
IZ
3459 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3460 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3461 }
a0ed51b3 3462 else
cf93c79d 3463 PL_regendp[paren] = -1;
c277df42 3464 }
a0ed51b3
LW
3465 if (regmatch(next))
3466 sayYES;
02db2b7b 3467 REGCP_UNWIND(lastcp);
a0ed51b3 3468 }
c277df42
IZ
3469 /* Couldn't or didn't -- back up. */
3470 n--;
dfe13c55 3471 locinput = HOPc(locinput, -l);
3280af22 3472 PL_reginput = locinput;
c277df42
IZ
3473 }
3474 }
3475 sayNO;
3476 break;
3477 }
3478 case CURLYN:
3479 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3480 if (paren > PL_regsize)
3481 PL_regsize = paren;
3482 if (paren > *PL_reglastparen)
3483 *PL_reglastparen = paren;
c277df42
IZ
3484 ln = ARG1(scan); /* min to match */
3485 n = ARG2(scan); /* max to match */
dc45a647 3486 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3487 goto repeat;
a0d0e21e 3488 case CURLY:
c277df42 3489 paren = 0;
a0d0e21e
LW
3490 ln = ARG1(scan); /* min to match */
3491 n = ARG2(scan); /* max to match */
dc45a647 3492 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3493 goto repeat;
3494 case STAR:
3495 ln = 0;
c277df42 3496 n = REG_INFTY;
a0d0e21e 3497 scan = NEXTOPER(scan);
c277df42 3498 paren = 0;
a0d0e21e
LW
3499 goto repeat;
3500 case PLUS:
c277df42
IZ
3501 ln = 1;
3502 n = REG_INFTY;
3503 scan = NEXTOPER(scan);
3504 paren = 0;
3505 repeat:
a0d0e21e
LW
3506 /*
3507 * Lookahead to avoid useless match attempts
3508 * when we know what character comes next.
3509 */
5f80c4cf
JP
3510
3511 /*
3512 * Used to only do .*x and .*?x, but now it allows
3513 * for )'s, ('s and (?{ ... })'s to be in the way
3514 * of the quantifier and the EXACT-like node. -- japhy
3515 */
3516
cca55fe3 3517 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3518 U8 *s;
3519 regnode *text_node = next;
3520
cca55fe3 3521 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3522
cca55fe3 3523 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3524 else {
cca55fe3
JP
3525 if (PL_regkind[(U8)OP(text_node)] == REF) {
3526 I32 n, ln;
3527 n = ARG(text_node); /* which paren pair */
3528 ln = PL_regstartp[n];
3529 /* assume yes if we haven't seen CLOSEn */
3530 if (
3531 *PL_reglastparen < n ||
3532 ln == -1 ||
3533 ln == PL_regendp[n]
3534 ) {
3535 c1 = c2 = -1000;
3536 goto assume_ok_easy;
3537 }
9246c65e 3538 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3539 }
3540 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3541
3542 if (!UTF) {
3543 c2 = c1 = *s;
f65d3ee7 3544 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3545 c2 = PL_fold[c1];
f65d3ee7 3546 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3547 c2 = PL_fold_locale[c1];
1aa99e6b 3548 }
5f80c4cf 3549 else { /* UTF */
f65d3ee7 3550 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3551 STRLEN ulen1, ulen2;
e7ae6809
JH
3552 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3553 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3554
3555 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3556 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3557
872c91ae
JH
3558 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3559 ckWARN(WARN_UTF8) ?
3560 0 : UTF8_ALLOW_ANY);